In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

op.logging.set_verbosity(op.logging.WARNING)

In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

# Statistically based prediction

## T1 vs T2

In [4]:
def build_tx(gender):
    csv_names = ['NCAATourneyDetailedResults', 'RegularSeasonDetailedResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))
    
    results_t1 = pd.concat(csvs)
    results_t2 = pd.concat(csvs)
    
    results_t1 = results_t1.drop(['DayNum', 'NumOT', 'WLoc'], axis=1)
    results_t2 = results_t2.drop(['DayNum', 'NumOT', 'WLoc'], axis=1)
        
    results_t1.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(results_t1.columns)]
    results_t2.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(results_t2.columns)]
    
    results = pd.concat([results_t1, results_t2]).reset_index(drop=True)
    results['ScoreDiff'] = results['T1_Score'] - results['T2_Score']

    return results

def build_t1_t2(gender):
    t1 = build_tx(gender)
    t1 = t1.drop(['T1_Score', 'T2_Score', 'T2_TeamID'], axis=1)
    t1 = t1.groupby(by=['Season', 'T1_TeamID']).mean().reset_index()
    t2 = t1.copy()
    
    t1.columns = ["T1_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t1.columns)]
    t2.columns = ["T2_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t2.columns)]
    
    t1 = t1.rename(columns={'T1_TeamID_Avg': 'T1_TeamID', 'T1_Season_Avg': 'Season'})
    t2 = t2.rename(columns={'T2_TeamID_Avg': 'T2_TeamID', 'T2_Season_Avg': 'Season'})
    
    return (t1, t2)

In [5]:
ref = build_tx('M')

ref

Unnamed: 0,Season,T1_TeamID,T1_Score,T2_TeamID,T2_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_FTM,...,T2_FTM,T2_FTA,T2_OR,T2_DR,T2_Ast,T2_TO,T2_Stl,T2_Blk,T2_PF,ScoreDiff
0,2003,1421,92,1411,84,32,69,11,29,17,...,14,31,17,28,16,15,5,0,22,8
1,2003,1112,80,1436,51,31,66,7,23,11,...,7,7,8,26,12,17,10,3,15,29
2,2003,1113,84,1272,71,31,59,6,14,16,...,14,21,20,22,11,12,2,5,18,13
3,2003,1141,79,1166,73,29,53,3,7,18,...,12,17,14,17,20,21,6,6,21,6
4,2003,1143,76,1301,74,27,64,7,20,15,...,15,20,10,26,16,14,5,8,19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226259,2024,1201,65,1424,67,23,49,5,21,14,...,18,23,8,22,8,12,3,3,15,-2
226260,2024,1461,76,1429,84,28,67,9,25,11,...,15,21,4,23,11,10,4,8,17,-8
226261,2024,1156,68,1454,71,26,70,7,23,9,...,12,20,11,31,13,18,3,4,16,-3
226262,2024,1273,60,1459,73,22,53,8,19,8,...,7,11,6,21,17,6,6,1,13,-13


In [6]:
t1_m, t2_m = build_t1_t2('M')

display(t1_m)
display(t2_m)

Unnamed: 0,Season,T1_TeamID,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,...,T1_opponent_FTM_Avg,T1_opponent_FTA_Avg,T1_opponent_OR_Avg,T1_opponent_DR_Avg,T1_opponent_Ast_Avg,T1_opponent_TO_Avg,T1_opponent_Stl_Avg,T1_opponent_Blk_Avg,T1_opponent_PF_Avg,T1_ScoreDiff_Avg
0,2003,1102,19.142857,39.785714,7.821429,20.821429,11.142857,17.107143,4.178571,16.821429,...,13.678571,19.250000,9.607143,20.142857,9.142857,12.964286,5.428571,1.571429,18.357143,0.250000
1,2003,1103,27.148148,55.851852,5.444444,16.074074,19.037037,25.851852,9.777778,19.925926,...,15.925926,22.148148,12.037037,22.037037,15.481481,15.333333,6.407407,2.851852,22.444444,0.629630
2,2003,1104,23.965517,57.000000,6.310345,19.586207,14.793103,20.758621,13.413793,23.793103,...,12.482759,17.448276,10.965517,22.620690,11.793103,13.655172,5.379310,3.137931,19.172414,3.965517
3,2003,1105,24.384615,61.615385,7.576923,20.769231,15.423077,21.846154,13.500000,23.115385,...,16.384615,24.500000,13.192308,26.384615,15.807692,18.807692,9.384615,4.192308,19.076923,-4.884615
4,2003,1106,23.428571,55.285714,6.107143,17.642857,10.642857,16.464286,12.285714,23.857143,...,15.535714,21.964286,11.321429,22.357143,11.785714,15.071429,8.785714,3.178571,16.142857,-0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,2024,1474,27.080000,62.640000,8.160000,25.480000,14.800000,20.960000,8.480000,23.480000,...,14.200000,19.440000,9.480000,25.800000,12.920000,10.880000,6.480000,4.800000,17.840000,-6.960000
7613,2024,1475,22.090909,55.318182,6.409091,19.636364,15.863636,21.636364,8.545455,25.000000,...,19.681818,26.909091,8.045455,23.454545,12.318182,11.090909,7.454545,3.318182,19.000000,-8.409091
7614,2024,1476,22.680000,57.880000,8.360000,27.360000,9.320000,13.200000,6.720000,22.600000,...,13.920000,18.520000,10.200000,26.960000,17.120000,11.680000,8.040000,4.280000,14.080000,-15.120000
7615,2024,1477,23.136364,59.590909,8.500000,28.454545,11.363636,16.500000,7.454545,20.363636,...,17.272727,23.545455,9.272727,26.863636,13.363636,12.454545,5.681818,3.545455,14.863636,-10.409091


Unnamed: 0,Season,T2_TeamID,T2_FGM_Avg,T2_FGA_Avg,T2_FGM3_Avg,T2_FGA3_Avg,T2_FTM_Avg,T2_FTA_Avg,T2_OR_Avg,T2_DR_Avg,...,T2_opponent_FTM_Avg,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg
0,2003,1102,19.142857,39.785714,7.821429,20.821429,11.142857,17.107143,4.178571,16.821429,...,13.678571,19.250000,9.607143,20.142857,9.142857,12.964286,5.428571,1.571429,18.357143,0.250000
1,2003,1103,27.148148,55.851852,5.444444,16.074074,19.037037,25.851852,9.777778,19.925926,...,15.925926,22.148148,12.037037,22.037037,15.481481,15.333333,6.407407,2.851852,22.444444,0.629630
2,2003,1104,23.965517,57.000000,6.310345,19.586207,14.793103,20.758621,13.413793,23.793103,...,12.482759,17.448276,10.965517,22.620690,11.793103,13.655172,5.379310,3.137931,19.172414,3.965517
3,2003,1105,24.384615,61.615385,7.576923,20.769231,15.423077,21.846154,13.500000,23.115385,...,16.384615,24.500000,13.192308,26.384615,15.807692,18.807692,9.384615,4.192308,19.076923,-4.884615
4,2003,1106,23.428571,55.285714,6.107143,17.642857,10.642857,16.464286,12.285714,23.857143,...,15.535714,21.964286,11.321429,22.357143,11.785714,15.071429,8.785714,3.178571,16.142857,-0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,2024,1474,27.080000,62.640000,8.160000,25.480000,14.800000,20.960000,8.480000,23.480000,...,14.200000,19.440000,9.480000,25.800000,12.920000,10.880000,6.480000,4.800000,17.840000,-6.960000
7613,2024,1475,22.090909,55.318182,6.409091,19.636364,15.863636,21.636364,8.545455,25.000000,...,19.681818,26.909091,8.045455,23.454545,12.318182,11.090909,7.454545,3.318182,19.000000,-8.409091
7614,2024,1476,22.680000,57.880000,8.360000,27.360000,9.320000,13.200000,6.720000,22.600000,...,13.920000,18.520000,10.200000,26.960000,17.120000,11.680000,8.040000,4.280000,14.080000,-15.120000
7615,2024,1477,23.136364,59.590909,8.500000,28.454545,11.363636,16.500000,7.454545,20.363636,...,17.272727,23.545455,9.272727,26.863636,13.363636,12.454545,5.681818,3.545455,14.863636,-10.409091


In [7]:
def build_tourney(gender):
    tourney = build_tx(gender)[['Season', 'T1_TeamID', 'T2_TeamID' , 'ScoreDiff']]
    t1, t2  = build_t1_t2(gender)
    
    tourney = pd.merge(tourney, t1, on=['Season', 'T1_TeamID'], how='left')
    tourney = pd.merge(tourney, t2, on=['Season', 'T2_TeamID'], how='left')
    tourney = tourney.groupby(by=['T1_TeamID', 'T2_TeamID']).mean()
    toruney = tourney.drop('Season', axis=1)
    
    return tourney

In [8]:
tourney_m = build_tourney('M')
tourney_w = build_tourney('W')

display(tourney_m)
display(tourney_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Season,ScoreDiff,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,...,T2_opponent_FTM_Avg,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,2018.0,4.0,25.518519,56.481481,6.481481,19.962963,13.148148,18.666667,9.037037,23.222222,...,12.965517,18.137931,9.172414,26.034483,14.586207,13.965517,5.586207,3.586207,18.724138,-6.034483
1101,1115,2015.0,8.0,21.428571,52.928571,7.250000,19.178571,10.892857,14.964286,8.250000,19.642857,...,16.250000,24.156250,12.250000,23.250000,13.000000,16.187500,7.500000,2.718750,17.062500,-7.125000
1101,1116,2022.5,-11.5,25.353636,59.185455,6.470909,18.569091,15.126364,21.245455,8.797273,22.140909,...,14.197917,19.875000,8.531250,23.895833,12.703125,12.729167,6.494792,3.250000,13.817708,3.833333
1101,1117,2018.5,3.5,25.292593,55.824074,6.824074,19.498148,13.324074,18.716667,8.951852,22.911111,...,16.600000,22.833333,10.300000,25.516667,14.050000,12.750000,5.816667,3.666667,19.300000,-6.250000
1101,1122,2021.0,8.0,26.480000,59.280000,7.760000,21.320000,13.480000,19.400000,9.640000,22.600000,...,12.120000,17.800000,8.080000,21.520000,11.800000,15.120000,7.720000,2.520000,4.080000,1.560000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1384,2024.0,37.0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,...,13.727273,19.590909,9.363636,22.863636,13.681818,11.045455,7.136364,4.272727,16.727273,-10.318182
1478,1437,2024.0,-26.0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,...,11.333333,15.791667,6.875000,23.291667,13.166667,10.541667,5.666667,4.166667,15.000000,5.916667
1478,1447,2024.0,-23.0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,...,10.700000,15.600000,7.800000,24.700000,14.050000,9.950000,4.750000,3.250000,13.750000,-2.400000
1478,1467,2024.0,-10.0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,...,13.347826,18.913043,11.304348,24.478261,13.478261,15.478261,7.086957,3.521739,16.652174,2.000000


Unnamed: 0_level_0,Unnamed: 1_level_0,Season,ScoreDiff,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,...,T2_opponent_FTM_Avg,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,2015.0,36.0,21.720000,57.600000,7.280000,26.040000,14.320000,20.440000,15.320000,24.560000,...,14.966667,21.933333,16.000000,31.666667,17.666667,14.400000,9.133333,4.333333,17.266667,-23.500000
3101,3106,2023.0,11.0,25.555556,58.000000,8.296296,24.444444,10.444444,15.111111,9.666667,22.000000,...,12.900000,20.033333,12.233333,21.700000,12.500000,15.566667,8.800000,4.300000,19.066667,-7.500000
3101,3114,2022.0,5.0,24.269231,57.423077,8.730769,25.076923,13.653846,18.192308,9.653846,20.076923,...,13.000000,18.360000,8.480000,23.000000,9.760000,16.520000,6.080000,2.440000,16.840000,1.280000
3101,3116,2018.5,-13.0,23.893899,54.519231,8.144562,24.625332,12.529178,16.830239,11.941645,23.077586,...,12.447343,17.360057,13.474383,30.582068,14.612903,15.834440,5.896584,3.833966,16.904649,-1.771347
3101,3117,2022.0,12.0,24.269231,57.423077,8.730769,25.076923,13.653846,18.192308,9.653846,20.076923,...,14.520000,21.200000,11.000000,25.520000,15.680000,15.680000,7.720000,2.960000,18.480000,0.040000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3425,2024.0,-51.0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,...,11.272727,14.818182,7.818182,22.227273,12.500000,15.500000,6.000000,2.545455,18.272727,13.636364
3478,3433,2024.0,-23.0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,...,9.458333,13.166667,6.833333,18.750000,9.166667,13.416667,6.375000,2.541667,17.083333,9.875000
3478,3447,2024.0,11.0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,...,12.950000,18.850000,11.550000,23.700000,15.600000,17.100000,9.450000,2.800000,17.700000,-15.300000
3478,3467,2024.0,11.0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,...,12.608696,18.086957,10.000000,22.347826,12.043478,16.217391,8.782609,3.782609,18.130435,-4.130435


# Model building

In [9]:
def score_dataset(lgbm_params, X, y):
    reg   = lgb.LGBMRegressor(**lgbm_params)
    score = cross_val_score(reg, X, y)
    score = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': 'cpu',
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=5, show_progress_bar=True)

    return study.best_params

In [10]:
def build_x_y(df):
    target_column = 'ScoreDiff'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [11]:
X_m, y_m = build_x_y(tourney_m)
X_w, y_w = build_x_y(tourney_w)

In [12]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [13]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    reg_test = lgb.LGBMRegressor(**params)
    reg_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(reg_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(reg_test.score(X_train, y_train)))

In [14]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14014
[LightGBM] [Info] Number of data points in the train set: 31827, number of used features: 55
[LightGBM] [Info] Start training from score 0.013193
LightGBM Model accuracy score: 0.4686
LightGBM Model accuracy score [train]: 0.4759
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14006
[LightGBM] [Info] Number of data points in the train set: 22110, number of used features: 55
[LightGBM] [Info] Start training from score 0.129375
LightGBM Model accuracy score: 0.5901
LightGBM Model accuracy score [train]: 0.6023


# Prediction

In [15]:
def build_proba(X, y, params):
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)
    
    pred = reg.predict(X)
    proba = X
    proba['Probability'] = (pred - pred.min()) / (pred.max() - pred.min())

    return proba['Probability']

In [16]:
proba_m = build_proba(X_m, y_m, params_m)
proba_w = build_proba(X_w, y_w, params_w)

display(proba_m)
display(proba_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14013
[LightGBM] [Info] Number of data points in the train set: 45468, number of used features: 55
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14008
[LightGBM] [Info] Number of data points in the train set: 31586, number of used features: 55


T1_TeamID  T2_TeamID
1101       1102         0.546031
           1115         0.440166
           1116         0.490534
           1117         0.662004
           1122         0.709149
                          ...   
1478       1384         0.613374
           1437         0.349942
           1447         0.499256
           1467         0.464676
           1476         0.679698
Name: Probability, Length: 45468, dtype: float64

T1_TeamID  T2_TeamID
3101       3102         0.693942
           3106         0.639185
           3114         0.504462
           3116         0.508657
           3117         0.506559
                          ...   
3478       3425         0.136967
           3433         0.287094
           3447         0.584575
           3467         0.477992
           3476         0.617015
Name: Probability, Length: 31586, dtype: float64

In [17]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    slots = slots[slots['Season'] == 2023]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [18]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [19]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [20]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [21]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, proba, debug):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - proba (DF): DF that includes wins prediction per matchup.
    - debug (dict): Debug info.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]
        winner = None
        
        try:
            team_1_prob = proba.loc[team_1, team_2]
            
            debug['found'] += 1
            winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])
        except KeyError:
            debug['miss'] += 1
            winner = np.random.choice([team_1, team_2])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        debug['count'] += 1
        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, proba, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []
    
    debug = {
        'count': 0,
        'miss': 0,
        'found': 0
    }

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, proba, debug)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)
        
    print("Found %: {}%".format(debug['found'] * 100.0 / debug['count']))
    print("Not found %: {}%".format(debug['miss'] * 100.0 / debug['count']))
    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [22]:
n_brackets = 5000
result_m = run_simulation(seeds_2024_m, slots_m, proba_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, proba_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 5000/5000 [00:20<00:00, 245.31it/s]


Found %: 78.44603174603175%
Not found %: 21.553968253968254%


100%|██████████| 5000/5000 [00:20<00:00, 245.27it/s]

Found %: 71.51269841269841%
Not found %: 28.487301587301587%





In [23]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W12
...,...,...,...,...
629995,W,5000,R4Y1,Y12
629996,W,5000,R4Z1,Z13
629997,W,5000,R5WX,X01
629998,W,5000,R5YZ,Y12


In [24]:
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- https://www.kaggle.com/code/rustyb/paris-madness-2023
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.