In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

# Raise Optuna's log threshold to WARNING so lower-severity messages are not printed.
op.logging.set_verbosity(op.logging.WARNING)

In [2]:
# Directory that is searched for the competition's *.csv files.
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
# Read every CSV found directly under DATA_DIR into one dict.
# Keys are the file names truncated at their first dot (e.g. "MTeams").
CSV = {
    os.path.basename(csv_path).split('.')[0]: pd.read_csv(csv_path, encoding='cp1252')
    for csv_path in glob.glob(DATA_DIR + "/*.csv")
}

# Statistically based prediction

## T1 vs T2

In [4]:
def build_tx(gender):
    """
    Build a game-level DF in which every game appears twice: once from the
    winner's point of view and once from the loser's.

    The tourney and regular-season detailed results for `gender` are stacked,
    the 'DayNum', 'NumOT' and 'WLoc' columns are dropped, and the remaining
    columns are relabelled in two ways:
    - first copy:  leading 'W' => 'T1_', leading 'L' => 'T2_'
    - second copy: leading 'L' => 'T1_', leading 'W' => 'T2_'

    Only a *leading* 'W' / 'L' is rewritten, so a column that merely contains
    one of those letters elsewhere in its name is left untouched.

    Parameters:
    - gender (str): Prefix of the CSV names to use (e.g. 'M' or 'W').
    Returns:
    - pd.DataFrame: Both copies concatenated (fresh 0..n-1 index) plus a
      'ScoreDiff' column equal to T1_Score - T2_Score.
    """
    csv_names = [gender + name for name in ('NCAATourneyDetailedResults', 'RegularSeasonDetailedResults')]
    games = pd.concat([CSV[name] for name in csv_names])
    games = games.drop(['DayNum', 'NumOT', 'WLoc'], axis=1)

    def relabel(column, winner_prefix, loser_prefix):
        # Swap only the one-letter W/L prefix; anything else passes through unchanged.
        if column.startswith('W'):
            return winner_prefix + column[1:]
        if column.startswith('L'):
            return loser_prefix + column[1:]
        return column

    results_t1 = games.rename(columns=lambda c: relabel(c, 'T1_', 'T2_'))
    results_t2 = games.rename(columns=lambda c: relabel(c, 'T2_', 'T1_'))

    # concat aligns on column names, so the mirrored copy lines up with the first one.
    results = pd.concat([results_t1, results_t2]).reset_index(drop=True)
    results['ScoreDiff'] = results['T1_Score'] - results['T2_Score']

    return results

def build_t1_t2(gender):
    """
    Compute per-(Season, team) averages of the game stats and return them twice:
    - once with every column prefixed by T1_
    - once with every column prefixed by T2_

    In both frames the stats of the other side of each game are labelled
    '<prefix>opponent_<stat>_Avg'. The key columns are named 'Season' and
    '<prefix>TeamID' so the frames can be merged on them.

    Returns:
    - tuple: (T1-prefixed DF, T2-prefixed DF) holding the same values.
    """
    season_avgs = (
        build_tx(gender)
        .drop(['T1_Score', 'T2_Score', 'T2_TeamID'], axis=1)
        .groupby(by=['Season', 'T1_TeamID'])
        .mean()
        .reset_index()
    )

    def with_prefix(prefix):
        # Relabel an independent copy so the two returned frames never share state.
        frame = season_avgs.copy()
        frame.columns = [
            prefix + name.replace("T1_", "").replace("T2_", "opponent_") + "_Avg"
            for name in season_avgs.columns
        ]
        # The key columns picked up the prefix/suffix as well; restore their plain names.
        return frame.rename(columns={
            prefix + 'TeamID_Avg': prefix + 'TeamID',
            prefix + 'Season_Avg': 'Season',
        })

    return (with_prefix("T1_"), with_prefix("T2_"))

In [5]:
# Build the mirrored game-level table for prefix 'M' and show it as the cell output.
ref = build_tx('M')

ref

Unnamed: 0,Season,T1_TeamID,T1_Score,T2_TeamID,T2_Score,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_FTM,...,T2_FTM,T2_FTA,T2_OR,T2_DR,T2_Ast,T2_TO,T2_Stl,T2_Blk,T2_PF,ScoreDiff
0,2003,1421,92,1411,84,32,69,11,29,17,...,14,31,17,28,16,15,5,0,22,8
1,2003,1112,80,1436,51,31,66,7,23,11,...,7,7,8,26,12,17,10,3,15,29
2,2003,1113,84,1272,71,31,59,6,14,16,...,14,21,20,22,11,12,2,5,18,13
3,2003,1141,79,1166,73,29,53,3,7,18,...,12,17,14,17,20,21,6,6,21,6
4,2003,1143,76,1301,74,27,64,7,20,15,...,15,20,10,26,16,14,5,8,19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226259,2024,1201,65,1424,67,23,49,5,21,14,...,18,23,8,22,8,12,3,3,15,-2
226260,2024,1461,76,1429,84,28,67,9,25,11,...,15,21,4,23,11,10,4,8,17,-8
226261,2024,1156,68,1454,71,26,70,7,23,9,...,12,20,11,31,13,18,3,4,16,-3
226262,2024,1273,60,1459,73,22,53,8,19,8,...,7,11,6,21,17,6,6,1,13,-13


In [6]:
# Per-season team averages for prefix 'M', in their T1_- and T2_-prefixed forms.
t1_m, t2_m = build_t1_t2('M')

display(t1_m)
display(t2_m)

Unnamed: 0,Season,T1_TeamID,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,...,T1_opponent_FTM_Avg,T1_opponent_FTA_Avg,T1_opponent_OR_Avg,T1_opponent_DR_Avg,T1_opponent_Ast_Avg,T1_opponent_TO_Avg,T1_opponent_Stl_Avg,T1_opponent_Blk_Avg,T1_opponent_PF_Avg,T1_ScoreDiff_Avg
0,2003,1102,19.142857,39.785714,7.821429,20.821429,11.142857,17.107143,4.178571,16.821429,...,13.678571,19.250000,9.607143,20.142857,9.142857,12.964286,5.428571,1.571429,18.357143,0.250000
1,2003,1103,27.148148,55.851852,5.444444,16.074074,19.037037,25.851852,9.777778,19.925926,...,15.925926,22.148148,12.037037,22.037037,15.481481,15.333333,6.407407,2.851852,22.444444,0.629630
2,2003,1104,23.965517,57.000000,6.310345,19.586207,14.793103,20.758621,13.413793,23.793103,...,12.482759,17.448276,10.965517,22.620690,11.793103,13.655172,5.379310,3.137931,19.172414,3.965517
3,2003,1105,24.384615,61.615385,7.576923,20.769231,15.423077,21.846154,13.500000,23.115385,...,16.384615,24.500000,13.192308,26.384615,15.807692,18.807692,9.384615,4.192308,19.076923,-4.884615
4,2003,1106,23.428571,55.285714,6.107143,17.642857,10.642857,16.464286,12.285714,23.857143,...,15.535714,21.964286,11.321429,22.357143,11.785714,15.071429,8.785714,3.178571,16.142857,-0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,2024,1474,27.080000,62.640000,8.160000,25.480000,14.800000,20.960000,8.480000,23.480000,...,14.200000,19.440000,9.480000,25.800000,12.920000,10.880000,6.480000,4.800000,17.840000,-6.960000
7613,2024,1475,22.090909,55.318182,6.409091,19.636364,15.863636,21.636364,8.545455,25.000000,...,19.681818,26.909091,8.045455,23.454545,12.318182,11.090909,7.454545,3.318182,19.000000,-8.409091
7614,2024,1476,22.680000,57.880000,8.360000,27.360000,9.320000,13.200000,6.720000,22.600000,...,13.920000,18.520000,10.200000,26.960000,17.120000,11.680000,8.040000,4.280000,14.080000,-15.120000
7615,2024,1477,23.136364,59.590909,8.500000,28.454545,11.363636,16.500000,7.454545,20.363636,...,17.272727,23.545455,9.272727,26.863636,13.363636,12.454545,5.681818,3.545455,14.863636,-10.409091


Unnamed: 0,Season,T2_TeamID,T2_FGM_Avg,T2_FGA_Avg,T2_FGM3_Avg,T2_FGA3_Avg,T2_FTM_Avg,T2_FTA_Avg,T2_OR_Avg,T2_DR_Avg,...,T2_opponent_FTM_Avg,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg
0,2003,1102,19.142857,39.785714,7.821429,20.821429,11.142857,17.107143,4.178571,16.821429,...,13.678571,19.250000,9.607143,20.142857,9.142857,12.964286,5.428571,1.571429,18.357143,0.250000
1,2003,1103,27.148148,55.851852,5.444444,16.074074,19.037037,25.851852,9.777778,19.925926,...,15.925926,22.148148,12.037037,22.037037,15.481481,15.333333,6.407407,2.851852,22.444444,0.629630
2,2003,1104,23.965517,57.000000,6.310345,19.586207,14.793103,20.758621,13.413793,23.793103,...,12.482759,17.448276,10.965517,22.620690,11.793103,13.655172,5.379310,3.137931,19.172414,3.965517
3,2003,1105,24.384615,61.615385,7.576923,20.769231,15.423077,21.846154,13.500000,23.115385,...,16.384615,24.500000,13.192308,26.384615,15.807692,18.807692,9.384615,4.192308,19.076923,-4.884615
4,2003,1106,23.428571,55.285714,6.107143,17.642857,10.642857,16.464286,12.285714,23.857143,...,15.535714,21.964286,11.321429,22.357143,11.785714,15.071429,8.785714,3.178571,16.142857,-0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,2024,1474,27.080000,62.640000,8.160000,25.480000,14.800000,20.960000,8.480000,23.480000,...,14.200000,19.440000,9.480000,25.800000,12.920000,10.880000,6.480000,4.800000,17.840000,-6.960000
7613,2024,1475,22.090909,55.318182,6.409091,19.636364,15.863636,21.636364,8.545455,25.000000,...,19.681818,26.909091,8.045455,23.454545,12.318182,11.090909,7.454545,3.318182,19.000000,-8.409091
7614,2024,1476,22.680000,57.880000,8.360000,27.360000,9.320000,13.200000,6.720000,22.600000,...,13.920000,18.520000,10.200000,26.960000,17.120000,11.680000,8.040000,4.280000,14.080000,-15.120000
7615,2024,1477,23.136364,59.590909,8.500000,28.454545,11.363636,16.500000,7.454545,20.363636,...,17.272727,23.545455,9.272727,26.863636,13.363636,12.454545,5.681818,3.545455,14.863636,-10.409091


In [7]:
def build_matchups(gender):
    """
    Generate every ordered matchup (T1, T2) with T1 != T2 between the teams
    listed in the "<gender>Teams" CSV.

    The cached Teams DF is only read, never modified, so calling this function
    repeatedly has no side effects on CSV.

    Parameters:
    - gender (str): Prefix of the Teams CSV to use (e.g. 'M' or 'W').
    Returns:
    - pd.DataFrame: Columns 'T1_TeamID' and 'T2_TeamID', sorted by (T1, T2).
      Row labels are the positions in the full n x n grid, so the labels of the
      removed T1 == T2 rows are missing.
    """
    team_ids = np.sort(CSV["{}Teams".format(gender)]['TeamID'].unique())
    n_teams = len(team_ids)

    # Full ordered cross product: each id repeated n times against the whole id list.
    matchups = pd.DataFrame({
        'T1_TeamID': np.repeat(team_ids, n_teams),
        'T2_TeamID': np.tile(team_ids, n_teams),
    })

    # A team never plays itself.
    return matchups[matchups['T1_TeamID'] != matchups['T2_TeamID']]

In [8]:
# All ordered team pairs for each of the two prefixes.
matchups_m = build_matchups('M')
matchups_w = build_matchups('W')

display(matchups_m)
display(matchups_w)

Unnamed: 0,T1_TeamID,T2_TeamID
1,1101,1102
2,1101,1103
3,1101,1104
4,1101,1105
5,1101,1106
...,...,...
142878,1478,1473
142879,1478,1474
142880,1478,1475
142881,1478,1476


Unnamed: 0,T1_TeamID,T2_TeamID
1,3101,3102
2,3101,3103
3,3101,3104
4,3101,3105
5,3101,3106
...,...,...
141370,3478,3473
141371,3478,3474
141372,3478,3475
141373,3478,3476


In [9]:
def build_tourney(gender):
    """
    Build the modelling table: one row per ordered (T1_TeamID, T2_TeamID) pair.

    Steps:
    - start from every ordered pair returned by build_matchups
    - left-join the games between that pair (Season, ScoreDiff) from build_tx,
      which stacks tourney and regular-season results
    - left-join the per-season averages of T1 and of T2 for the season of each game
    - average all of a pair's rows, drop 'Season', then call interpolate()

    NOTE(review): pairs without any joined game are all-NaN before the last
    step, and interpolate() fills them (ScoreDiff included) from the
    neighbouring rows, i.e. from other pairs. Confirm this is intended.

    Returns:
    - pd.DataFrame: Indexed by (T1_TeamID, T2_TeamID); columns are 'ScoreDiff'
      plus the T1_*_Avg and T2_*_Avg features.
    """
    pair_grid = build_matchups(gender)
    game_diffs = build_tx(gender)[['Season', 'T1_TeamID', 'T2_TeamID', 'ScoreDiff']]
    t1_avgs, t2_avgs = build_t1_t2(gender)

    return (
        pair_grid
        .merge(game_diffs, on=['T1_TeamID', 'T2_TeamID'], how='left')
        .merge(t1_avgs, on=['Season', 'T1_TeamID'], how='left')
        .merge(t2_avgs, on=['Season', 'T2_TeamID'], how='left')
        .groupby(by=['T1_TeamID', 'T2_TeamID'])
        .mean()
        .drop('Season', axis=1)
        .interpolate()
    )

In [10]:
# Pair-level modelling tables (ScoreDiff target plus averaged features) for both prefixes.
tourney_m = build_tourney('M')
tourney_w = build_tourney('W')

display(tourney_m)
display(tourney_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,ScoreDiff,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_FTM_Avg,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,4.000000,25.518519,56.481481,6.481481,19.962963,13.148148,18.666667,9.037037,23.222222,13.888889,...,12.965517,18.137931,9.172414,26.034483,14.586207,13.965517,5.586207,3.586207,18.724138,-6.034483
1101,1103,4.307692,25.203907,56.208181,6.540598,19.902625,12.974664,18.381868,8.976496,22.946886,13.732601,...,13.218170,18.600879,9.409151,25.820292,14.464191,14.136439,5.733422,3.519479,18.596320,-6.118369
1101,1104,4.615385,24.889296,55.934880,6.599715,19.842287,12.801180,18.097070,8.915954,22.671551,13.576313,...,13.470822,19.063826,9.645889,25.606101,14.342175,14.307361,5.880637,3.452752,18.468501,-6.202255
1101,1105,4.923077,24.574685,55.661579,6.658832,19.781950,12.627696,17.812271,8.855413,22.396215,13.420024,...,13.723475,19.526774,9.882626,25.391910,14.220159,14.478282,6.027851,3.386025,18.340683,-6.286141
1101,1106,5.230769,24.260073,55.388278,6.717949,19.721612,12.454212,17.527473,8.794872,22.120879,13.263736,...,13.976127,19.989721,10.119363,25.177719,14.098143,14.649204,6.175066,3.319297,18.212865,-6.370027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,17.333333,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,13.729275,18.651014,10.568116,26.132754,15.906087,12.946087,7.722319,4.027246,14.937391,-9.413333
1478,1474,21.888889,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,13.792850,18.607343,10.445411,26.408502,16.310725,12.524058,7.828213,4.111498,14.651594,-11.315556
1478,1475,26.444444,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,13.856425,18.563671,10.322705,26.684251,16.715362,12.102029,7.934106,4.195749,14.365797,-13.217778
1478,1476,31.000000,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,13.920000,18.520000,10.200000,26.960000,17.120000,11.680000,8.040000,4.280000,14.080000,-15.120000


Unnamed: 0_level_0,Unnamed: 1_level_0,ScoreDiff,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_FTM_Avg,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,36.000000,21.720000,57.6,7.280000,26.040000,14.320000,20.440000,15.320000,24.560000,13.040000,...,14.966667,21.933333,16.000000,31.666667,17.666667,14.400000,9.133333,4.333333,17.266667,-23.500000
3101,3103,29.750000,22.678889,57.7,7.534074,25.641111,13.351111,19.107778,13.906667,23.920000,13.261481,...,14.450000,21.458333,15.058333,29.175000,16.375000,14.691667,9.050000,4.325000,17.716667,-19.500000
3101,3104,23.500000,23.637778,57.8,7.788148,25.242222,12.382222,17.775556,12.493333,23.280000,13.482963,...,13.933333,20.983333,14.116667,26.683333,15.083333,14.983333,8.966667,4.316667,18.166667,-15.500000
3101,3105,17.250000,24.596667,57.9,8.042222,24.843333,11.413333,16.443333,11.080000,22.640000,13.704444,...,13.416667,20.508333,13.175000,24.191667,13.791667,15.275000,8.883333,4.308333,18.616667,-11.500000
3101,3106,11.000000,25.555556,58.0,8.296296,24.444444,10.444444,15.111111,9.666667,22.000000,13.925926,...,12.900000,20.033333,12.233333,21.700000,12.500000,15.566667,8.800000,4.300000,19.066667,-7.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,17.666667,20.260870,57.0,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,10.492754,15.594203,10.956522,23.942029,13.492754,14.768116,10.637681,3.550725,17.115942,-13.173913
3478,3474,18.777778,20.260870,57.0,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,10.140097,15.178744,11.115942,24.207729,13.734300,14.526570,10.946860,3.512077,16.946860,-14.681159
3478,3475,19.888889,20.260870,57.0,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,9.787440,14.763285,11.275362,24.473430,13.975845,14.285024,11.256039,3.473430,16.777778,-16.188406
3478,3476,21.000000,20.260870,57.0,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,9.434783,14.347826,11.434783,24.739130,14.217391,14.043478,11.565217,3.434783,16.608696,-17.695652


# Model building

In [11]:
def score_dataset(lgbm_params, X, y):
    """
    Cross-validate a LightGBM regressor and collapse the fold scores into a
    single value where lower is better.

    The value is std(fold scores) - mean(fold scores), so a high and stable
    cross-validation score gives a low result. cross_val_score is called with
    its defaults (no explicit `scoring` or `cv`).
    """
    fold_scores = cross_val_score(lgb.LGBMRegressor(**lgbm_params), X, y)

    return fold_scores.std() - fold_scores.mean()

def objective(trial, X, y):
    """
    Optuna objective: sample one LightGBM configuration from `trial` and return
    its score_dataset value on (X, y).

    'device_type' and 'verbose' are fixed here rather than suggested.

    NOTE(review): in LightGBM, 'subsample' presumably has no effect unless
    'subsample_freq' is also set to a positive value. Confirm whether tuning it
    here changes anything.
    """
    colsample_choices = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    subsample_choices = [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
    learning_rate_choices = [0.006, 0.008, 0.01, 0.014, 0.017, 0.02]
    max_depth_choices = [10, 20, 100]

    # Keyword arguments are evaluated left to right, so the suggestion order is preserved.
    params = dict(
        reg_alpha=trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        colsample_bytree=trial.suggest_categorical('colsample_bytree', colsample_choices),
        subsample=trial.suggest_categorical('subsample', subsample_choices),
        learning_rate=trial.suggest_categorical('learning_rate', learning_rate_choices),
        max_depth=trial.suggest_categorical('max_depth', max_depth_choices),
        num_leaves=trial.suggest_int('num_leaves', 5, 31),
        n_estimators=trial.suggest_int('n_estimators', 1, 100),
        min_child_samples=trial.suggest_int('min_child_samples', 20, 300),
        device_type='cpu',
        verbose=-1,
    )

    return score_dataset(params, X, y)

def study(X, y, n_trials=5, seed=None):
    """
    Run an Optuna search over the `objective` space and return the best
    parameters found.

    Parameters:
    - X, y: Features and target forwarded to `objective`.
    - n_trials (int): Number of trials to run (default 5, the previous fixed value).
    - seed (int | None): Seed for the TPE sampler. None keeps the previous
      unseeded behaviour; pass an int to make the search repeatable.
    Returns:
    - dict: `best_params` of the study. It holds only the values suggested via
      `trial.suggest_*`, not the constants that `objective` adds itself.
    """
    sampler = op.samplers.TPESampler(seed=seed)
    # Named differently from the function so the local does not shadow `study`.
    tuning_study = op.create_study(sampler=sampler)
    tuning_study.optimize(
        lambda trial: objective(trial, X, y),
        n_trials=n_trials,
        show_progress_bar=True,
    )

    return tuning_study.best_params

In [12]:
def build_x_y(df):
    """
    Split `df` into features and target.

    Returns:
    - tuple: (df without the first 'ScoreDiff' column, the 'ScoreDiff' column).
    Raises:
    - ValueError: if `df` has no 'ScoreDiff' column.
    """
    target = 'ScoreDiff'
    feature_names = list(df.columns)
    # list.index raises ValueError when the target is absent.
    del feature_names[feature_names.index(target)]

    features = df[feature_names]
    labels = df[target]
    return features, labels

In [13]:
# Separate the ScoreDiff target from the feature columns of each pair-level table.
X_m, y_m = build_x_y(tourney_m)
X_w, y_w = build_x_y(tourney_w)

In [14]:
# Run a separate hyper-parameter search for each of the two datasets.
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [15]:
def accuracy(X, y, params, random_state=None):
    """
    Fit a LightGBM regressor on a 70/30 hold-out split and print its scores on
    the test and train parts.

    The printed value is what `LGBMRegressor.score` returns, i.e. the
    coefficient of determination (R^2), not a classification accuracy. The
    labels therefore say "R^2".

    Parameters:
    - X, y: Features and target.
    - params (dict): Keyword arguments for `lgb.LGBMRegressor`.
    - random_state (int | None): Forwarded to `train_test_split`. None keeps
      the previous behaviour of a different split on every call.
    Returns:
    - None. The scores are only printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )

    reg_test = lgb.LGBMRegressor(**params)
    reg_test.fit(X_train, y_train)

    print('LightGBM Model R^2 score [test]: {0:0.4f}'.format(reg_test.score(X_test, y_test)))
    print('LightGBM Model R^2 score [train]: {0:0.4f}'.format(reg_test.score(X_train, y_train)))

In [16]:
# Print hold-out and train scores for each model, using the tuned parameters.
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13770
[LightGBM] [Info] Number of data points in the train set: 99754, number of used features: 54
[LightGBM] [Info] Start training from score -0.997970
LightGBM Model accuracy score: 0.4743
LightGBM Model accuracy score [train]: 0.4823
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13770
[LightGBM] [Info] Number of data points in the train set: 98700, number of used features: 54
[LightGBM] [Info] Start training from score -1.558012
LightGBM Model accuracy score: 0.5560
LightGBM Model accuracy score [train]: 0.5584


# Prediction

In [17]:
def build_proba(X, y, params):
    """
    Fit a LightGBM regressor on (X, y), predict on the same X and min-max scale
    the predictions into [0, 1].

    `X` is left untouched: the result is returned as a new Series instead of
    being written into the caller's DataFrame, so the features cannot pick up a
    'Probability' column between runs.

    NOTE(review): the scaled value is a rescaled predicted score difference,
    not a calibrated probability, and nothing forces P(T1, T2) + P(T2, T1) == 1.

    Parameters:
    - X (pd.DataFrame): Features. Its index is reused for the result.
    - y: Target values.
    - params (dict): Keyword arguments for `lgb.LGBMRegressor`.
    Returns:
    - pd.Series: Named 'Probability', indexed like X. If all predictions are
      equal, every value is 0.5 instead of the NaN a 0/0 division would give.
    """
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)

    pred = reg.predict(X)
    lowest = pred.min()
    spread = pred.max() - lowest

    if spread == 0:
        # Constant predictions carry no ranking information: treat every matchup as a coin flip.
        scaled = np.full(len(pred), 0.5)
    else:
        scaled = (pred - lowest) / spread

    return pd.Series(scaled, index=X.index, name='Probability')

In [18]:
# Scaled per-matchup scores for each dataset, from models fitted with the tuned parameters.
proba_m = build_proba(X_m, y_m, params_m)
proba_w = build_proba(X_w, y_w, params_w)

display(proba_m)
display(proba_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.115084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13770
[LightGBM] [Info] Number of data points in the train set: 142506, number of used features: 54
[LightGBM] [Info] Start training from score -1.002260
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.153872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13770
[LightGBM] [Info] Number of data points in the train set: 141000, number of used features: 54
[LightGBM] [Info] Start training from score -1.501870


T1_TeamID  T2_TeamID
1101       1102         0.546204
           1103         0.543631
           1104         0.524747
           1105         0.521981
           1106         0.514706
                          ...   
1478       1473         0.638447
           1474         0.674757
           1475         0.679216
           1476         0.683104
           1477         0.683104
Name: Probability, Length: 142506, dtype: float64

T1_TeamID  T2_TeamID
3101       3102         0.716364
           3103         0.718179
           3104         0.723113
           3105         0.682552
           3106         0.648423
                          ...   
3478       3473         0.538867
           3474         0.545985
           3475         0.545985
           3476         0.545985
           3477         0.545985
Name: Probability, Length: 141000, dtype: float64

In [19]:
def build_slots(gender, season=2023):
    """
    Return the round slots (slot names starting with 'R') of one season from
    the "<gender>NCAATourneySlots" CSV.

    Parameters:
    - gender (str): Prefix of the slots CSV to use (e.g. 'M' or 'W').
    - season (int): Season whose slots are selected. Defaults to 2023, the
      value that used to be hard-coded.
      NOTE(review): the seeds used later come from the 2024 seeds file, so
      confirm that the 2023 default is still the intended bracket layout.
    Returns:
    - pd.DataFrame: Matching rows of the slots CSV with their original row labels.
    """
    slots = CSV["{}NCAATourneySlots".format(gender)]
    in_season = slots['Season'] == season
    # Literal prefix test instead of a regex search; rows with a missing slot are excluded.
    is_round_slot = slots['Slot'].str.startswith('R', na=False)

    return slots[in_season & is_round_slot]

In [20]:
# Round slots used to drive the simulation, one table per prefix.
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [21]:
def build_seeds_2024():
    """
    Split the '2024_tourney_seeds' CSV by its 'Tournament' column.

    Returns:
    - tuple: (rows where Tournament == 'M', rows where Tournament == 'W').
    """
    all_seeds = CSV['2024_tourney_seeds']
    by_tournament = {
        code: all_seeds[all_seeds['Tournament'] == code]
        for code in ('M', 'W')
    }

    return by_tournament['M'], by_tournament['W']

In [22]:
# Seed tables for the two tournaments ('M' rows first, then 'W' rows).
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [23]:
def prepare_data(seeds):
    """
    Build the two lookup tables used by the simulation.

    Parameters:
    - seeds (pd.DataFrame): Must have 'Seed' and 'TeamID' columns.
    Returns:
    - dict: Seed -> TeamID.
    - dict: TeamID -> Seed (the reverse mapping of the first dict).
    """
    seed_to_team = dict(zip(seeds['Seed'], seeds['TeamID']))

    team_to_seed = {}
    for seed, team in seed_to_team.items():
        team_to_seed[team] = seed

    return seed_to_team, team_to_seed


def simulate(round_slots, seeds, inverted_seeds, proba):
    '''
    Play one bracket, slot by slot, in the order given by round_slots.

    Parameters:
    - round_slots: Object exposing Slot, StrongSeed and WeakSeed sequences
      (e.g. a DataFrame with those columns).
    - seeds (dict): Maps seed labels / earlier slot names to team IDs. It is
      updated in place: each slot name is bound to the winner of that slot so
      later rounds can look it up.
    - inverted_seeds (dict): Maps team IDs back to their seed labels.
    - proba: Lookup indexed by (team_1, team_2) that gives the chance of team_1
      (the StrongSeed side) winning.
    Returns:
    - list: Seed label of the winner of each slot.
    - list: Slot name of each match, in the same order.
    '''
    played = []

    matchups = zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed)
    for slot_name, strong_seed, weak_seed in matchups:
        contenders = [seeds[strong_seed], seeds[weak_seed]]

        p_first = proba.loc[contenders[0], contenders[1]]
        champion = np.random.choice(contenders, p=[p_first, 1 - p_first])

        # Make the winner addressable by slot name for the following rounds.
        seeds[slot_name] = champion
        played.append((slot_name, champion))

    winner_seeds = [inverted_seeds[team] for _, team in played]
    slot_names = [slot_name for slot_name, _ in played]
    return winner_seeds, slot_names


def run_simulation(seeds, round_slots, proba, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information ('Seed', 'TeamID').
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - proba: Per-matchup win lookup forwarded to `simulate`.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: One row per simulated match with columns
      'Bracket' (1-based bracket number), 'Slot' and 'Team' (seed label of the winner).
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # simulate() records slot winners in the dict it receives. Give each
        # bracket its own copy so winners from one bracket are never visible
        # to the next.
        r, s = simulate(round_slots, dict(seed_dict), inverted_seed_dict, proba)

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [24]:
# Number of brackets simulated per tournament.
n_brackets = 5000
# Seed NumPy's global RNG (the one np.random.choice draws from in simulate) so the
# bracket draws are repeatable for a given set of matchup probabilities.
np.random.seed(2024)
result_m = run_simulation(seeds_2024_m, slots_m, proba_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, proba_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 5000/5000 [00:29<00:00, 168.74it/s]
100%|██████████| 5000/5000 [00:29<00:00, 172.12it/s]


In [25]:
# Stack both result tables under a fresh 0..n-1 index named 'RowId'.
submission = pd.concat([result_m, result_w], ignore_index=True)
submission.index.name = 'RowId'

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
629995,W,5000,R4Y1,Y08
629996,W,5000,R4Z1,Z07
629997,W,5000,R5WX,X09
629998,W,5000,R5YZ,Z07


In [26]:
# Write the submission file; the index is written too (to_csv is called with its defaults).
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- https://www.kaggle.com/code/rustyb/paris-madness-2023
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.