In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

op.logging.set_verbosity(op.logging.WARNING)

In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

# Statistically based prediction

## T1 vs T2

In [4]:
def build_tx(gender):
    """
    Build the DF that includes T1 vs T2 and T2 vs T1 matchups.
    Concat two exact same DFs, but in one replace:
    - W => T1
    - L => T2
    and in the other one:
    - W => T2
    - L => T1
    """
    csv_names = ['NCAATourneyDetailedResults', 'RegularSeasonDetailedResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))

    results_t1 = pd.concat(csvs)
    results_t1 = results_t1.drop(['DayNum', 'NumOT', 'WLoc'], axis=1)
    
    results_t2 = results_t1.copy()
        
    results_t1.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(results_t1.columns)]
    results_t2.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(results_t2.columns)]
    
    results = pd.concat([results_t1, results_t2]).reset_index(drop=True)
    results['ScoreDiff'] = results['T1_Score'] - results['T2_Score']
    results['Win'] = np.where(results['ScoreDiff'] > 0, 1, 0)

    return results

def build_t1_t2(gender):
    """
    Generate two DFs:
    - One preffixed by T1_
    - One preffixed by T2_
    """

    t1 = build_tx(gender)
    t1 = t1.drop(['T1_Score', 'T2_Score', 'T2_TeamID'], axis=1)
    t1 = t1.groupby(by=['Season', 'T1_TeamID']).mean()
    t2 = t1.copy()
    t2.index.names = ['Season', 'T2_TeamID']

    t1.columns = ["T1_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t1.columns)]
    t2.columns = ["T2_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t2.columns)]
    
    return (t1, t2)

In [5]:
def build_matchups(gender):
    """
    Generate a matchup DF. Each entry has matchups T1 vs each team (T2) as an Index in a 1-N relation.
    """
    teams = CSV["{}Teams".format(gender)]
    teams['T1_TeamID'] = teams['TeamID']
    teams['T2_TeamID'] = [teams['TeamID'].values.tolist() for i in teams.index]
    teams = teams.explode('T2_TeamID')
    teams = teams.groupby(['T1_TeamID', 'T2_TeamID']).sum()
    teams = teams.reset_index()
    teams = teams[['T1_TeamID', 'T2_TeamID']]
    teams = teams[teams['T1_TeamID'] != teams['T2_TeamID']]
    teams = teams.set_index(['T1_TeamID', 'T2_TeamID'])

    return teams

In [6]:
def build_fill(gender):
    """
    Build a DF with the same format as DF, used to fill NaN matchup values with overall AVGs.
    """
    matchups = build_matchups(gender)
    t1, t2   = build_t1_t2(gender)
    
    t1 = t1.groupby('T1_TeamID').mean()
    t2 = t2.groupby('T2_TeamID').mean()
    
    matchups = matchups.join(t1, on=['T1_TeamID'], how='inner')
    matchups = matchups.join(t2, on=['T2_TeamID'], how='inner')
    
    return matchups

In [7]:
def build_tourney(gender):
    """
    Generate a tourney DF. Each entry has matchups T1 vs each team (T2) as an Index in a 1-N relation.
    """
    tourney  = build_tx(gender)[['Season', 'T1_TeamID', 'T2_TeamID' , 'Win']]
    t1, t2   = build_t1_t2(gender)
    matchups = build_matchups(gender)
    fill     = build_fill(gender)
    
    tourney = pd.merge(matchups, tourney, on=['T1_TeamID', 'T2_TeamID'], how='left')
    tourney = pd.merge(tourney, t1, on=['Season', 'T1_TeamID'], how='left')
    tourney = pd.merge(tourney, t2, on=['Season', 'T2_TeamID'], how='left')
    tourney = tourney.groupby(by=['T1_TeamID', 'T2_TeamID']).mean()
    tourney = tourney.drop('Season', axis=1)
    tourney = tourney.fillna(fill)
    
    return tourney

In [8]:
def build_train_tourney(gender):
    """
    Build a train and tourney dataset, Train includes the Win column while Tourney may have NaNs.
    """
    tourney = build_tourney(gender)

    train = tourney[tourney['Win'].notnull()]
    train['Win'] = train['Win'].astype(int)
    
    return (train, tourney)

In [9]:
train_m, tourney_m = build_train_tourney('M')
train_w, tourney_w = build_train_tourney('W')

display(train_m)
display(tourney_m)
display(train_w)
display(tourney_w)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Win'] = train['Win'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Win'] = train['Win'].astype(int)


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1,25.518519,56.481481,6.481481,19.962963,13.148148,18.666667,9.037037,23.222222,13.888889,...,18.137931,9.172414,26.034483,14.586207,13.965517,5.586207,3.586207,18.724138,-6.034483,0.344828
1101,1115,1,21.428571,52.928571,7.250000,19.178571,10.892857,14.964286,8.250000,19.642857,11.857143,...,24.156250,12.250000,23.250000,13.000000,16.187500,7.500000,2.718750,17.062500,-7.125000,0.375000
1101,1116,0,25.353636,59.185455,6.470909,18.569091,15.126364,21.245455,8.797273,22.140909,14.390909,...,19.875000,8.531250,23.895833,12.703125,12.729167,6.494792,3.250000,13.817708,3.833333,0.640625
1101,1117,0,25.292593,55.824074,6.824074,19.498148,13.324074,18.716667,8.951852,22.911111,14.161111,...,22.833333,10.300000,25.516667,14.050000,12.750000,5.816667,3.666667,19.300000,-6.250000,0.333333
1101,1122,1,26.480000,59.280000,7.760000,21.320000,13.480000,19.400000,9.640000,22.600000,17.600000,...,17.800000,8.080000,21.520000,11.800000,15.120000,7.720000,2.520000,4.080000,1.560000,0.480000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1384,1,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,19.590909,9.363636,22.863636,13.681818,11.045455,7.136364,4.272727,16.727273,-10.318182,0.272727
1478,1437,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,15.791667,6.875000,23.291667,13.166667,10.541667,5.666667,4.166667,15.000000,5.916667,0.541667
1478,1447,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,15.600000,7.800000,24.700000,14.050000,9.950000,4.750000,3.250000,13.750000,-2.400000,0.450000
1478,1467,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,18.913043,11.304348,24.478261,13.478261,15.478261,7.086957,3.521739,16.652174,2.000000,0.565217


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1.0,25.518519,56.481481,6.481481,19.962963,13.148148,18.666667,9.037037,23.222222,13.888889,...,18.137931,9.172414,26.034483,14.586207,13.965517,5.586207,3.586207,18.724138,-6.034483,0.344828
1101,1103,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,19.477682,10.811157,22.914088,11.718818,13.376760,6.207280,2.720456,18.263107,4.756395,0.647229
1101,1104,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,19.628514,11.391395,22.776503,11.733135,13.030160,6.455794,3.669305,18.168998,4.540364,0.602638
1101,1105,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,22.729740,12.171504,25.796000,13.277825,14.762687,7.748847,3.745060,18.209819,-6.540094,0.319781
1101,1106,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,22.405766,10.944926,24.684299,12.345102,14.026895,7.166614,3.548025,18.176961,-4.119900,0.403574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,20.475705,8.764107,25.474138,13.789185,11.738245,6.797022,3.335423,17.122257,-9.496865,0.251567
1478,1474,,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,17.453333,8.690000,24.883333,13.293333,10.606667,6.473333,4.483333,17.670000,-3.430000,0.410000
1478,1475,,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,25.126959,7.798589,23.658307,12.538401,11.304075,6.796238,3.797022,18.758621,-4.670063,0.315047
1478,1476,1.0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,18.520000,10.200000,26.960000,17.120000,11.680000,8.040000,4.280000,14.080000,-15.120000,0.080000


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,1,21.720000,57.600000,7.280000,26.040000,14.320000,20.440000,15.320000,24.560000,13.040000,...,21.933333,16.000000,31.666667,17.666667,14.400000,9.133333,4.333333,17.266667,-23.500000,0.066667
3101,3106,1,25.555556,58.000000,8.296296,24.444444,10.444444,15.111111,9.666667,22.000000,13.925926,...,20.033333,12.233333,21.700000,12.500000,15.566667,8.800000,4.300000,19.066667,-7.500000,0.500000
3101,3114,1,24.269231,57.423077,8.730769,25.076923,13.653846,18.192308,9.653846,20.076923,15.076923,...,18.360000,8.480000,23.000000,9.760000,16.520000,6.080000,2.440000,16.840000,1.280000,0.600000
3101,3116,0,23.893899,54.519231,8.144562,24.625332,12.529178,16.830239,11.941645,23.077586,14.740053,...,17.360057,13.474383,30.582068,14.612903,15.834440,5.896584,3.833966,16.904649,-1.771347,0.503795
3101,3117,1,24.269231,57.423077,8.730769,25.076923,13.653846,18.192308,9.653846,20.076923,15.076923,...,21.200000,11.000000,25.520000,15.680000,15.680000,7.720000,2.960000,18.480000,0.040000,0.360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3425,0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,14.818182,7.818182,22.227273,12.500000,15.500000,6.000000,2.545455,18.272727,13.636364,0.818182
3478,3433,0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,13.166667,6.833333,18.750000,9.166667,13.416667,6.375000,2.541667,17.083333,9.875000,0.833333
3478,3447,1,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,18.850000,11.550000,23.700000,15.600000,17.100000,9.450000,2.800000,17.700000,-15.300000,0.250000
3478,3467,1,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,18.086957,10.000000,22.347826,12.043478,16.217391,8.782609,3.782609,18.130435,-4.130435,0.347826


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,1.0,21.720000,57.600000,7.280000,26.040000,14.320000,20.440000,15.320000,24.560000,13.040000,...,21.933333,16.000000,31.666667,17.666667,14.400000,9.133333,4.333333,17.266667,-23.500000,0.066667
3101,3103,,24.106831,56.689811,8.243338,25.274662,13.325945,18.507925,11.928763,23.537225,14.138603,...,18.244050,12.255627,25.311055,13.357218,15.505805,8.000830,2.873861,17.908876,0.000095,0.501941
3101,3104,,24.106831,56.689811,8.243338,25.274662,13.325945,18.507925,11.928763,23.537225,14.138603,...,17.449215,12.279338,25.188613,12.507329,17.362126,7.600154,3.829241,18.453844,2.548561,0.526016
3101,3105,,24.106831,56.689811,8.243338,25.274662,13.325945,18.507925,11.928763,23.537225,14.138603,...,20.983834,12.532968,24.624642,12.461237,16.515246,9.081460,3.347263,18.116089,-5.771821,0.383722
3101,3106,1.0,25.555556,58.000000,8.296296,24.444444,10.444444,15.111111,9.666667,22.000000,13.925926,...,20.033333,12.233333,21.700000,12.500000,15.566667,8.800000,4.300000,19.066667,-7.500000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,18.547619,10.081905,23.400952,14.202857,15.383810,8.733333,2.602857,17.485714,-13.200000,0.139048
3478,3474,,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,25.140110,9.640110,26.595238,13.891026,16.590659,7.121795,3.244505,17.670330,-14.283883,0.167582
3478,3475,,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,19.873077,9.759615,22.401923,12.153846,15.778846,9.726923,3.096154,18.338462,-0.155769,0.523077
3478,3476,1.0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,14.347826,11.434783,24.739130,14.217391,14.043478,11.565217,3.434783,16.608696,-17.695652,0.130435


# Model building

In [10]:
def score_dataset(lgbm_params, X, y):
    classifier = lgb.LGBMClassifier(**lgbm_params, class_weight='balanced')
    score      = cross_val_score(classifier, X, y, scoring='neg_brier_score')
    score      = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': 'cpu',
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=50, show_progress_bar=True)

    return study.best_params

In [11]:
def build_x_y(df):
    target_column = 'Win'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [12]:
X_m, y_m = build_x_y(train_m)
X_w, y_w = build_x_y(train_w)

In [13]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [14]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    clf_test = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(clf_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(clf_test.score(X_train, y_train)))

In [15]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Number of positive: 10860, number of negative: 20967
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039697 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14280
[LightGBM] [Info] Number of data points in the train set: 31827, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LightGBM Model accuracy score: 0.7800
LightGBM Model accuracy score [train]: 0.7961
[LightGBM] [Info] Number of positive: 7237, number of negative: 14873
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14280
[LightGBM] [Info] Number of data points in the train set: 22110, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

# Prediction

In [16]:
def build_proba(X, y, tourney, params):
    clf = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf.fit(X, y)
    
    tourney = tourney.drop('Win', axis=1)
    results = tourney

    pred = clf.predict_proba(results)
    results['Probability'] = list(map(lambda x: x[1], pred))

    return results['Probability']

In [17]:
proba_m = build_proba(X_m, y_m, tourney_m, params_m)
proba_w = build_proba(X_w, y_w, tourney_w, params_w)

display(proba_m)
display(proba_w)

[LightGBM] [Info] Number of positive: 15410, number of negative: 30058
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14280
[LightGBM] [Info] Number of data points in the train set: 45468, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 10375, number of negative: 21211
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14280
[LightGBM] [Info] Number of data points in the train set: 31586, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


T1_TeamID  T2_TeamID
1101       1102         0.436352
           1103         0.149712
           1104         0.144192
           1105         0.482132
           1106         0.274666
                          ...   
1478       1473         0.662140
           1474         0.513471
           1475         0.561145
           1476         0.784775
           1477         0.606464
Name: Probability, Length: 142506, dtype: float64

T1_TeamID  T2_TeamID
3101       3102         0.827413
           3103         0.324658
           3104         0.288112
           3105         0.627772
           3106         0.695935
                          ...   
3478       3473         0.735133
           3474         0.710742
           3475         0.317290
           3476         0.802926
           3477         0.346844
Name: Probability, Length: 141000, dtype: float64

In [18]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    slots = slots[slots['Season'] == 2023]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [19]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [20]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [21]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [22]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, proba):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - proba (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]
        
        team_1_prob = proba.loc[team_1, team_2]
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, proba, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, proba)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [23]:
n_brackets = 50000
result_m = run_simulation(seeds_2024_m, slots_m, proba_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, proba_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 50000/50000 [04:48<00:00, 173.11it/s]
100%|██████████| 50000/50000 [04:48<00:00, 173.32it/s]


In [24]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W12
...,...,...,...,...
6299995,W,50000,R4Y1,Y03
6299996,W,50000,R4Z1,Z09
6299997,W,50000,R5WX,X05
6299998,W,50000,R5YZ,Z09


In [25]:
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- https://www.kaggle.com/code/rustyb/paris-madness-2023
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.