In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna as op
import os
import pandas as pd
import seaborn as sns

op.logging.set_verbosity(op.logging.WARNING)

In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

## T1 vs T2

In [4]:
STAT_COLS = ['Score', 'FGM', 'FGA', 'FGA3', 'FGM3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']

def build_tx(gender):
    """
    Build the DF that includes T1 vs T2 and T2 vs T1 matchups.
    Concat two exact same DFs, but in one replace:
    - W => T1
    - L => T2
    and in the other one:
    - W => T2
    - L => T1
    """
    csv_names = ['NCAATourneyDetailedResults', 'RegularSeasonDetailedResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x].copy(), csv_names))

    results_t1 = pd.concat(csvs)
    results_t1 = results_t1.drop(['DayNum', 'NumOT', 'WLoc'], axis=1)
    results_t2 = results_t1.copy()
        
    results_t1.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(results_t1.columns)]
    results_t2.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(results_t2.columns)]
    
    for df in [results_t1, results_t2]:
        for col in STAT_COLS:
            df["{}Diff".format(col)] = df["T1_{}".format(col)] - df["T2_{}".format(col)]
            
    drop_cols = np.array(list(map(lambda x: ["T1_{}".format(x), "T2_{}".format(x)], STAT_COLS))).flatten()
    for df in [results_t1, results_t2]:
        df.drop(drop_cols, axis=1, inplace=True)
    
    results = pd.concat([results_t1, results_t2]).reset_index(drop=True)
    results['Win'] = np.where(results['ScoreDiff'] > 0, 1, 0)

    return results

def build_t1_t2(gender):
    """
    Generate two DFs:
    - One preffixed by T1_
    - One preffixed by T2_
    """

    t1 = build_tx(gender)
    t1 = t1.drop('T2_TeamID', axis=1)
    t1 = t1.groupby(by=['Season', 'T1_TeamID']).mean()
    t2 = t1.copy()
    t2.index.names = ['Season', 'T2_TeamID']

    t1.columns = ["T1_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t1.columns)]
    t2.columns = ["T2_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t2.columns)]
    
    return (t1, t2)

def build_matchups(gender):
    """
    Generate a matchup DF. Each entry has matchups T1 vs each team (T2) as an Index in a 1-N relation.
    """
    teams = CSV["{}Teams".format(gender)].copy()
    teams['T1_TeamID'] = teams['TeamID']
    teams['T2_TeamID'] = [teams['TeamID'].values.tolist() for i in teams.index]
    teams = teams.explode('T2_TeamID')
    teams = teams.groupby(['T1_TeamID', 'T2_TeamID']).sum()
    teams = teams.reset_index()
    teams = teams[['T1_TeamID', 'T2_TeamID']]
    teams = teams[teams['T1_TeamID'] != teams['T2_TeamID']]
    teams = teams.set_index(['T1_TeamID', 'T2_TeamID'])

    return teams

In [5]:
def build_fill(gender):
    """
    Build a DF with the same format as DF, used to fill NaN matchup values with overall AVGs.
    """
    matchups = build_matchups(gender)
    t1, t2   = build_t1_t2(gender)
    
    t1 = t1.groupby('T1_TeamID').mean()
    t2 = t2.groupby('T2_TeamID').mean()
    
    matchups = matchups.join(t1, on=['T1_TeamID'], how='inner')
    matchups = matchups.join(t2, on=['T2_TeamID'], how='inner')
    
    return matchups

In [6]:
def build_tourney(gender):
    """
    Generate a tourney DF. Each entry has matchups T1 vs each team (T2) as an Index in a 1-N relation.
    """
    tourney  = build_tx(gender)[['Season', 'T1_TeamID', 'T2_TeamID' , 'Win']]
    t1, t2   = build_t1_t2(gender)
    matchups = build_matchups(gender)
    fill     = build_fill(gender)
    
    tourney = pd.merge(matchups, tourney, on=['T1_TeamID', 'T2_TeamID'], how='left')
    tourney = pd.merge(tourney, t1, on=['Season', 'T1_TeamID'], how='left')
    tourney = pd.merge(tourney, t2, on=['Season', 'T2_TeamID'], how='left')
    tourney = tourney.groupby(by=['T1_TeamID', 'T2_TeamID']).mean()
    tourney = tourney.drop('Season', axis=1)
    tourney = tourney.fillna(fill)
    
    return tourney

In [7]:
def build_train_tourney(gender):
    """
    Build a train and tourney dataset, Train includes the Win column while Tourney may have NaNs.
    """
    tourney = build_tourney(gender)

    train = tourney[tourney['Win'].notnull()]
    train['Win'] = train['Win'].astype(int)
    
    return (train, tourney)

In [8]:
train_m, tourney_m = build_train_tourney('M')
train_w, tourney_w = build_train_tourney('W')

display(train_m)
display(tourney_m)
display(train_w)
display(tourney_w)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Win'] = train['Win'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Win'] = train['Win'].astype(int)


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_ScoreDiff_Avg,T1_FGMDiff_Avg,T1_FGADiff_Avg,T1_FGA3Diff_Avg,T1_FGM3Diff_Avg,T1_FTMDiff_Avg,T1_FTADiff_Avg,T1_ORDiff_Avg,T1_DRDiff_Avg,...,T2_FTMDiff_Avg,T2_FTADiff_Avg,T2_ORDiff_Avg,T2_DRDiff_Avg,T2_AstDiff_Avg,T2_TODiff_Avg,T2_StlDiff_Avg,T2_BlkDiff_Avg,T2_PFDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1,-2.148148,0.740741,2.185185,0.222222,-0.148148,-3.481481,-4.777778,-0.888889,-1.814815,...,0.275862,-0.206897,0.655172,-3.344828,-0.689655,-1.206897,0.862069,-0.965517,-0.586207,0.344828
1101,1115,1,-10.857143,-3.607143,2.071429,2.464286,1.071429,-4.714286,-7.750000,-1.785714,-6.250000,...,-3.843750,-5.906250,-2.093750,-4.468750,-2.906250,-0.906250,1.343750,0.656250,2.406250,0.375000
1101,1116,0,3.729091,1.711818,5.332727,0.726364,0.805455,-0.500000,-1.160909,0.170000,-3.174545,...,2.859375,3.557292,0.505208,1.166667,0.208333,-0.744792,0.369792,2.348958,-1.416667,0.640625
1101,1117,0,1.642593,1.537037,3.025926,1.127778,0.675926,-2.107407,-2.822222,-0.511111,-1.590741,...,-1.266667,-1.700000,1.700000,-2.166667,-3.350000,0.783333,0.033333,-0.266667,0.766667,0.333333
1101,1122,1,12.640000,5.560000,8.120000,3.680000,2.520000,-1.000000,-1.640000,1.840000,-0.440000,...,1.080000,-0.360000,3.520000,-1.560000,0.800000,-0.960000,-1.320000,-0.640000,0.320000,0.480000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1384,1,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,-2.545455,-1.727273,0.772727,-2.454545,-1.045455,2.409091,-1.272727,-0.545455,0.454545,0.272727
1478,1437,0,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,2.250000,0.833333,1.041667,0.875000,-0.500000,-1.333333,0.083333,-1.916667,0.625000,0.541667
1478,1447,0,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,-0.150000,-0.450000,0.900000,-2.850000,-1.550000,-1.150000,1.300000,-1.200000,1.800000,0.450000
1478,1467,0,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,0.565217,0.608696,-4.869565,-2.608696,-1.652174,-3.043478,2.695652,0.130435,0.217391,0.565217


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_ScoreDiff_Avg,T1_FGMDiff_Avg,T1_FGADiff_Avg,T1_FGA3Diff_Avg,T1_FGM3Diff_Avg,T1_FTMDiff_Avg,T1_FTADiff_Avg,T1_ORDiff_Avg,T1_DRDiff_Avg,...,T2_FTMDiff_Avg,T2_FTADiff_Avg,T2_ORDiff_Avg,T2_DRDiff_Avg,T2_AstDiff_Avg,T2_TODiff_Avg,T2_StlDiff_Avg,T2_BlkDiff_Avg,T2_PFDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1.0,-2.148148,0.740741,2.185185,0.222222,-0.148148,-3.481481,-4.777778,-0.888889,-1.814815,...,0.275862,-0.206897,0.655172,-3.344828,-0.689655,-1.206897,0.862069,-0.965517,-0.586207,0.344828
1101,1103,,-1.971915,-0.261118,3.391589,1.732689,0.673995,-2.123673,-3.326520,-0.617847,-3.471457,...,0.610742,0.816912,-0.318072,0.621924,1.846376,-0.836567,0.090596,0.638046,-0.327118,0.647229
1101,1104,,-1.971915,-0.261118,3.391589,1.732689,0.673995,-2.123673,-3.326520,-0.617847,-3.471457,...,1.271599,1.656728,0.110334,2.221690,1.000534,0.303001,0.271184,0.814513,-0.834161,0.602638
1101,1105,,-1.971915,-0.261118,3.391589,1.732689,0.673995,-2.123673,-3.326520,-0.617847,-3.471457,...,-0.921445,-1.175278,-0.900209,-2.612128,-1.877620,0.360330,-0.826393,-0.160479,0.714274,0.319781
1101,1106,,-1.971915,-0.261118,3.391589,1.732689,0.673995,-2.123673,-3.326520,-0.617847,-3.471457,...,-1.650826,-1.596786,0.959609,-1.507053,-1.215522,0.555386,-0.673183,-0.186138,0.623175,0.403574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,-0.614420,-1.616771,-1.736677,-3.253135,-4.618339,0.596395,-0.826019,-0.423981,0.402821,0.251567
1478,1474,,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,2.383333,3.593333,0.183333,-0.976667,-0.580000,1.063333,-0.166667,-1.910000,-0.803333,0.410000
1478,1475,,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,-3.357367,-4.084639,1.646552,1.255486,-0.997649,1.851881,-0.536834,-1.318182,1.409875,0.315047
1478,1476,1.0,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,-4.600000,-5.320000,-3.480000,-4.360000,-5.640000,1.760000,-1.200000,-2.000000,1.720000,0.080000


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_ScoreDiff_Avg,T1_FGMDiff_Avg,T1_FGADiff_Avg,T1_FGA3Diff_Avg,T1_FGM3Diff_Avg,T1_FTMDiff_Avg,T1_FTADiff_Avg,T1_ORDiff_Avg,T1_DRDiff_Avg,...,T2_FTMDiff_Avg,T2_FTADiff_Avg,T2_ORDiff_Avg,T2_DRDiff_Avg,T2_AstDiff_Avg,T2_TODiff_Avg,T2_StlDiff_Avg,T2_BlkDiff_Avg,T2_PFDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,1,2.400000,-0.720000,-0.280000,7.400000,1.440000,2.400000,3.280000,1.520000,0.120000,...,-3.333333,-5.166667,-7.066667,-8.300000,-5.100000,2.700000,-2.566667,-2.433333,1.666667,0.066667
3101,3106,1,2.333333,0.555556,0.111111,5.481481,2.333333,-1.111111,-0.814815,0.407407,1.333333,...,1.900000,0.500000,-3.500000,-1.233333,-1.166667,1.800000,-1.566667,-0.900000,-1.233333,0.500000
3101,3114,1,0.923077,-2.153846,-0.923077,4.730769,1.769231,3.461538,3.692308,-0.038462,-1.269231,...,-3.640000,-4.560000,1.000000,-0.720000,3.360000,-1.680000,1.360000,0.760000,0.920000,0.600000
3101,3116,0,0.622679,0.037135,-2.770557,6.275199,2.145889,-1.597480,-3.102785,-0.896552,2.641247,...,-0.147059,0.125237,-1.312144,-5.722011,-2.889469,-4.292694,1.800759,-0.199241,-0.468216,0.503795
3101,3117,1,0.923077,-2.153846,-0.923077,4.730769,1.769231,3.461538,3.692308,-0.038462,-1.269231,...,0.320000,-0.680000,0.640000,-1.560000,-2.000000,-0.520000,0.640000,-1.400000,1.440000,0.360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3425,0,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,2.227273,3.590909,1.863636,0.454545,2.681818,-3.909091,2.409091,3.136364,-2.545455,0.818182
3478,3433,0,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,3.333333,4.791667,3.875000,4.708333,1.166667,1.250000,1.416667,-0.041667,-1.791667,0.833333
3478,3447,1,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,-1.450000,-1.650000,-2.950000,-4.100000,-6.850000,2.400000,-0.750000,-0.450000,1.500000,0.250000
3478,3467,1,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,-2.173913,-2.695652,1.565217,0.434783,-1.130435,1.217391,-0.695652,-0.956522,1.782609,0.347826


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_ScoreDiff_Avg,T1_FGMDiff_Avg,T1_FGADiff_Avg,T1_FGA3Diff_Avg,T1_FGM3Diff_Avg,T1_FTMDiff_Avg,T1_FTADiff_Avg,T1_ORDiff_Avg,T1_DRDiff_Avg,...,T2_FTMDiff_Avg,T2_FTADiff_Avg,T2_ORDiff_Avg,T2_DRDiff_Avg,T2_AstDiff_Avg,T2_TODiff_Avg,T2_StlDiff_Avg,T2_BlkDiff_Avg,T2_PFDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,1.0,2.400000,-0.720000,-0.280000,7.400000,1.440000,2.400000,3.280000,1.520000,0.120000,...,-3.333333,-5.166667,-7.066667,-8.300000,-5.100000,2.700000,-2.566667,-2.433333,1.666667,0.066667
3101,3103,,2.816111,-0.374313,-2.217280,6.474400,2.365108,1.199630,1.189362,-0.061943,1.995264,...,0.038527,-0.569442,-1.406384,0.455952,0.909066,0.500817,-1.193005,0.071791,-1.097636,0.501941
3101,3104,,2.816111,-0.374313,-2.217280,6.474400,2.365108,1.199630,1.189362,-0.061943,1.995264,...,1.589303,2.502254,0.338207,-0.285495,-0.510218,-1.193773,0.743539,-0.325337,-1.371572,0.526016
3101,3105,,2.816111,-0.374313,-2.217280,6.474400,2.365108,1.199630,1.189362,-0.061943,1.995264,...,-0.981104,-1.336458,0.104754,-0.591981,-1.684919,1.766067,-1.603929,-0.458485,0.133792,0.383722
3101,3106,1.0,2.333333,0.555556,0.111111,5.481481,2.333333,-1.111111,-0.814815,0.407407,1.333333,...,1.900000,0.500000,-3.500000,-1.233333,-1.166667,1.800000,-1.566667,-0.900000,-1.233333,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,-3.345714,-2.943810,-2.354286,-5.114286,-2.477143,1.137143,-1.454286,-0.571429,1.081905,0.139048
3478,3474,,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,-5.656593,-7.156593,-0.218864,-6.183150,-6.788462,-0.489927,0.779304,-0.904762,4.640110,0.167582
3478,3475,,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,-1.263462,-2.576923,-0.634615,2.882692,0.871154,2.357692,-2.407692,-0.517308,0.655769,0.523077
3478,3476,1.0,-8.956522,-3.956522,-0.478261,-4.521739,-2.260870,1.217391,0.217391,-1.913043,-3.869565,...,-0.130435,-1.434783,-2.043478,-3.173913,-3.391304,5.391304,-7.130435,-1.304348,-2.304348,0.130435


### Feature analysis

In [9]:
train_m

Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_ScoreDiff_Avg,T1_FGMDiff_Avg,T1_FGADiff_Avg,T1_FGA3Diff_Avg,T1_FGM3Diff_Avg,T1_FTMDiff_Avg,T1_FTADiff_Avg,T1_ORDiff_Avg,T1_DRDiff_Avg,...,T2_FTMDiff_Avg,T2_FTADiff_Avg,T2_ORDiff_Avg,T2_DRDiff_Avg,T2_AstDiff_Avg,T2_TODiff_Avg,T2_StlDiff_Avg,T2_BlkDiff_Avg,T2_PFDiff_Avg,T2_Win_Avg
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1,-2.148148,0.740741,2.185185,0.222222,-0.148148,-3.481481,-4.777778,-0.888889,-1.814815,...,0.275862,-0.206897,0.655172,-3.344828,-0.689655,-1.206897,0.862069,-0.965517,-0.586207,0.344828
1101,1115,1,-10.857143,-3.607143,2.071429,2.464286,1.071429,-4.714286,-7.750000,-1.785714,-6.250000,...,-3.843750,-5.906250,-2.093750,-4.468750,-2.906250,-0.906250,1.343750,0.656250,2.406250,0.375000
1101,1116,0,3.729091,1.711818,5.332727,0.726364,0.805455,-0.500000,-1.160909,0.170000,-3.174545,...,2.859375,3.557292,0.505208,1.166667,0.208333,-0.744792,0.369792,2.348958,-1.416667,0.640625
1101,1117,0,1.642593,1.537037,3.025926,1.127778,0.675926,-2.107407,-2.822222,-0.511111,-1.590741,...,-1.266667,-1.700000,1.700000,-2.166667,-3.350000,0.783333,0.033333,-0.266667,0.766667,0.333333
1101,1122,1,12.640000,5.560000,8.120000,3.680000,2.520000,-1.000000,-1.640000,1.840000,-0.440000,...,1.080000,-0.360000,3.520000,-1.560000,0.800000,-0.960000,-1.320000,-0.640000,0.320000,0.480000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1384,1,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,-2.545455,-1.727273,0.772727,-2.454545,-1.045455,2.409091,-1.272727,-0.545455,0.454545,0.272727
1478,1437,0,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,2.250000,0.833333,1.041667,0.875000,-0.500000,-1.333333,0.083333,-1.916667,0.625000,0.541667
1478,1447,0,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,-0.150000,-0.450000,0.900000,-2.850000,-1.550000,-1.150000,1.300000,-1.200000,1.800000,0.450000
1478,1467,0,-3.619048,-1.857143,0.333333,1.809524,1.000000,-0.904762,-2.571429,-2.000000,-3.333333,...,0.565217,0.608696,-4.869565,-2.608696,-1.652174,-3.043478,2.695652,0.130435,0.217391,0.565217


In [10]:
corr = train_m.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Win,T1_ScoreDiff_Avg,T1_FGMDiff_Avg,T1_FGADiff_Avg,T1_FGA3Diff_Avg,T1_FGM3Diff_Avg,T1_FTMDiff_Avg,T1_FTADiff_Avg,T1_ORDiff_Avg,T1_DRDiff_Avg,T1_AstDiff_Avg,T1_TODiff_Avg,T1_StlDiff_Avg,T1_BlkDiff_Avg,T1_PFDiff_Avg,T1_Win_Avg,T2_ScoreDiff_Avg,T2_FGMDiff_Avg,T2_FGADiff_Avg,T2_FGA3Diff_Avg,T2_FGM3Diff_Avg,T2_FTMDiff_Avg,T2_FTADiff_Avg,T2_ORDiff_Avg,T2_DRDiff_Avg,T2_AstDiff_Avg,T2_TODiff_Avg,T2_StlDiff_Avg,T2_BlkDiff_Avg,T2_PFDiff_Avg,T2_Win_Avg
Win,1.0,0.374823,0.334843,0.058333,0.024714,0.1365,0.250307,0.240965,0.115199,0.285419,0.30576,-0.178193,0.161417,0.203639,-0.222126,0.359697,-0.342953,-0.307191,-0.053596,-0.026025,-0.12904,-0.225029,-0.216696,-0.106452,-0.262886,-0.281543,0.160434,-0.144005,-0.181864,0.199338,-0.327168
T1_ScoreDiff_Avg,0.374823,1.0,0.911575,0.218228,0.130479,0.434933,0.591305,0.553313,0.320562,0.74005,0.823449,-0.490176,0.465474,0.473525,-0.494303,0.953538,0.052441,0.045828,0.001359,-0.004735,0.012209,0.04099,0.038101,0.012037,0.042589,0.034521,-0.023394,0.026487,0.038441,-0.035558,0.067342
T1_FGMDiff_Avg,0.334843,0.911575,1.0,0.422664,0.001616,0.279364,0.276887,0.263212,0.382772,0.673991,0.794647,-0.501034,0.50503,0.47065,-0.212384,0.852533,0.045828,0.031829,-0.010928,0.001246,0.017164,0.04811,0.045357,0.005799,0.0378,0.028262,-0.016178,0.01737,0.033047,-0.042078,0.059882
T1_FGADiff_Avg,0.058333,0.218228,0.422664,1.0,0.254519,0.185146,-0.359692,-0.363847,0.593004,-0.215997,0.168712,-0.657398,0.599911,-0.082345,0.399139,0.170009,0.001359,-0.010928,-0.049188,-0.004496,0.002803,0.023266,0.023148,-0.028309,0.018304,0.00167,0.028981,-0.024621,0.018039,-0.02392,0.006092
T1_FGA3Diff_Avg,0.024714,0.130479,0.001616,0.254519,1.0,0.889083,-0.175065,-0.211147,-0.017227,-0.224127,0.222854,-0.242331,0.149693,0.023882,0.156186,0.109488,-0.004735,0.001246,-0.004496,-0.04651,-0.041789,0.009313,0.010751,0.008487,0.008856,-0.011221,0.006236,0.000527,-0.00062,-0.008144,-0.003637
T1_FGM3Diff_Avg,0.1365,0.434933,0.279364,0.185146,0.889083,1.0,0.016971,-0.037384,-0.028508,0.072625,0.480429,-0.299244,0.21109,0.147931,-0.004925,0.408237,0.012209,0.017164,0.002803,-0.041789,-0.037747,0.020334,0.021711,0.019232,0.021086,-6.1e-05,-0.003492,0.01115,0.011664,-0.017218,0.017481
T1_FTMDiff_Avg,0.250307,0.591305,0.276887,-0.359692,-0.175065,0.016971,1.0,0.958511,0.109926,0.582891,0.33034,-0.134336,0.112742,0.237445,-0.878539,0.600147,0.04099,0.04811,0.023266,0.009313,0.020334,0.001823,-0.001174,0.009122,0.025698,0.035382,-0.02774,0.029096,0.029545,-0.001153,0.049583
T1_FTADiff_Avg,0.240965,0.553313,0.263212,-0.363847,-0.211147,-0.037384,0.958511,1.0,0.165663,0.561576,0.30664,-0.1081,0.102499,0.249869,-0.912039,0.562995,0.038101,0.045357,0.023148,0.010751,0.021711,-0.001174,-0.008486,0.001272,0.021239,0.032251,-0.029283,0.029029,0.026899,0.004937,0.046349
T1_ORDiff_Avg,0.115199,0.320562,0.382772,0.593004,-0.017227,-0.028508,0.109926,0.165663,1.0,0.335972,0.213947,-0.109206,0.142103,-0.007209,-0.081903,0.281544,0.012037,0.005799,-0.028309,0.008487,0.019232,0.009122,0.001272,-0.053491,0.000875,0.011441,-0.008597,0.005943,0.019156,-0.004093,0.018806
T1_DRDiff_Avg,0.285419,0.74005,0.673991,-0.215997,-0.224127,0.072625,0.582891,0.561576,0.335972,1.0,0.643864,0.174126,-0.09262,0.453677,-0.506643,0.707943,0.042589,0.0378,0.018304,0.008856,0.021086,0.025698,0.021239,0.000875,0.012893,0.027185,-0.045016,0.040681,0.020938,-0.01861,0.053816


In [11]:
corr = train_m.corr()['Win'].sort_values(ascending=False)
high_corr = corr[[abs(corr) > 0.3 for corr in corr]]

display(high_corr)

Win                 1.000000
T1_ScoreDiff_Avg    0.374823
T1_Win_Avg          0.359697
T1_FGMDiff_Avg      0.334843
T1_AstDiff_Avg      0.305760
T2_FGMDiff_Avg     -0.307191
T2_Win_Avg         -0.327168
T2_ScoreDiff_Avg   -0.342953
Name: Win, dtype: float64

In [12]:
features = high_corr.index.tolist()[1:]

features

['T1_ScoreDiff_Avg',
 'T1_Win_Avg',
 'T1_FGMDiff_Avg',
 'T1_AstDiff_Avg',
 'T2_FGMDiff_Avg',
 'T2_Win_Avg',
 'T2_ScoreDiff_Avg']

# Model building

In [13]:
def score_dataset(lgbm_params, X, y):
    classifier = lgb.LGBMClassifier(**lgbm_params, class_weight='balanced')
    score      = cross_val_score(classifier, X, y, scoring='neg_brier_score')
    score      = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': 'cpu',
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=10, show_progress_bar=True)

    return study.best_params

In [14]:
def build_x_y(df, features):
    target_column = 'Win'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [15]:
X_m, y_m = build_x_y(train_m, features)
X_w, y_w = build_x_y(train_w, features)

In [16]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [17]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    clf_test = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(clf_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(clf_test.score(X_train, y_train)))

In [18]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Number of positive: 10795, number of negative: 21032
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 31827, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Model accuracy score: 0.7735
LightGBM Model accuracy score [train]: 0.7834
[LightGBM] [Info] Number of positive: 7250, number of negative: 14860
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 22110, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> inits

# Prediction

In [19]:
def build_proba(X, y, tourney, params):
    clf = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf.fit(X, y)
    
    tourney = tourney.drop('Win', axis=1)
    results = tourney

    pred = clf.predict_proba(results)
    results['Probability'] = list(map(lambda x: x[1], pred))

    return results['Probability']

In [20]:
proba_m = build_proba(X_m, y_m, tourney_m, params_m)
proba_w = build_proba(X_w, y_w, tourney_w, params_w)

display(proba_m)
display(proba_w)

[LightGBM] [Info] Number of positive: 15410, number of negative: 30058
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 45468, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 10375, number of negative: 21211
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 31586, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


T1_TeamID  T2_TeamID
1101       1102         0.597260
           1103         0.189942
           1104         0.188311
           1105         0.574913
           1106         0.396506
                          ...   
1478       1473         0.600997
           1474         0.319016
           1475         0.420274
           1476         0.759745
           1477         0.534735
Name: Probability, Length: 142506, dtype: float64

T1_TeamID  T2_TeamID
3101       3102         0.806546
           3103         0.397074
           3104         0.379069
           3105         0.522223
           3106         0.545982
                          ...   
3478       3473         0.707696
           3474         0.693566
           3475         0.313446
           3476         0.755884
           3477         0.380864
Name: Probability, Length: 141000, dtype: float64

In [21]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    slots = slots[slots['Season'] == 2023]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [22]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [23]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [24]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [25]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, proba):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - proba (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]
        
        team_1_prob = proba.loc[team_1, team_2]
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, proba, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, proba)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [26]:
n_brackets = 10000
result_m = run_simulation(seeds_2024_m, slots_m, proba_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, proba_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 10000/10000 [00:40<00:00, 247.95it/s]
100%|██████████| 10000/10000 [00:40<00:00, 245.66it/s]


In [27]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W15
2,M,1,R1W3,W14
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
1259995,W,10000,R4Y1,Y03
1259996,W,10000,R4Z1,Z02
1259997,W,10000,R5WX,W10
1259998,W,10000,R5YZ,Z02


In [28]:
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- https://www.kaggle.com/code/rustyb/paris-madness-2023
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.