In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna as op
import os
import pandas as pd
import seaborn as sns

op.logging.set_verbosity(op.logging.WARNING)

In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

## V3

In [4]:
regular_m = pd.concat([CSV['MNCAATourneyCompactResults'], CSV['MRegularSeasonCompactResults']], copy=True)

In [5]:
def build_regular(gender):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x].copy(), csv_names))
    
    return pd.concat(csvs)

In [6]:
def build_points(gender):
    regular = build_regular(gender)
    
    points = regular[['Season', 'WTeamID', 'LTeamID', 'WScore', 'LScore']]
    points['ScoreMargin'] = points['WScore'] - points['LScore']

    win_points = points.drop(['LTeamID'], axis=1)
    win_points = win_points.groupby(['Season', 'WTeamID']).mean().reset_index()
    win_points = win_points.rename(columns={'WTeamID': 'TeamID', 'WScore': 'PtsScoredWinAvg', 'LScore': 'PtsAllowedWinAvg', 'ScoreMargin': 'WinMarginAvg'}).set_index(['Season', 'TeamID'])

    lose_points = points.drop(['WTeamID'], axis=1)
    lose_points = lose_points.groupby(['Season', 'LTeamID']).mean().reset_index()
    lose_points = lose_points.rename(columns={'LTeamID': 'TeamID', 'WScore': 'PtsAllowedLoseAvg', 'LScore': 'PtsScoredLoseAvg', 'ScoreMargin': 'LoseMarginAvg'}).set_index(['Season', 'TeamID'])

    points = pd.merge(win_points, lose_points, on=['Season', 'TeamID'], how='outer').fillna(0)

    return points

In [7]:
def build_win_pct(gender):
    regular = build_regular(gender)

    win_count = regular[['Season', 'WTeamID']]
    win_count['Wins'] = 1
    win_count = win_count.groupby(['Season', 'WTeamID']).count().reset_index()
    win_count = win_count.rename(columns={'WTeamID': 'TeamID'}).set_index(['Season', 'TeamID'])

    lose_count =  regular[['Season', 'LTeamID']]
    lose_count['Loses'] = 1
    lose_count = lose_count.groupby(['Season', 'LTeamID']).count().reset_index()
    lose_count = lose_count.rename(columns={'LTeamID': 'TeamID'}).set_index(['Season', 'TeamID'])

    win_pct = pd.merge(win_count, lose_count, on=['Season', 'TeamID'], how='outer').fillna(0)
    win_pct['Games'] = win_pct['Wins'] + win_pct['Loses']
    win_pct['Games'] = win_pct['Games'].astype(int)
    win_pct['WinPct'] = win_pct['Wins'] * 100.0 / win_pct['Games']
    win_pct = win_pct.drop(['Wins', 'Loses'], axis=1)

    return win_pct

In [8]:
def clean_seeds(seed):
    res = seed[1:]

    if len(res) > 2:
        res = res[:-1]

    return int(res)

def build_seeds(gender):
    seeds = CSV["{}NCAATourneySeeds".format(gender)].copy()
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)

    return seeds

In [9]:
def build_seed_diff(gender):
    regular = build_regular(gender)
    seeds   = build_seeds(gender)

    seed_diff = pd.merge(regular[['Season', 'WTeamID', 'LTeamID']], seeds, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'], how='left').fillna(0)
    seed_diff = seed_diff.drop('TeamID', axis=1)
    seed_diff = seed_diff.rename(columns={'Seed': 'WinSeed'})
    seed_diff = pd.merge(seed_diff, seeds, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], how='left').fillna(0)
    seed_diff = seed_diff.drop('TeamID', axis=1)
    seed_diff = seed_diff.rename(columns={'Seed': 'LoseSeed'})
    seed_diff['SeedDiff'] = (seed_diff['WinSeed'] - seed_diff['LoseSeed']).astype(int)
    seed_diff = seed_diff.drop(['WinSeed', 'LoseSeed'], axis=1)

    w_seed_diff = seed_diff.drop('LTeamID', axis=1)
    w_seed_diff = w_seed_diff.groupby(['Season', 'WTeamID']).mean().reset_index()
    w_seed_diff = w_seed_diff.rename(columns={'WTeamID': 'TeamID', 'SeedDiff': 'SeedDiffWinAvg'}).set_index(['Season', 'TeamID'])

    l_seed_diff = seed_diff.drop('WTeamID', axis=1)
    l_seed_diff = l_seed_diff.groupby(['Season', 'LTeamID']).mean().reset_index()
    l_seed_diff = l_seed_diff.rename(columns={'LTeamID': 'TeamID', 'SeedDiff': 'SeedDiffLoseAvg'}).set_index(['Season', 'TeamID'])

    seed_diff = pd.merge(l_seed_diff, w_seed_diff, on=['Season', 'TeamID'], how='outer').fillna(0)

    return seed_diff

In [10]:
def build_rankings():
    regular = build_regular('M')    
    rankings = CSV['MMasseyOrdinals'].copy()
    rankings = rankings[['Season', 'TeamID', 'OrdinalRank']]
    rankings = rankings.groupby(['Season', 'TeamID']).mean().reset_index()

    rankings_diff = pd.merge(regular[['Season', 'WTeamID', 'LTeamID']], rankings, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'], how='left').fillna(0)
    rankings_diff = rankings_diff.drop('TeamID', axis=1)
    rankings_diff = rankings_diff.rename(columns={'OrdinalRank': 'WinRank'})
    rankings_diff = pd.merge(rankings_diff, rankings, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], how='left').fillna(0)
    rankings_diff = rankings_diff.drop('TeamID', axis=1)
    rankings_diff = rankings_diff.rename(columns={'OrdinalRank': 'LoseRank'})
    rankings_diff['RankingDiff'] = rankings_diff['WinRank'] - rankings_diff['LoseRank']
    rankings_diff = rankings_diff.drop(['WinRank', 'LoseRank'], axis=1)

    w_rank_diff = rankings_diff.drop('LTeamID', axis=1)
    w_rank_diff = w_rank_diff.groupby(['Season', 'WTeamID']).mean().reset_index()
    w_rank_diff = w_rank_diff.rename(columns={'WTeamID': 'TeamID', 'RankingDiff': 'WinRankDiffAvg'}).set_index(['Season', 'TeamID'])

    l_rank_diff = rankings_diff.drop('WTeamID', axis=1)
    l_rank_diff = l_rank_diff.groupby(['Season', 'LTeamID']).mean().reset_index()
    l_rank_diff = l_rank_diff.rename(columns={'LTeamID': 'TeamID', 'RankingDiff': 'LoseRankDiffAvg'}).set_index(['Season', 'TeamID'])

    rank_diff = pd.merge(w_rank_diff, l_rank_diff,  on=['Season', 'TeamID'], how='outer')

    return rank_diff

In [11]:
def build_df(gender):
    df = pd.merge(build_points(gender), build_win_pct(gender), on=['Season', 'TeamID'])
    df = pd.merge(df, build_seed_diff(gender), on=['Season', 'TeamID'])
    
    if gender == 'M':
        df = pd.merge(df, build_rankings(), on=['Season', 'TeamID'])

    return df

In [12]:
def build_matchups(gender):
    """
    Generate a matchup DF. Each entry has matchups T1 vs each team (T2) as an Index in a 1-N relation.
    """
    teams = CSV["{}Teams".format(gender)].copy()
    teams['T1_TeamID'] = teams['TeamID']
    teams['T2_TeamID'] = [teams['TeamID'].values.tolist() for i in teams.index]
    teams = teams.explode('T2_TeamID')
    teams = teams.groupby(['T1_TeamID', 'T2_TeamID']).sum()
    teams = teams.reset_index()
    teams = teams[['T1_TeamID', 'T2_TeamID']]
    teams = teams[teams['T1_TeamID'] != teams['T2_TeamID']]
    teams = teams.set_index(['T1_TeamID', 'T2_TeamID'])

    return teams

In [13]:
def build_fill(gender):
    df = build_df(gender).reset_index()
    
    t1 = df.rename(columns={'TeamID': 'T1_TeamID'}).groupby('T1_TeamID').mean().reset_index().drop('Season', axis=1)
    t2 = df.rename(columns={'TeamID': 'T2_TeamID'}).groupby('T2_TeamID').mean().reset_index().drop('Season', axis=1)
    
    matchups = pd.merge(t1, t2, how='cross', suffixes=('_T1', '_T2'))
    matchups = matchups[matchups['T1_TeamID'] != matchups['T2_TeamID']]
    matchups = matchups.groupby(['T1_TeamID', 'T2_TeamID']).mean()
    
    return matchups

In [14]:
def build_train(gender):
    regular  = build_regular(gender)
    df       = build_df(gender)
    matchups = build_matchups(gender)
    fill     = build_fill(gender)
    
    train_w = pd.merge(regular[['Season', 'WTeamID', 'LTeamID']], df, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'], how='inner')
    train_w = pd.merge(train_w, df, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], suffixes=('_T1', '_T2'), how='inner')
    train_w['Win'] = 1
    train_w = train_w.rename(columns={'WTeamID': 'T1_TeamID', 'LTeamID': 'T2_TeamID'})
    train_w = train_w.groupby(['T1_TeamID', 'T2_TeamID']).mean()
    train_w = train_w.drop('Season', axis=1)

    train_l = pd.merge(regular[['Season', 'WTeamID', 'LTeamID']], df, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'], how='inner')
    train_l = pd.merge(train_l, df, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], suffixes=('_T2', '_T1'), how='inner')
    train_l['Win'] = 0
    train_l = train_l.rename(columns={'WTeamID': 'T2_TeamID', 'LTeamID': 'T1_TeamID'})
    train_l = train_l.groupby(['T1_TeamID', 'T2_TeamID']).mean()
    train_l = train_l.drop('Season', axis=1)

    train = pd.concat([train_w, train_l])
    train = train.groupby(['T1_TeamID', 'T2_TeamID']).mean()
    train['Win'] = train['Win'].astype(int)

    tourney = pd.merge(matchups, train.drop('Win', axis=1), on=['T1_TeamID', 'T2_TeamID'], how='left', validate='one_to_one')
    tourney = tourney.fillna(fill)

    return (train, tourney)

In [15]:
train_m, tourney_m = build_train('M')
train_w, tourney_w = build_train('W')

display(train_m)
display(tourney_m)
display(train_w)
display(tourney_w)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  points['ScoreMargin'] = points['WScore'] - points['LScore']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  win_count['Wins'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lose_count['Loses'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

Unnamed: 0_level_0,Unnamed: 1_level_0,PtsScoredWinAvg_T1,PtsAllowedWinAvg_T1,WinMarginAvg_T1,PtsAllowedLoseAvg_T1,PtsScoredLoseAvg_T1,LoseMarginAvg_T1,Games_T1,WinPct_T1,SeedDiffLoseAvg_T1,SeedDiffWinAvg_T1,...,PtsAllowedLoseAvg_T2,PtsScoredLoseAvg_T2,LoseMarginAvg_T2,Games_T2,WinPct_T2,SeedDiffLoseAvg_T2,SeedDiffWinAvg_T2,WinRankDiffAvg_T2,LoseRankDiffAvg_T2,Win
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,76.250000,68.916667,7.333333,75.933333,66.200000,9.733333,27.0,44.444444,4.066667,0.000,...,75.473684,61.473684,14.000000,29.0,34.482759,1.526316,0.000000,26.732567,-129.172837,1
1101,1115,77.428571,65.285714,12.142857,74.047619,55.523810,18.523810,28.0,25.000000,1.095238,0.000,...,71.800000,54.200000,17.600000,32.0,37.500000,1.750000,-1.250000,10.602411,-115.131050,1
1101,1116,76.961364,63.881818,13.079545,74.766667,63.866667,10.900000,25.5,61.153846,-5.000000,6.925,...,82.178571,68.928571,13.250000,30.0,64.062500,1.142857,-0.740000,-74.547813,-15.086532,0
1101,1117,74.885870,65.458333,9.427536,76.180952,64.028571,12.152381,28.5,60.555556,-5.109524,7.500,...,81.482456,69.162907,12.319549,30.0,33.333333,2.046366,-0.727273,38.735036,-89.913359,0
1101,1122,77.650000,59.400000,18.250000,70.200000,60.400000,9.800000,25.0,80.000000,-10.000000,13.850,...,78.384615,71.461538,6.923077,25.0,48.000000,2.153846,0.000000,-63.773982,6.034009,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1384,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.0,36.000000,0.000000,0.000,...,75.450000,60.850000,14.600000,26.0,23.076923,0.000000,0.000000,32.071996,-97.385938,1
1478,1437,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.0,36.000000,0.000000,0.000,...,74.500000,65.916667,8.583333,28.0,57.142857,0.000000,0.000000,-81.967964,17.021124,0
1478,1447,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.0,36.000000,0.000000,0.000,...,69.384615,58.153846,11.230769,24.0,45.833333,0.000000,0.000000,-19.998373,-60.153434,0
1478,1467,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.0,36.000000,0.000000,0.000,...,71.900000,60.900000,11.000000,26.0,61.538462,0.000000,0.000000,-83.711648,-56.445427,0


Unnamed: 0_level_0,Unnamed: 1_level_0,PtsScoredWinAvg_T1,PtsAllowedWinAvg_T1,WinMarginAvg_T1,PtsAllowedLoseAvg_T1,PtsScoredLoseAvg_T1,LoseMarginAvg_T1,Games_T1,WinPct_T1,SeedDiffLoseAvg_T1,SeedDiffWinAvg_T1,...,WinMarginAvg_T2,PtsAllowedLoseAvg_T2,PtsScoredLoseAvg_T2,LoseMarginAvg_T2,Games_T2,WinPct_T2,SeedDiffLoseAvg_T2,SeedDiffWinAvg_T2,WinRankDiffAvg_T2,LoseRankDiffAvg_T2
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,76.250000,68.916667,7.333333,75.933333,66.200000,9.733333,27.000000,44.444444,4.066667,0.000000,...,9.100000,75.473684,61.473684,14.000000,29.00,34.482759,1.526316,0.000000,26.732567,-129.172837
1101,1103,76.954168,67.052067,9.902100,76.078860,63.809374,12.269485,26.454545,46.060712,-0.988057,2.394125,...,12.145942,75.245408,65.201738,10.043670,28.70,57.057301,0.588205,0.753568,-47.684345,3.773485
1101,1104,76.954168,67.052067,9.902100,76.078860,63.809374,12.269485,26.454545,46.060712,-0.988057,2.394125,...,13.353805,75.734963,65.762289,9.972674,31.80,62.252951,0.571426,0.869665,-36.618218,-2.464473
1101,1105,76.954168,67.052067,9.902100,76.078860,63.809374,12.269485,26.454545,46.060712,-0.988057,2.394125,...,9.875080,75.616150,61.570149,14.046001,26.92,35.453936,1.449170,0.131826,3.976105,-74.842238
1101,1106,76.954168,67.052067,9.902100,76.078860,63.809374,12.269485,26.454545,46.060712,-0.988057,2.394125,...,9.435446,79.761550,66.209452,13.552098,27.10,42.003902,0.799855,0.903400,-6.357682,-47.767463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.000000,36.000000,0.000000,0.000000,...,6.900000,77.404762,62.023810,15.380952,27.50,23.408488,1.142857,-1.000000,23.289243,-105.413600
1478,1474,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.000000,36.000000,0.000000,0.000000,...,8.083333,83.644444,73.027778,10.616667,29.00,42.857143,0.933333,-1.000000,-8.792207,-50.389593
1478,1475,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.000000,36.000000,0.000000,0.000000,...,8.807692,79.360119,68.633929,10.726190,27.50,32.029178,0.718750,-0.615385,25.712445,-71.386000
1478,1476,80.222222,64.333333,15.888889,78.625000,64.375000,14.250000,25.000000,36.000000,0.000000,0.000000,...,10.666667,79.600000,63.120000,16.480000,28.00,10.714286,0.000000,0.000000,28.117345,-129.519175


Unnamed: 0_level_0,Unnamed: 1_level_0,PtsScoredWinAvg_T1,PtsAllowedWinAvg_T1,WinMarginAvg_T1,PtsAllowedLoseAvg_T1,PtsScoredLoseAvg_T1,LoseMarginAvg_T1,Games_T1,WinPct_T1,SeedDiffLoseAvg_T1,SeedDiffWinAvg_T1,...,PtsAllowedWinAvg_T2,WinMarginAvg_T2,PtsAllowedLoseAvg_T2,PtsScoredLoseAvg_T2,LoseMarginAvg_T2,Games_T2,WinPct_T2,SeedDiffLoseAvg_T2,SeedDiffWinAvg_T2,Win
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,74.615385,58.692308,15.923077,66.916667,54.666667,12.250000,25.0,52.000000,1.250000,0.000000,...,68.500000,5.500000,77.928571,52.357143,25.571429,30.0,6.666667,1.071429,0.000000,1
3101,3106,73.000000,57.833333,15.166667,75.266667,67.333333,7.933333,27.0,44.444444,0.000000,-1.166667,...,63.866667,7.000000,76.533333,54.533333,22.000000,30.0,50.000000,2.733333,-1.066667,1
3101,3114,76.846154,62.153846,14.692308,77.846154,65.000000,12.846154,26.0,50.000000,2.538462,0.000000,...,53.066667,10.400000,67.200000,54.800000,12.400000,25.0,60.000000,4.900000,0.000000,1
3101,3116,77.414474,63.087719,14.326754,75.014286,57.357143,17.657143,27.5,55.835544,-6.300000,7.631579,...,62.705769,13.705769,80.595238,63.757937,16.837302,32.5,50.379507,3.130952,-1.451923,0
3101,3117,76.846154,62.153846,14.692308,77.846154,65.000000,12.846154,26.0,50.000000,2.538462,0.000000,...,68.000000,17.000000,80.250000,70.750000,9.500000,25.0,36.000000,1.750000,-1.555556,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3425,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.0,51.851852,0.000000,0.000000,...,58.952381,18.333333,69.600000,61.600000,8.000000,26.0,80.769231,0.000000,0.000000,0
3478,3433,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.0,51.851852,0.000000,0.000000,...,50.000000,14.041667,68.750000,58.750000,10.000000,28.0,85.714286,0.000000,0.000000,0
3478,3447,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.0,51.851852,0.000000,0.000000,...,60.000000,5.000000,70.947368,50.736842,20.210526,24.0,20.833333,0.000000,0.000000,1
3478,3467,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.0,51.851852,0.000000,0.000000,...,56.909091,13.727273,65.000000,50.933333,14.066667,26.0,42.307692,0.000000,0.000000,1


Unnamed: 0_level_0,Unnamed: 1_level_0,PtsScoredWinAvg_T1,PtsAllowedWinAvg_T1,WinMarginAvg_T1,PtsAllowedLoseAvg_T1,PtsScoredLoseAvg_T1,LoseMarginAvg_T1,Games_T1,WinPct_T1,SeedDiffLoseAvg_T1,SeedDiffWinAvg_T1,PtsScoredWinAvg_T2,PtsAllowedWinAvg_T2,WinMarginAvg_T2,PtsAllowedLoseAvg_T2,PtsScoredLoseAvg_T2,LoseMarginAvg_T2,Games_T2,WinPct_T2,SeedDiffLoseAvg_T2,SeedDiffWinAvg_T2
T1_TeamID,T2_TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3101,3102,74.615385,58.692308,15.923077,66.916667,54.666667,12.250000,25.000000,52.000000,1.250000,0.000000,74.000000,68.500000,5.500000,77.928571,52.357143,25.571429,30.000000,6.666667,1.071429,0.000000
3101,3103,76.358381,62.288288,14.070093,72.932010,60.077489,12.854520,24.909091,57.094913,0.124709,1.090547,72.010614,61.318891,10.691723,74.865492,59.947560,14.917932,28.740741,39.218781,1.414830,0.348672
3101,3104,76.358381,62.288288,14.070093,72.932010,60.077489,12.854520,24.909091,57.094913,0.124709,1.090547,73.829411,57.708211,16.121199,73.327728,58.935481,14.392247,29.777778,50.848420,1.998630,-0.508786
3101,3105,76.358381,62.288288,14.070093,72.932010,60.077489,12.854520,24.909091,57.094913,0.124709,1.090547,64.771946,54.455501,10.316444,70.645333,55.298391,15.346942,27.461538,39.225059,2.042355,-0.582386
3101,3106,73.000000,57.833333,15.166667,75.266667,67.333333,7.933333,27.000000,44.444444,0.000000,-1.166667,70.866667,63.866667,7.000000,76.533333,54.533333,22.000000,30.000000,50.000000,2.733333,-1.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.000000,51.851852,0.000000,0.000000,78.000000,69.583333,8.416667,73.668860,57.945175,15.723684,25.000000,14.000000,1.187500,0.000000
3478,3474,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.000000,51.851852,0.000000,0.000000,67.800000,58.200000,9.600000,75.095238,55.761905,19.333333,25.500000,17.615385,0.833333,0.000000
3478,3475,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.000000,51.851852,0.000000,0.000000,73.083333,59.416667,13.666667,76.274510,58.769608,17.504902,25.000000,54.807692,0.941176,0.000000
3478,3476,64.928571,53.642857,11.285714,72.384615,48.153846,24.230769,27.000000,51.851852,0.000000,0.000000,64.750000,53.250000,11.500000,70.954545,49.500000,21.454545,26.000000,15.384615,0.000000,0.000000


### Feature analysis

In [16]:
corr = train_m.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,PtsScoredWinAvg_T1,PtsAllowedWinAvg_T1,WinMarginAvg_T1,PtsAllowedLoseAvg_T1,PtsScoredLoseAvg_T1,LoseMarginAvg_T1,Games_T1,WinPct_T1,SeedDiffLoseAvg_T1,SeedDiffWinAvg_T1,WinRankDiffAvg_T1,LoseRankDiffAvg_T1,PtsScoredWinAvg_T2,PtsAllowedWinAvg_T2,WinMarginAvg_T2,PtsAllowedLoseAvg_T2,PtsScoredLoseAvg_T2,LoseMarginAvg_T2,Games_T2,WinPct_T2,SeedDiffLoseAvg_T2,SeedDiffWinAvg_T2,WinRankDiffAvg_T2,LoseRankDiffAvg_T2,Win
PtsScoredWinAvg_T1,1.0,0.832533,0.410285,0.617028,0.715626,-0.118684,0.076769,0.247269,-0.056076,0.054836,-0.082011,0.243491,0.189984,0.182173,0.03843,0.20322,0.19625,0.027348,-0.042911,0.033888,-0.042002,0.036996,0.071277,0.097823,0.104411
PtsAllowedWinAvg_T1,0.832533,1.0,-0.163627,0.696434,0.632661,0.16091,-0.1871,-0.12709,0.039609,-0.054403,0.232283,-0.025442,0.182173,0.188552,0.014019,0.195389,0.197222,0.01192,-0.090764,-0.000864,0.00179,-0.007581,0.084011,0.083743,-0.080779
WinMarginAvg_T1,0.410285,-0.163627,1.0,-0.047666,0.2329,-0.476244,0.444716,0.649554,-0.165065,0.187211,-0.510967,0.475494,0.03843,0.014019,0.045359,0.040244,0.024815,0.029079,0.072999,0.061771,-0.077743,0.078362,-0.011361,0.036346,0.318913
PtsAllowedLoseAvg_T1,0.617028,0.696434,-0.047666,1.0,0.832448,0.359032,-0.165718,-0.132187,0.014803,-0.027818,0.180878,-0.015693,0.20322,0.195389,0.040244,0.205156,0.198952,0.026208,-0.088288,0.004262,-0.000177,-0.003328,0.089639,0.092497,-0.061698
PtsScoredLoseAvg_T1,0.715626,0.632661,0.2329,0.832448,1.0,-0.218284,0.055179,0.236936,-0.063857,0.058815,-0.063907,0.284193,0.19625,0.197222,0.024815,0.198952,0.205427,0.004374,-0.057059,0.029926,-0.037924,0.034186,0.086209,0.107949,0.084713
LoseMarginAvg_T1,-0.118684,0.16091,-0.476244,0.359032,-0.218284,1.0,-0.384803,-0.631899,0.133632,-0.14806,0.427377,-0.506329,0.027348,0.01192,0.029079,0.026208,0.004374,0.038789,-0.059384,-0.042901,0.063568,-0.063444,0.012662,-0.018922,-0.251351
Games_T1,0.076769,-0.1871,0.444716,-0.165718,0.055179,-0.384803,1.0,0.547329,-0.256917,0.250654,-0.457902,0.18698,-0.042911,-0.090764,0.072999,-0.088288,-0.057059,-0.059384,0.384287,0.119769,-0.102135,0.106933,-0.204009,-0.086758,0.279685
WinPct_T1,0.247269,-0.12709,0.649554,-0.132187,0.236936,-0.631899,0.547329,1.0,-0.324001,0.359242,-0.646749,0.661375,0.033888,-0.000864,0.061771,0.004262,0.029926,-0.042901,0.119769,0.104635,-0.099427,0.097302,-0.042211,0.059907,0.349928
SeedDiffLoseAvg_T1,-0.056076,0.039609,-0.165065,0.014803,-0.063857,0.133632,-0.256917,-0.324001,1.0,-0.933279,0.176127,-0.171041,-0.042002,0.00179,-0.077743,-0.000177,-0.037924,0.063568,-0.102135,-0.099427,-0.007896,0.011609,0.049939,-0.051576,-0.05429
SeedDiffWinAvg_T1,0.054836,-0.054403,0.187211,-0.027818,0.058815,-0.14806,0.250654,0.359242,-0.933279,1.0,-0.218219,0.202266,0.036996,-0.007581,0.078362,-0.003328,0.034186,-0.063444,0.106933,0.097302,0.011609,-0.003503,-0.038145,0.067242,0.051576


In [17]:
corr = train_m.corr()['Win'].sort_values(ascending=False)
high_corr = corr[[abs(corr) > 0.15 for corr in corr]]

display(high_corr)

Win                   1.000000
WinPct_T1             0.349928
WinMarginAvg_T1       0.318913
Games_T1              0.279685
LoseMarginAvg_T2      0.279558
LoseRankDiffAvg_T1    0.251776
WinRankDiffAvg_T2     0.232001
Games_T2             -0.201123
LoseMarginAvg_T1     -0.251351
WinMarginAvg_T2      -0.270376
LoseRankDiffAvg_T2   -0.277673
WinRankDiffAvg_T1    -0.286740
WinPct_T2            -0.326942
Name: Win, dtype: float64

In [18]:
features = high_corr.index.tolist()[1:]

features

['WinPct_T1',
 'WinMarginAvg_T1',
 'Games_T1',
 'LoseMarginAvg_T2',
 'LoseRankDiffAvg_T1',
 'WinRankDiffAvg_T2',
 'Games_T2',
 'LoseMarginAvg_T1',
 'WinMarginAvg_T2',
 'LoseRankDiffAvg_T2',
 'WinRankDiffAvg_T1',
 'WinPct_T2']

# Model building

In [19]:
def score_dataset(lgbm_params, X, y):
    classifier = lgb.LGBMClassifier(**lgbm_params, class_weight='balanced')
    score      = cross_val_score(classifier, X, y, scoring='neg_brier_score')
    score      = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-6, 1e2, log=True),
        'reg_lambda': trial.suggest_float("reg_lambda", 1e0, 1e3, log=True),
        'num_leaves': 100,
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'learning_rate': trial.suggest_float("learning_rate", 0, 1),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0, 1),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1),
        'n_estimators': 30,
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=10, show_progress_bar=True)

    return study.best_params

In [20]:
def build_x_y(df, features):
    target_column = 'Win'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [21]:
X_m, y_m = build_x_y(train_m, features)
X_w, y_w = build_x_y(train_w, features)

In [22]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 14940, number of negative: 31949
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6091
[LightGBM] [Info] Number of data points in the train set: 46889, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 14940, number of negative: 31949
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6092
[LightGBM] [Info] Number of data points in the train set: 46889, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info

  0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 11219, number of negative: 23957
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5074
[LightGBM] [Info] Number of data points in the train set: 35176, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 11219, number of negative: 23957
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014710 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5079
[LightGBM] [Info] Number of data points in the train set: 35176, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [

In [23]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    clf_test = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(clf_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(clf_test.score(X_train, y_train)))

In [24]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Number of positive: 13038, number of negative: 27990
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6090
[LightGBM] [Info] Number of data points in the train set: 41028, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Model accuracy score: 0.8359
LightGBM Model accuracy score [train]: 0.9257
[LightGBM] [Info] Number of positive: 9791, number of negative: 20988
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5073
[LightGBM] [Info] Number of data points in the train set: 30779, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> inits

# Prediction

In [25]:
def build_proba(X, y, tourney, params):
    clf = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf.fit(X, y)

    pred = clf.predict_proba(tourney)
    tourney['Probability'] = list(map(lambda x: x[1], pred))

    return tourney['Probability']

In [26]:
proba_m = build_proba(X_m, y_m, tourney_m, params_m)
proba_w = build_proba(X_w, y_w, tourney_w, params_w)

display(proba_m)
display(proba_w)

[LightGBM] [Info] Number of positive: 18675, number of negative: 39937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6090
[LightGBM] [Info] Number of data points in the train set: 58612, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 14024, number of negative: 29946
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5078
[LightGBM] [Info] Number of data points in the train set: 43970, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


T1_TeamID  T2_TeamID
1101       1102         0.727167
           1103         0.001352
           1104         0.001987
           1105         0.002268
           1106         0.002062
                          ...   
1478       1473         0.329850
           1474         0.149828
           1475         0.484261
           1476         0.909433
           1477         0.170538
Name: Probability, Length: 142506, dtype: float64

T1_TeamID  T2_TeamID
3101       3102         0.948347
           3103         0.042891
           3104         0.027922
           3105         0.165900
           3106         0.623412
                          ...   
3478       3473         0.882607
           3474         0.828744
           3475         0.380347
           3476         0.902433
           3477         0.475784
Name: Probability, Length: 141000, dtype: float64

In [27]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    slots = slots[slots['Season'] == 2023]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [28]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [29]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [30]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [31]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, proba):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - proba (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]
        
        team_1_prob = proba.loc[team_1, team_2]
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, proba, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, proba)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [32]:
n_brackets = 100000
result_m = run_simulation(seeds_2024_m, slots_m, proba_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, proba_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 100000/100000 [10:30<00:00, 158.59it/s]
100%|██████████| 100000/100000 [10:17<00:00, 161.93it/s]


In [33]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W14
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
12599995,W,100000,R4Y1,Y03
12599996,W,100000,R4Z1,Z04
12599997,W,100000,R5WX,X04
12599998,W,100000,R5YZ,Z04


In [34]:
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- https://www.kaggle.com/code/rustyb/paris-madness-2023
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.