In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna as op
import os
import pandas as pd
import seaborn as sns

op.logging.set_verbosity(op.logging.WARNING)

In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

## History

In [4]:
def build_results(gender):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))
    
    return pd.concat(csvs)

In [5]:
results_m = build_results('M')
results_w = build_results('W')

display(results_m)
display(results_w)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
185860,2024,100,1424,67,1201,65,A,0
185861,2024,100,1429,84,1461,76,A,0
185862,2024,100,1454,71,1156,68,A,0
185863,2024,100,1459,73,1273,60,A,0


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0
...,...,...,...,...,...,...,...,...
130230,2024,100,3409,74,3427,70,H,0
130231,2024,100,3424,67,3161,64,H,0
130232,2024,100,3444,65,3296,47,A,0
130233,2024,100,3457,54,3347,51,H,1


In [6]:
def winner(ids):
    id, wId, lId = ids

    return int(id == wId)

def opponent(x):
    winInt, wId, lId = x
    win = not winInt
    
    return wId if win else lId

def score_diff(x):
    winInt, wScore, lScore = x
    win = not winInt
    
    return (wScore - lScore) if win else (lScore - wScore)

def build_season_results(df):
    season_results = df
    season_results['TeamID'] = season_results[['WTeamID', 'LTeamID']].values.tolist()
    season_results = season_results.explode('TeamID')
    season_results['Win'] = season_results[['TeamID', 'WTeamID', 'LTeamID']].apply(winner, axis=1)
    season_results['Defeat'] = season_results['Win'].apply(lambda x: 1 - x)
    season_results['Games'] = 1
    season_results['ScoreDiff'] = season_results[['Win', 'WScore', 'LScore']].apply(score_diff, axis=1)
    season_results['OTeamID'] = season_results[['Win', 'WTeamID', 'LTeamID']].apply(opponent, axis=1)
    season_results = season_results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc'], axis=1)
    season_results = season_results.groupby(by=['TeamID', 'OTeamID']).sum()
    season_results['WinRatio'] = season_results['Win'] / season_results['Games']
    season_results = season_results.drop(['Win', 'Defeat', 'NumOT', 'Games'], axis=1)

    return season_results

In [7]:
season_results_m = build_season_results(results_m)
season_results_w = build_season_results(results_w)

display(season_results_m)
display(season_results_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,ScoreDiff,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1
1101,1102,-4,1.0
1101,1115,-8,1.0
1101,1116,23,0.0
1101,1117,-7,0.5
1101,1122,-8,1.0
...,...,...,...
1478,1384,-37,1.0
1478,1437,26,0.0
1478,1447,23,0.0
1478,1467,20,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,ScoreDiff,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1
3101,3102,-36,1.0
3101,3106,-11,1.0
3101,3114,-5,1.0
3101,3116,26,0.0
3101,3117,-12,1.0
...,...,...,...
3478,3425,51,0.0
3478,3433,23,0.0
3478,3447,-11,1.0
3478,3467,-22,1.0


In [8]:
def build_teams(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams.drop('TeamName', axis=1)
    teams['SeasonsInD1'] = teams['LastD1Season'] - teams['FirstD1Season']
    teams = teams.set_index('TeamID')
    teams = teams.drop(['FirstD1Season', 'LastD1Season'], axis=1)
    
    return teams

In [9]:
teams_m = build_teams('M')

display(teams_m)

Unnamed: 0_level_0,SeasonsInD1
TeamID,Unnamed: 1_level_1
1101,10
1102,39
1103,39
1104,39
1105,24
...,...
1474,1
1475,1
1476,1
1477,1


In [10]:
def clean_seeds(seed):
    res = seed[1:]

    if len(res) > 2:
        res = res[:-1]

    return int(res)

def build_seeds(gender):
    seeds = CSV["{}NCAATourneySeeds".format(gender)] 
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)
    seeds = seeds.drop('Season', axis=1)
    seeds = seeds.groupby(by='TeamID').mean()
    
    return seeds

In [11]:
seeds_m = build_seeds('M')
seeds_w = build_seeds('W')

display(seeds_m)
display(seeds_w)

Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
1101,14.500000
1102,12.000000
1103,13.600000
1104,5.894737
1105,16.000000
...,...
1459,12.200000
1460,15.000000
1461,10.800000
1462,8.074074


Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
3101,16.000000
3103,13.000000
3104,6.000000
3106,15.333333
3107,14.285714
...,...
3458,7.000000
3460,13.333333
3461,12.500000
3462,5.888889


In [12]:
def build_rankings(gender):
    rankings = CSV["{}MasseyOrdinals".format(gender)]
    rankings = rankings.drop(['SystemName', 'RankingDayNum'], axis=1)
    rankings = rankings.groupby(by='TeamID').mean()
    rankings = rankings.drop('Season', axis=1)

    return rankings

In [13]:
rankings_m = build_rankings('M')

rankings_m

Unnamed: 0_level_0,OrdinalRank
TeamID,Unnamed: 1_level_1
1101,231.948429
1102,183.409557
1103,111.840219
1104,55.914665
1105,319.892673
...,...
1474,224.839731
1475,284.956931
1476,326.853180
1477,292.496635


In [14]:
def build_history(season_results, seeds, teams=None, rankings=None):
    history = season_results.join(seeds, on='TeamID')
    
    if teams is not None:
        history = history.join(teams, on='TeamID')
    
    if rankings is not None:
        history = history.join(rankings, on='TeamID')
    
    return history

In [15]:
history_m = build_history(season_results_m, seeds_m, teams_m, rankings_m)
history_w = build_history(season_results_w, seeds_w)

display(history_m)
display(history_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,ScoreDiff,WinRatio,Seed,SeasonsInD1,OrdinalRank
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1101,1102,-4,1.0,14.5,10,231.948429
1101,1115,-8,1.0,14.5,10,231.948429
1101,1116,23,0.0,14.5,10,231.948429
1101,1117,-7,0.5,14.5,10,231.948429
1101,1122,-8,1.0,14.5,10,231.948429
...,...,...,...,...,...,...
1478,1384,-37,1.0,,0,322.377966
1478,1437,26,0.0,,0,322.377966
1478,1447,23,0.0,,0,322.377966
1478,1467,20,0.0,,0,322.377966


Unnamed: 0_level_0,Unnamed: 1_level_0,ScoreDiff,WinRatio,Seed
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3101,3102,-36,1.0,16.0
3101,3106,-11,1.0,16.0
3101,3114,-5,1.0,16.0
3101,3116,26,0.0,16.0
3101,3117,-12,1.0,16.0
...,...,...,...,...
3478,3425,51,0.0,
3478,3433,23,0.0,
3478,3447,-11,1.0,
3478,3467,-22,1.0,


### Feature analysis

In [16]:
corr = history_m.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,ScoreDiff,WinRatio,Seed,SeasonsInD1,OrdinalRank
ScoreDiff,1.0,-0.41047,0.233303,-0.070563,0.281576
WinRatio,-0.41047,1.0,-0.409773,0.155069,-0.440576
Seed,0.233303,-0.409773,1.0,-0.307741,0.885872
SeasonsInD1,-0.070563,0.155069,-0.307741,1.0,-0.334358
OrdinalRank,0.281576,-0.440576,0.885872,-0.334358,1.0


## T1 vs T2

In [17]:
def build_tx(gender):
    """
    Build the DF that includes T1 vs T2 and T2 vs T1 matchups.
    Concat two exact same DFs, but in one replace:
    - W => T1
    - L => T2
    and in the other one:
    - W => T2
    - L => T1
    """
    csv_names = ['NCAATourneyDetailedResults', 'RegularSeasonDetailedResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x].copy(), csv_names))

    results_t1 = pd.concat(csvs)
    results_t1 = results_t1.drop(['DayNum', 'NumOT', 'WLoc'], axis=1)
    
    results_t2 = results_t1.copy()
        
    results_t1.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(results_t1.columns)]
    results_t2.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(results_t2.columns)]
    
    results = pd.concat([results_t1, results_t2]).reset_index(drop=True)
    results['ScoreDiff'] = results['T1_Score'] - results['T2_Score']
    results['Win'] = np.where(results['ScoreDiff'] > 0, 1, 0)

    return results

def build_t1_t2(gender):
    """
    Generate two DFs:
    - One preffixed by T1_
    - One preffixed by T2_
    """

    t1 = build_tx(gender)
    t1 = t1.drop(['T1_Score', 'T2_Score', 'T2_TeamID'], axis=1)
    t1 = t1.groupby(by=['Season', 'T1_TeamID']).mean()
    t2 = t1.copy()
    t2.index.names = ['Season', 'T2_TeamID']

    t1.columns = ["T1_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t1.columns)]
    t2.columns = ["T2_" + x.replace("T1_","").replace("T2_","opponent_") + "_Avg" for x in list(t2.columns)]
    
    return (t1, t2)

In [18]:
def build_matchups(gender):
    """
    Generate a matchup DF. Each entry has matchups T1 vs each team (T2) as an Index in a 1-N relation.
    """
    teams = CSV["{}Teams".format(gender)].copy()
    teams['T1_TeamID'] = teams['TeamID']
    teams['T2_TeamID'] = [teams['TeamID'].values.tolist() for i in teams.index]
    teams = teams.explode('T2_TeamID')
    teams = teams.groupby(['T1_TeamID', 'T2_TeamID']).sum()
    teams = teams.reset_index()
    teams = teams[['T1_TeamID', 'T2_TeamID']]
    teams = teams[teams['T1_TeamID'] != teams['T2_TeamID']]
    teams = teams.set_index(['T1_TeamID', 'T2_TeamID'])

    return teams

In [19]:
def build_fill(gender):
    """
    Build a DF with the same format as DF, used to fill NaN matchup values with overall AVGs.
    """
    matchups = build_matchups(gender)
    t1, t2   = build_t1_t2(gender)
    
    t1 = t1.groupby('T1_TeamID').mean()
    t2 = t2.groupby('T2_TeamID').mean()
    
    matchups = matchups.join(t1, on=['T1_TeamID'], how='inner')
    matchups = matchups.join(t2, on=['T2_TeamID'], how='inner')
    
    return matchups

In [20]:
def build_tourney(gender):
    """
    Generate a tourney DF. Each entry has matchups T1 vs each team (T2) as an Index in a 1-N relation.
    """
    tourney  = build_tx(gender)[['Season', 'T1_TeamID', 'T2_TeamID' , 'Win']]
    t1, t2   = build_t1_t2(gender)
    matchups = build_matchups(gender)
    fill     = build_fill(gender)
    
    tourney = pd.merge(matchups, tourney, on=['T1_TeamID', 'T2_TeamID'], how='left')
    tourney = pd.merge(tourney, t1, on=['Season', 'T1_TeamID'], how='left')
    tourney = pd.merge(tourney, t2, on=['Season', 'T2_TeamID'], how='left')
    tourney = tourney.groupby(by=['T1_TeamID', 'T2_TeamID']).mean()
    tourney = tourney.drop('Season', axis=1)
    tourney = tourney.fillna(fill)
    
    return tourney

In [21]:
def build_train_tourney(gender, history):
    """
    Build a train and tourney dataset, Train includes the Win column while Tourney may have NaNs.
    """
    tourney = build_tourney(gender)
    tourney = pd.merge(tourney, history, left_index=True, right_on=['TeamID', 'OTeamID'], how='left')

    train = tourney[tourney['Win'].notnull()]
    train['Win'] = train['Win'].astype(int)
    
    return (train, tourney)

In [22]:
train_m, tourney_m = build_train_tourney('M', history_m)
train_w, tourney_w = build_train_tourney('W', history_w)

display(train_m)
display(tourney_m)
display(train_w)
display(tourney_w)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Win'] = train['Win'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Win'] = train['Win'].astype(int)


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg,ScoreDiff,WinRatio,Seed,SeasonsInD1,OrdinalRank
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1,25.518519,56.481481,6.481481,19.962963,13.148148,18.666667,9.037037,23.222222,13.888889,...,5.586207,3.586207,18.724138,-6.034483,0.344828,-4.0,1.0,14.5,10.0,231.948429
1101,1115,1,21.428571,52.928571,7.250000,19.178571,10.892857,14.964286,8.250000,19.642857,11.857143,...,7.500000,2.718750,17.062500,-7.125000,0.375000,-8.0,1.0,14.5,10.0,231.948429
1101,1116,0,25.353636,59.185455,6.470909,18.569091,15.126364,21.245455,8.797273,22.140909,14.390909,...,6.494792,3.250000,13.817708,3.833333,0.640625,23.0,0.0,14.5,10.0,231.948429
1101,1117,0,25.292593,55.824074,6.824074,19.498148,13.324074,18.716667,8.951852,22.911111,14.161111,...,5.816667,3.666667,19.300000,-6.250000,0.333333,-7.0,0.5,14.5,10.0,231.948429
1101,1122,1,26.480000,59.280000,7.760000,21.320000,13.480000,19.400000,9.640000,22.600000,17.600000,...,7.720000,2.520000,4.080000,1.560000,0.480000,-8.0,1.0,14.5,10.0,231.948429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1384,1,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,7.136364,4.272727,16.727273,-10.318182,0.272727,-37.0,1.0,,0.0,322.377966
1478,1437,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,5.666667,4.166667,15.000000,5.916667,0.541667,26.0,0.0,,0.0,322.377966
1478,1447,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,4.750000,3.250000,13.750000,-2.400000,0.450000,23.0,0.0,,0.0,322.377966
1478,1467,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,7.086957,3.521739,16.652174,2.000000,0.565217,20.0,0.0,,0.0,322.377966


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg,ScoreDiff,WinRatio,Seed,SeasonsInD1,OrdinalRank
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1.0,25.518519,56.481481,6.481481,19.962963,13.148148,18.666667,9.037037,23.222222,13.888889,...,5.586207,3.586207,18.724138,-6.034483,0.344828,-4.0,1.0,14.5,10.0,231.948429
1101,1103,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,6.207280,2.720456,18.263107,4.756395,0.647229,,,,,
1101,1104,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,6.455794,3.669305,18.168998,4.540364,0.602638,,,,,
1101,1105,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,7.748847,3.745060,18.209819,-6.540094,0.319781,,,,,
1101,1106,,24.192179,55.786654,6.754721,19.318635,14.414827,20.083464,8.536771,21.543447,13.443715,...,7.166614,3.548025,18.176961,-4.119900,0.403574,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,6.797022,3.335423,17.122257,-9.496865,0.251567,,,,,
1478,1474,,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,6.473333,4.483333,17.670000,-3.430000,0.410000,,,,,
1478,1475,,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,6.796238,3.797022,18.758621,-4.670063,0.315047,,,,,
1478,1476,1.0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,8.040000,4.280000,14.080000,-15.120000,0.080000,-31.0,1.0,,0.0,322.377966


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg,ScoreDiff,WinRatio,Seed
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,1,21.720000,57.600000,7.280000,26.040000,14.320000,20.440000,15.320000,24.560000,13.040000,...,17.666667,14.400000,9.133333,4.333333,17.266667,-23.500000,0.066667,-36.0,1.0,16.0
3101,3106,1,25.555556,58.000000,8.296296,24.444444,10.444444,15.111111,9.666667,22.000000,13.925926,...,12.500000,15.566667,8.800000,4.300000,19.066667,-7.500000,0.500000,-11.0,1.0,16.0
3101,3114,1,24.269231,57.423077,8.730769,25.076923,13.653846,18.192308,9.653846,20.076923,15.076923,...,9.760000,16.520000,6.080000,2.440000,16.840000,1.280000,0.600000,-5.0,1.0,16.0
3101,3116,0,23.893899,54.519231,8.144562,24.625332,12.529178,16.830239,11.941645,23.077586,14.740053,...,14.612903,15.834440,5.896584,3.833966,16.904649,-1.771347,0.503795,26.0,0.0,16.0
3101,3117,1,24.269231,57.423077,8.730769,25.076923,13.653846,18.192308,9.653846,20.076923,15.076923,...,15.680000,15.680000,7.720000,2.960000,18.480000,0.040000,0.360000,-12.0,1.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3425,0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,12.500000,15.500000,6.000000,2.545455,18.272727,13.636364,0.818182,51.0,0.0,
3478,3433,0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,9.166667,13.416667,6.375000,2.541667,17.083333,9.875000,0.833333,23.0,0.0,
3478,3447,1,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,15.600000,17.100000,9.450000,2.800000,17.700000,-15.300000,0.250000,-11.0,1.0,
3478,3467,1,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,12.043478,16.217391,8.782609,3.782609,18.130435,-4.130435,0.347826,-22.0,1.0,


Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg,ScoreDiff,WinRatio,Seed
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3101,3102,1.0,21.720000,57.600000,7.280000,26.040000,14.320000,20.440000,15.320000,24.560000,13.040000,...,17.666667,14.400000,9.133333,4.333333,17.266667,-23.500000,0.066667,-36.0,1.0,16.0
3101,3103,,24.106831,56.689811,8.243338,25.274662,13.325945,18.507925,11.928763,23.537225,14.138603,...,13.357218,15.505805,8.000830,2.873861,17.908876,0.000095,0.501941,,,
3101,3104,,24.106831,56.689811,8.243338,25.274662,13.325945,18.507925,11.928763,23.537225,14.138603,...,12.507329,17.362126,7.600154,3.829241,18.453844,2.548561,0.526016,,,
3101,3105,,24.106831,56.689811,8.243338,25.274662,13.325945,18.507925,11.928763,23.537225,14.138603,...,12.461237,16.515246,9.081460,3.347263,18.116089,-5.771821,0.383722,,,
3101,3106,1.0,25.555556,58.000000,8.296296,24.444444,10.444444,15.111111,9.666667,22.000000,13.925926,...,12.500000,15.566667,8.800000,4.300000,19.066667,-7.500000,0.500000,-11.0,1.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,14.202857,15.383810,8.733333,2.602857,17.485714,-13.200000,0.139048,,,
3478,3474,,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,13.891026,16.590659,7.121795,3.244505,17.670330,-14.283883,0.167582,,,
3478,3475,,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,12.153846,15.778846,9.726923,3.096154,18.338462,-0.155769,0.523077,,,
3478,3476,1.0,20.260870,57.000000,4.913043,17.347826,10.347826,13.913043,9.739130,20.086957,11.782609,...,14.217391,14.043478,11.565217,3.434783,16.608696,-17.695652,0.130435,-21.0,1.0,


### Feature analysis

In [23]:
train_m

Unnamed: 0_level_0,Unnamed: 1_level_0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,...,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg,ScoreDiff,WinRatio,Seed,SeasonsInD1,OrdinalRank
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1101,1102,1,25.518519,56.481481,6.481481,19.962963,13.148148,18.666667,9.037037,23.222222,13.888889,...,5.586207,3.586207,18.724138,-6.034483,0.344828,-4.0,1.0,14.5,10.0,231.948429
1101,1115,1,21.428571,52.928571,7.250000,19.178571,10.892857,14.964286,8.250000,19.642857,11.857143,...,7.500000,2.718750,17.062500,-7.125000,0.375000,-8.0,1.0,14.5,10.0,231.948429
1101,1116,0,25.353636,59.185455,6.470909,18.569091,15.126364,21.245455,8.797273,22.140909,14.390909,...,6.494792,3.250000,13.817708,3.833333,0.640625,23.0,0.0,14.5,10.0,231.948429
1101,1117,0,25.292593,55.824074,6.824074,19.498148,13.324074,18.716667,8.951852,22.911111,14.161111,...,5.816667,3.666667,19.300000,-6.250000,0.333333,-7.0,0.5,14.5,10.0,231.948429
1101,1122,1,26.480000,59.280000,7.760000,21.320000,13.480000,19.400000,9.640000,22.600000,17.600000,...,7.720000,2.520000,4.080000,1.560000,0.480000,-8.0,1.0,14.5,10.0,231.948429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1384,1,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,7.136364,4.272727,16.727273,-10.318182,0.272727,-37.0,1.0,,0.0,322.377966
1478,1437,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,5.666667,4.166667,15.000000,5.916667,0.541667,26.0,0.0,,0.0,322.377966
1478,1447,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,4.750000,3.250000,13.750000,-2.400000,0.450000,23.0,0.0,,0.0,322.377966
1478,1467,0,25.095238,59.238095,9.809524,27.428571,10.761905,13.904762,7.380952,21.857143,15.142857,...,7.086957,3.521739,16.652174,2.000000,0.565217,20.0,0.0,,0.0,322.377966


In [24]:
corr = train_m.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Win,T1_FGM_Avg,T1_FGA_Avg,T1_FGM3_Avg,T1_FGA3_Avg,T1_FTM_Avg,T1_FTA_Avg,T1_OR_Avg,T1_DR_Avg,T1_Ast_Avg,T1_TO_Avg,T1_Stl_Avg,T1_Blk_Avg,T1_PF_Avg,T1_opponent_FGM_Avg,T1_opponent_FGA_Avg,T1_opponent_FGM3_Avg,T1_opponent_FGA3_Avg,T1_opponent_FTM_Avg,T1_opponent_FTA_Avg,T1_opponent_OR_Avg,T1_opponent_DR_Avg,T1_opponent_Ast_Avg,T1_opponent_TO_Avg,T1_opponent_Stl_Avg,T1_opponent_Blk_Avg,T1_opponent_PF_Avg,T1_ScoreDiff_Avg,T1_Win_Avg,T2_FGM_Avg,T2_FGA_Avg,T2_FGM3_Avg,T2_FGA3_Avg,T2_FTM_Avg,T2_FTA_Avg,T2_OR_Avg,T2_DR_Avg,T2_Ast_Avg,T2_TO_Avg,T2_Stl_Avg,T2_Blk_Avg,T2_PF_Avg,T2_opponent_FGM_Avg,T2_opponent_FGA_Avg,T2_opponent_FGM3_Avg,T2_opponent_FGA3_Avg,T2_opponent_FTM_Avg,T2_opponent_FTA_Avg,T2_opponent_OR_Avg,T2_opponent_DR_Avg,T2_opponent_Ast_Avg,T2_opponent_TO_Avg,T2_opponent_Stl_Avg,T2_opponent_Blk_Avg,T2_opponent_PF_Avg,T2_ScoreDiff_Avg,T2_Win_Avg,ScoreDiff,WinRatio,Seed,SeasonsInD1,OrdinalRank
Win,1.0,0.233923,0.073342,0.089488,0.028619,0.133261,0.092696,0.060834,0.197128,0.244576,-0.199776,0.052043,0.200095,-0.105267,-0.167444,0.021632,-0.079576,0.002754,-0.183335,-0.186403,-0.053103,-0.198422,-0.190847,-0.013657,-0.168624,-0.058464,0.066786,0.374823,0.359697,-0.200054,-0.048262,-0.079287,-0.017225,-0.117029,-0.078113,-0.058814,-0.171524,-0.223417,0.178336,-0.039077,-0.171234,0.116932,0.167939,-0.001336,0.081895,0.012979,0.167477,0.172412,0.045929,0.192397,0.177654,0.010788,0.159498,0.065718,-0.035659,-0.342953,-0.327168,-0.244484,0.803871,-0.360927,0.136596,-0.392048
T1_FGM_Avg,0.233923,1.0,0.728588,0.399867,0.276044,0.259282,0.171067,0.148132,0.512777,0.681589,-0.300362,0.220536,0.325227,-0.159944,0.283559,0.551122,0.191862,0.286703,-0.137484,-0.170337,-0.025969,-0.08693,-0.000707,0.042326,-0.185805,-0.073855,0.032118,0.606272,0.571173,0.164944,0.158318,0.134913,0.143232,-0.000688,-0.035351,-0.097311,0.098545,0.062492,-0.099112,-0.0187,0.02261,-0.084059,0.131257,0.158956,0.148299,0.171219,-0.036667,-0.062143,-0.119678,0.056082,0.029167,-0.085983,-0.039622,-0.013634,-0.069251,0.033855,0.044457,-0.176247,0.256487,-0.354782,0.080249,-0.421239
T1_FGA_Avg,0.073342,0.728588,1.0,0.329397,0.403792,0.118897,0.108698,0.349073,0.37019,0.309997,-0.154547,0.289702,0.242584,0.014658,0.491742,0.629579,0.282761,0.334482,0.137691,0.121091,0.107611,0.430088,0.262061,0.159263,-0.039127,0.208491,-0.027986,0.154466,0.134374,0.158318,0.174846,0.147571,0.171775,-0.026499,-0.059582,-0.124635,0.119261,0.046809,-0.101594,-0.026819,0.022001,-0.097037,0.144407,0.18594,0.175609,0.211093,-0.042868,-0.068771,-0.136093,0.089691,0.028286,-0.106773,-0.025875,-0.004292,-0.093353,0.01405,0.01749,-0.049121,0.070511,-0.14429,-0.001257,-0.107264
T1_FGM3_Avg,0.089488,0.399867,0.329397,1.0,0.923646,-0.121379,-0.256074,-0.308587,0.195139,0.364187,-0.380225,-0.065437,-0.086219,-0.198619,0.262941,0.285462,0.238675,0.272648,-0.173334,-0.220947,-0.216024,0.195297,0.027544,-0.169917,-0.264123,-0.213223,-0.185369,0.258895,0.239882,0.134913,0.147571,0.157336,0.181764,-0.05433,-0.091515,-0.152824,0.103139,0.020494,-0.125405,-0.048995,-0.009458,-0.10255,0.127085,0.140263,0.229448,0.252264,-0.054845,-0.088386,-0.199349,0.088375,0.031092,-0.126447,-0.064867,-0.027049,-0.108833,-0.000559,0.003668,-0.062589,0.097671,-0.071697,-0.063601,-0.139124
T1_FGA3_Avg,0.028619,0.276044,0.403792,0.923646,1.0,-0.203423,-0.296775,-0.280933,0.139348,0.199377,-0.323205,-0.028874,-0.104687,-0.173786,0.328609,0.31247,0.285467,0.320892,-0.102307,-0.146896,-0.21138,0.381009,0.105695,-0.146905,-0.187423,-0.144148,-0.240103,0.073048,0.061204,0.143232,0.171775,0.181764,0.221636,-0.082536,-0.125965,-0.201763,0.114041,0.005833,-0.157167,-0.056243,-0.016475,-0.136661,0.144834,0.164863,0.262842,0.303817,-0.071882,-0.110503,-0.250917,0.111368,0.025399,-0.160438,-0.068267,-0.024474,-0.151961,-0.01004,-0.008336,-0.014212,0.025057,0.010664,-0.102259,-0.018383
T1_FTM_Avg,0.133261,0.259282,0.118897,-0.121379,-0.203423,1.0,0.931281,0.337977,0.319097,0.16329,0.083571,0.120885,0.218742,0.238782,0.05642,0.22476,-0.052018,-0.021943,0.195192,0.205392,0.216695,-0.162713,-0.020728,0.185617,-0.006242,0.156177,0.641645,0.342986,0.360524,-0.000688,-0.026499,-0.05433,-0.082536,0.127312,0.142363,0.112963,0.015377,0.024558,0.036036,0.007086,0.036813,0.135313,-0.03766,-0.038323,-0.086066,-0.107991,0.118645,0.130517,0.133043,-0.007737,-0.012104,0.055654,-0.021266,0.02827,0.146696,0.026986,0.030143,-0.102747,0.149608,-0.171456,0.098331,-0.227623
T1_FTA_Avg,0.092696,0.171067,0.108698,-0.256074,-0.296775,0.931281,1.0,0.483669,0.285217,0.0908,0.235123,0.184759,0.251592,0.32474,0.028702,0.191331,-0.106262,-0.081786,0.27617,0.310985,0.331464,-0.093973,0.010653,0.289604,0.099644,0.223463,0.697828,0.242849,0.266484,-0.035351,-0.059582,-0.091515,-0.125965,0.142363,0.171542,0.167649,-0.003462,0.02078,0.085118,0.026369,0.037846,0.173892,-0.064958,-0.070366,-0.133549,-0.163261,0.136839,0.163338,0.2066,-0.01552,-0.008303,0.105944,0.002137,0.039188,0.183732,0.021675,0.023972,-0.073194,0.102145,-0.115487,0.104863,-0.148218
T1_OR_Avg,0.060834,0.148132,0.349073,-0.308587,-0.280933,0.337977,0.483669,1.0,0.140286,0.097082,0.365816,0.305053,0.309832,0.338825,-0.115851,0.021696,-0.227923,-0.225224,0.23158,0.301627,0.518191,-0.116526,0.056809,0.434441,0.187759,0.276856,0.405124,0.167243,0.152225,-0.097311,-0.124635,-0.152824,-0.201763,0.112963,0.167649,0.316985,-0.037785,0.055507,0.242162,0.095977,0.054064,0.206019,-0.123104,-0.111956,-0.214486,-0.255501,0.080413,0.13262,0.404155,-0.060146,0.022401,0.255183,0.097425,0.039781,0.23776,0.029193,0.032456,-0.050413,0.070054,-0.154082,0.164948,-0.143788
T1_DR_Avg,0.197128,0.512777,0.37019,0.195139,0.139348,0.319097,0.285217,0.140286,1.0,0.372703,-0.116659,-0.225744,0.395025,-0.140384,0.014528,0.550917,0.155063,0.335614,-0.221448,-0.210648,-0.028788,0.040005,-0.205878,-0.289419,-0.087022,-0.04064,0.160274,0.483158,0.466704,0.098545,0.119261,0.103139,0.114041,0.015377,-0.003462,-0.037785,0.18864,0.023863,-0.08906,-0.059989,0.028276,-0.009193,0.06848,0.106989,0.11733,0.137279,-0.003314,-0.016835,-0.042844,0.169808,0.000985,-0.059312,-0.110708,0.021068,-0.001577,0.025323,0.031974,-0.153313,0.219981,-0.308147,0.122192,-0.387499
T1_Ast_Avg,0.244576,0.681589,0.309997,0.364187,0.199377,0.16329,0.0908,0.097082,0.372703,1.0,-0.159743,0.182625,0.250577,-0.159126,0.005097,0.256185,0.005916,0.08693,-0.239234,-0.246511,0.040453,-0.271577,-0.021811,0.101676,-0.076003,-0.161781,0.063253,0.609519,0.565385,0.062492,0.046809,0.020494,0.005833,0.024558,0.02078,0.055507,0.023863,0.071041,0.043546,0.042281,0.02478,-0.001716,0.029158,0.045136,0.018584,0.013288,-0.012883,-0.010831,0.057248,-0.013915,0.055728,0.059175,0.025632,-0.010102,0.01779,0.032574,0.043313,-0.173289,0.276034,-0.400592,0.131235,-0.475381


In [25]:
corr = train_m.corr()['Win'].sort_values(ascending=False)
high_corr = corr[[abs(corr) > 0.3 for corr in corr]]

display(high_corr)

Win                 1.000000
WinRatio            0.803871
T1_ScoreDiff_Avg    0.374823
T1_Win_Avg          0.359697
T2_Win_Avg         -0.327168
T2_ScoreDiff_Avg   -0.342953
Seed               -0.360927
OrdinalRank        -0.392048
Name: Win, dtype: float64

In [26]:
features = high_corr.index.tolist()[1:]

features

['WinRatio',
 'T1_ScoreDiff_Avg',
 'T1_Win_Avg',
 'T2_Win_Avg',
 'T2_ScoreDiff_Avg',
 'Seed',
 'OrdinalRank']

# Model building

In [27]:
def score_dataset(lgbm_params, X, y):
    classifier = lgb.LGBMClassifier(**lgbm_params, class_weight='balanced')
    score      = cross_val_score(classifier, X, y, scoring='neg_brier_score')
    score      = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': 'cpu',
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=10, show_progress_bar=True)

    return study.best_params

In [28]:
def build_x_y(df, features):
    target_column = 'Win'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [29]:
X_m, y_m = build_x_y(train_m, features)
X_w, y_w = build_x_y(train_w, features)

In [30]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [31]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    clf_test = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(clf_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(clf_test.score(X_train, y_train)))

In [32]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Number of positive: 10753, number of negative: 21074
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15194
[LightGBM] [Info] Number of data points in the train set: 31827, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Model accuracy score: 0.9596
LightGBM Model accuracy score [train]: 0.9587
[LightGBM] [Info] Number of positive: 7293, number of negative: 14817
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14889
[LightGBM] [Info] Number of data points in the train set: 22110, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> ini

# Prediction

In [33]:
def build_proba(X, y, tourney, params):
    clf = lgb.LGBMClassifier(**params, class_weight='balanced')
    clf.fit(X, y)
    
    tourney = tourney.drop('Win', axis=1)
    results = tourney

    pred = clf.predict_proba(results)
    results['Probability'] = list(map(lambda x: x[1], pred))

    return results['Probability']

In [34]:
proba_m = build_proba(X_m, y_m, tourney_m, params_m)
proba_w = build_proba(X_w, y_w, tourney_w, params_w)

display(proba_m)
display(proba_w)

[LightGBM] [Info] Number of positive: 15410, number of negative: 30058
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15198
[LightGBM] [Info] Number of data points in the train set: 45468, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 10375, number of negative: 21211
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14893
[LightGBM] [Info] Number of data points in the train set: 31586, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


TeamID  OTeamID
1101    1102       0.932458
        1103       0.073263
        1104       0.073263
        1105       0.073263
        1106       0.073263
                     ...   
1478    1473       0.073263
        1474       0.073263
        1475       0.073263
        1476       0.932458
        1477       0.073263
Name: Probability, Length: 142506, dtype: float64

TeamID  OTeamID
3101    3102       0.830100
        3103       0.228265
        3104       0.223278
        3105       0.261818
        3106       0.786438
                     ...   
3478    3473       0.304098
        3474       0.300265
        3475       0.213176
        3476       0.828935
        3477       0.213699
Name: Probability, Length: 141000, dtype: float64

In [35]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    slots = slots[slots['Season'] == 2023]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [36]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [37]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [38]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [39]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, proba):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - proba (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]
        
        team_1_prob = proba.loc[team_1, team_2]
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, proba, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, proba)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [40]:
n_brackets = 5000
result_m = run_simulation(seeds_2024_m, slots_m, proba_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, proba_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 5000/5000 [00:30<00:00, 166.51it/s]
100%|██████████| 5000/5000 [00:30<00:00, 165.85it/s]


In [41]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W16
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
629995,W,5000,R4Y1,Y03
629996,W,5000,R4Z1,Z07
629997,W,5000,R5WX,W03
629998,W,5000,R5YZ,Y03


In [42]:
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- https://www.kaggle.com/code/rustyb/paris-madness-2023
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.