<a href="https://www.kaggle.com/code/tcordeu/march-madness-2024?scriptVersionId=166667635" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [94]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

op.logging.set_verbosity(op.logging.WARNING)

In [95]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [96]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

In [97]:
def build_results(gender):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))
    
    return pd.concat(csvs)

In [98]:
results_m = build_results('M')
results_w = build_results('W')

display(results_m)
display(results_w)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
186547,2024,114,1454,75,1237,70,A,0
186548,2024,114,1455,74,1412,66,A,0
186549,2024,114,1459,91,1359,69,H,0
186550,2024,114,1462,91,1177,58,H,0


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0
...,...,...,...,...,...,...,...,...
130890,2024,114,3409,76,3396,67,A,0
130891,2024,114,3424,63,3129,57,H,0
130892,2024,114,3433,69,3348,59,A,0
130893,2024,114,3453,70,3236,61,A,0


In [99]:
def winner(ids):
    id, wId, lId = ids

    return int(id == wId)

def opponent(x):
    winInt, wId, lId = x
    win = not winInt
    
    return wId if win else lId

def score_diff(x):
    winInt, wScore, lScore = x
    win = not winInt
    
    return (wScore - lScore) if win else (lScore - wScore)

def build_season_results(df):
    season_results = df
    season_results['TeamID'] = season_results[['WTeamID', 'LTeamID']].values.tolist()
    season_results = season_results.explode('TeamID')
    season_results['Win'] = season_results[['TeamID', 'WTeamID', 'LTeamID']].apply(winner, axis=1)
    season_results['Defeat'] = season_results['Win'].apply(lambda x: 1 - x)
    season_results['Games'] = season_results['Win'] + season_results['Defeat']
    season_results['ScoreDiff'] = season_results[['Win', 'WScore', 'LScore']].apply(score_diff, axis=1)
    season_results['OTeamID'] = season_results[['Win', 'WTeamID', 'LTeamID']].apply(opponent, axis=1)
    season_results['Home'] = season_results['WLoc'].apply(lambda x: int(x[0] == 'H'))
    season_results = season_results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT'], axis=1)
    season_results = season_results.groupby(by=['TeamID', 'OTeamID']).sum()
    season_results['WinRatio'] = season_results['Win'] / season_results['Games']
    season_results = season_results.drop(['Win', 'Defeat'], axis=1)

    return season_results

In [100]:
season_results_m = build_season_results(results_m)
season_results_w = build_season_results(results_w)

display(season_results_m)
display(season_results_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1101,1102,1,-4,0,1.0
1101,1115,1,-8,0,1.0
1101,1116,2,23,2,0.0
1101,1117,2,-7,2,0.5
1101,1122,1,-8,0,1.0
...,...,...,...,...,...
1478,1384,1,-37,1,1.0
1478,1437,1,26,1,0.0
1478,1447,1,23,1,0.0
1478,1467,2,20,1,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3101,3102,1,-36,1,1.0
3101,3106,1,-11,1,1.0
3101,3114,1,-5,0,1.0
3101,3116,2,26,1,0.0
3101,3117,1,-12,0,1.0
...,...,...,...,...,...
3478,3425,1,51,1,0.0
3478,3433,1,23,1,0.0
3478,3447,1,-11,1,1.0
3478,3467,2,-22,1,1.0


In [101]:
def build_teams(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams.drop('TeamName', axis=1)
    teams = teams.set_index('TeamID')
    
    return teams

In [102]:
teams_m = build_teams('M')
teams_w = build_teams('W') # FIXME: Maybe useless since there is no data aside from TeamName.

display(teams_m)
display(teams_w)

Unnamed: 0_level_0,FirstD1Season,LastD1Season
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1
1101,2014,2024
1102,1985,2024
1103,1985,2024
1104,1985,2024
1105,2000,2024
...,...,...
1474,2023,2024
1475,2023,2024
1476,2023,2024
1477,2023,2024


3101
3102
3103
3104
3105
...
3474
3475
3476
3477
3478


In [103]:
def clean_seeds(seed):
    res = seed[1:]

    if len(res) > 2:
        res = res[:-1]

    return int(res)

def build_seeds(gender):
    seeds = CSV["{}NCAATourneySeeds".format(gender)] 
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)
    seeds = seeds.drop('Season', axis=1)
    seeds = seeds.groupby(by='TeamID').mean()
    
    return seeds

In [104]:
seeds_m = build_seeds('M')
seeds_w = build_seeds('W')

display(seeds_m)
display(seeds_w)

Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
1101,14.500000
1102,12.000000
1103,13.600000
1104,5.894737
1105,16.000000
...,...
1459,12.200000
1460,15.000000
1461,10.800000
1462,8.074074


Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
3101,16.000000
3103,13.000000
3104,6.000000
3106,15.333333
3107,14.285714
...,...
3458,7.000000
3460,13.333333
3461,12.500000
3462,5.888889


In [105]:
def build_rankings(gender):
    rankings = CSV["{}MasseyOrdinals".format(gender)]
    rankings = rankings.drop(['SystemName', 'RankingDayNum'], axis=1)
    rankings = rankings.groupby(by='TeamID').mean()
    rankings = rankings.drop('Season', axis=1)

    return rankings

In [106]:
rankings_m = build_rankings('M')

rankings_m

Unnamed: 0_level_0,OrdinalRank
TeamID,Unnamed: 1_level_1
1101,231.982452
1102,183.893379
1103,111.715467
1104,55.569038
1105,319.997840
...,...
1474,227.867379
1475,288.027638
1476,328.846591
1477,294.518844


In [107]:
def build_history(season_results, seeds, teams, rankings=None):
    history = season_results.join(teams, on='TeamID').join(seeds, on='TeamID')
    history = history.reset_index()
    history = pd.merge(history, seeds, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
    history['SeedDiff'] = history['Seed_T'] - history['Seed_O']
    history = history.drop(['Seed_T', 'Seed_O'], axis=1)
    history = history.set_index(['TeamID', 'OTeamID'])
    
    if rankings is not None:
        history = history.join(rankings, on='TeamID')
    
    return history.fillna(0)

In [108]:
history_m = build_history(season_results_m, seeds_m, teams_m, rankings_m)
history_w = build_history(season_results_w, seeds_w, teams_w)

display(history_m)
display(history_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,SeedDiff,OrdinalRank
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1101,1102,1,-4,0,1.0,2014,2024,2.50000,231.982452
1101,1115,1,-8,0,1.0,2014,2024,-1.50000,231.982452
1101,1116,2,23,2,0.0,2014,2024,8.23913,231.982452
1101,1117,2,-7,2,0.5,2014,2024,-0.50000,231.982452
1101,1122,1,-8,0,1.0,2014,2024,0.10000,231.982452
...,...,...,...,...,...,...,...,...,...
1478,1360,1,9,1,0.0,2024,2024,0.00000,320.778736
1478,1364,1,24,1,0.0,2024,2024,0.00000,320.778736
1478,1384,1,-37,1,1.0,2024,2024,0.00000,320.778736
1478,1437,1,26,1,0.0,2024,2024,0.00000,320.778736


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,SeedDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3101,3106,1,-11,1,1.000000,0.666667
3101,3114,1,-5,0,1.000000,3.666667
3101,3116,2,26,1,0.000000,8.375000
3101,3124,2,102,2,0.000000,13.333333
3101,3146,15,-16,7,0.533333,2.000000
...,...,...,...,...,...,...
3478,3357,2,14,0,0.500000,0.000000
3478,3384,1,3,1,0.000000,0.000000
3478,3392,1,21,0,0.000000,0.000000
3478,3425,1,51,1,0.000000,0.000000


In [111]:
def build_avg(history):
    agg = {'Games': 'sum', 'ScoreDiff': 'mean', 'Home': 'sum', 'WinRatio': 'mean', 'SeedDiff': 'mean'}
    if 'OrdinalRank' in history.columns:
        agg['OrdinalRank'] = 'mean'
        agg['FirstD1Season'] = 'mean'
        agg['LastD1Season'] = 'mean'
    
    avg = history.groupby('TeamID').agg(agg)
    
    return avg

In [113]:
avg_m = build_avg(history_m)
avg_w = build_avg(history_w)

display(avg_m)
display(avg_w)

Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,SeedDiff,OrdinalRank,FirstD1Season,LastD1Season
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1101,202,9.930556,120,0.346346,2.842950,231.982452,2014.0,2024.0
1102,978,32.950000,566,0.444344,0.285921,183.893379,1985.0,2024.0
1103,948,-14.933333,590,0.558109,1.760979,111.715467,1985.0,2024.0
1104,1247,-28.252577,725,0.705702,-4.722411,55.569038,1985.0,2024.0
1105,596,42.882979,382,0.118832,4.951648,319.997840,2000.0,2024.0
...,...,...,...,...,...,...,...,...
1474,45,8.185185,32,0.327160,0.000000,227.867379,2023.0,2024.0
1475,31,11.062500,22,0.244792,0.000000,288.027638,2023.0,2024.0
1476,39,18.227273,28,0.162879,0.000000,328.846591,2023.0,2024.0
1477,45,14.391304,23,0.296377,0.000000,294.518844,2023.0,2024.0


Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,SeedDiff
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3101,182,-4.764706,102,0.478283,3.772243
3102,615,98.376344,336,0.202775,0.000000
3103,677,36.882353,374,0.368098,1.785072
3104,721,6.717391,374,0.657250,-4.505748
3105,558,61.821918,345,0.296334,0.000000
...,...,...,...,...,...
3474,33,33.166667,20,0.129630,0.000000
3475,33,5.214286,18,0.392857,0.000000
3476,30,25.375000,16,0.145833,0.000000
3477,42,16.578947,27,0.240351,0.000000


In [114]:
def build_matchups(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams[['TeamID']]
    teams = pd.merge(teams, teams, how='cross')
    teams = teams.rename(columns={'TeamID_x': 'TeamID', 'TeamID_y': 'OTeamID'})
    teams = teams[teams['TeamID'] != teams['OTeamID']]
    teams = teams.set_index(['TeamID', 'OTeamID'])

    return teams

In [115]:
matchups_m = build_matchups('M')
matchups_w = build_matchups('W')

display(matchups_m)
display(matchups_w)

TeamID,OTeamID
1101,1102
1101,1103
1101,1104
1101,1105
1101,1106
...,...
1478,1473
1478,1474
1478,1475
1478,1476


TeamID,OTeamID
3101,3102
3101,3103
3101,3104
3101,3105
3101,3106
...,...
3478,3473
3478,3474
3478,3475
3478,3476


In [116]:
def build_df(history, matchups, avg):
    df = pd.merge(matchups, history, on=['TeamID', 'OTeamID'], how='left')
    df = df.fillna(avg).fillna(0)

    if 'FirstD1Season' in df.columns:
        df['FirstD1Season'] = df['FirstD1Season'].astype(int)
        df['LastD1Season'] = df['LastD1Season'].astype(int)
    
    return df

In [117]:
df_m = build_df(history_m, matchups_m, avg_m)
df_w = build_df(history_w, matchups_w, avg_w)

display(df_m)
display(df_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,SeedDiff,OrdinalRank
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1101,1102,1.0,-4.000000,0.0,1.000000,2014,2024,2.50000,231.982452
1101,1103,202.0,9.930556,120.0,0.346346,2014,2024,2.84295,231.982452
1101,1104,202.0,9.930556,120.0,0.346346,2014,2024,2.84295,231.982452
1101,1105,202.0,9.930556,120.0,0.346346,2014,2024,2.84295,231.982452
1101,1106,202.0,9.930556,120.0,0.346346,2014,2024,2.84295,231.982452
...,...,...,...,...,...,...,...,...,...
1478,1473,17.0,7.142857,13.0,0.285714,2024,2024,0.00000,320.778736
1478,1474,17.0,7.142857,13.0,0.285714,2024,2024,0.00000,320.778736
1478,1475,17.0,7.142857,13.0,0.285714,2024,2024,0.00000,320.778736
1478,1476,17.0,7.142857,13.0,0.285714,2024,2024,0.00000,320.778736


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,SeedDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3101,3102,182.0,-4.764706,102.0,0.478283,3.772243
3101,3103,182.0,-4.764706,102.0,0.478283,3.772243
3101,3104,182.0,-4.764706,102.0,0.478283,3.772243
3101,3105,182.0,-4.764706,102.0,0.478283,3.772243
3101,3106,1.0,-11.000000,1.0,1.000000,0.666667
...,...,...,...,...,...,...
3478,3473,16.0,16.500000,10.0,0.178571,0.000000
3478,3474,16.0,16.500000,10.0,0.178571,0.000000
3478,3475,16.0,16.500000,10.0,0.178571,0.000000
3478,3476,16.0,16.500000,10.0,0.178571,0.000000


In [118]:
corr_m = df_m.corr()
corr_m.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,SeedDiff,OrdinalRank
Games,1.0,-0.037826,0.996534,0.093182,-0.287709,0.16516,-0.034956,-0.077111
ScoreDiff,-0.037826,1.0,-0.021034,-0.491599,0.085927,-0.056504,0.398305,0.387142
Home,0.996534,-0.021034,1.0,0.073625,-0.281736,0.162693,-0.014793,-0.049087
WinRatio,0.093182,-0.491599,0.073625,1.0,-0.218273,0.133133,-0.623153,-0.545768
FirstD1Season,-0.287709,0.085927,-0.281736,-0.218273,1.0,0.019527,0.119842,0.385159
LastD1Season,0.16516,-0.056504,0.162693,0.133133,0.019527,1.0,0.019211,0.261529
SeedDiff,-0.034956,0.398305,-0.014793,-0.623153,0.119842,0.019211,1.0,0.58028
OrdinalRank,-0.077111,0.387142,-0.049087,-0.545768,0.385159,0.261529,0.58028,1.0


In [119]:
corr_w = df_w.corr()
corr_w.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,SeedDiff
Games,1.0,-0.052559,0.995788,0.171917,-0.071757
ScoreDiff,-0.052559,1.0,-0.03552,-0.519959,0.373683
Home,0.995788,-0.03552,1.0,0.152957,-0.052892
WinRatio,0.171917,-0.519959,0.152957,1.0,-0.57042
SeedDiff,-0.071757,0.373683,-0.052892,-0.57042,1.0


In [120]:
corr_m = df_m.corr()['WinRatio'].sort_values(ascending=False)
high_corr_m = corr_m[[abs(corr_m) > 0.4 for corr_m in corr_m]]

corr_w = df_w.corr()['WinRatio'].sort_values(ascending=False)
high_corr_w = corr_w[[abs(corr_w) > 0.4 for corr_w in corr_w]]

display(high_corr_m)
display(high_corr_w)

WinRatio       1.000000
ScoreDiff     -0.491599
OrdinalRank   -0.545768
SeedDiff      -0.623153
Name: WinRatio, dtype: float64

WinRatio     1.000000
ScoreDiff   -0.519959
SeedDiff    -0.570420
Name: WinRatio, dtype: float64

In [121]:
features_m = high_corr_m.index.tolist()[1:]
features_w = high_corr_w.index.tolist()[1:]

display(features_m)
display(features_w)

['ScoreDiff', 'OrdinalRank', 'SeedDiff']

['ScoreDiff', 'SeedDiff']

In [126]:
def score_dataset(lgbm_params, X, y):
    reg   = lgb.LGBMRegressor(**lgbm_params)
    score = cross_val_score(reg, X, y)
    score = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': 'cpu',
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, show_progress_bar=True)

    return study.best_params

In [123]:
def build_x_y(df, features):
    target_column = 'WinRatio'
    feature_columns = features
    
    return df[feature_columns], df[target_column]

In [124]:
X_m, y_m = build_x_y(df_m, features_m)
X_w, y_w = build_x_y(df_w, features_w)

In [127]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [130]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    reg_test = lgb.LGBMRegressor(**params)
    reg_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(reg_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(reg_test.score(X_train, y_train)))

In [131]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 758
[LightGBM] [Info] Number of data points in the train set: 99754, number of used features: 3
[LightGBM] [Info] Start training from score 0.435542
LightGBM Model accuracy score: 0.1307
LightGBM Model accuracy score [train]: 0.1308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 98700, number of used features: 2
[LightGBM] [Info] Start training from score 0.408290
LightGBM Model accuracy score: 0.6026
LightGBM Model accuracy score [train]: 0.6009


# Prediction

In [132]:
def build_wins(X, y, params):
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)

    wins = X
    wins['WinRatio'] = reg.predict(X)
    wins = wins[['WinRatio']]

    return wins

In [133]:
wins_m = build_wins(X_m, y_m, params_m)
wins_w = build_wins(X_w, y_w, params_w)

display(wins_m)
display(wins_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 142506, number of used features: 3
[LightGBM] [Info] Start training from score 0.435858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 141000, number of used features: 2
[LightGBM] [Info] Start training from score 0.408371


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wins['WinRatio'] = reg.predict(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wins['WinRatio'] = reg.predict(X)


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,0.440386
1101,1103,0.428828
1101,1104,0.428828
1101,1105,0.428828
1101,1106,0.428828
...,...,...
1478,1473,0.406580
1478,1474,0.406580
1478,1475,0.406580
1478,1476,0.406580


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,0.484084
3101,3103,0.484084
3101,3104,0.484084
3101,3105,0.484084
3101,3106,0.527165
...,...,...
3478,3473,0.346085
3478,3474,0.346085
3478,3475,0.346085
3478,3476,0.346085


In [134]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    slots = slots[slots['Season'] == 2023]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [135]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [136]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [137]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [138]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, wins):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - wins (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]

        team_1_prob = wins.loc[team_1, team_2].WinRatio
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, wins, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, wins)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [139]:
n_brackets = 100000
result_m = run_simulation(seeds_2024_m, slots_m, wins_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, wins_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 100/100 [00:01<00:00, 95.24it/s]
100%|██████████| 100/100 [00:01<00:00, 95.97it/s]


In [140]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W15
2,M,1,R1W3,W14
3,M,1,R1W4,W13
4,M,1,R1W5,W05
...,...,...,...,...
12595,W,100,R4Y1,Y03
12596,W,100,R4Z1,Z04
12597,W,100,R5WX,X01
12598,W,100,R5YZ,Z04


In [141]:
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.