# NCAA 2024 Submission

## Summary
* チーム間の勝率を予測 -> シミュレーションでBracketsを作成しSubmissionファイルを作成
* シミュレーションは[Simulate n Brackets](https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets/notebook)参照
* 予測モデルはXGBモデルとロジスティック回帰モデルのアンサンブル
  * XGBモデルは[2023年の1st solution](https://www.kaggle.com/code/rustyb/paris-madness-2023)参照
  * ロジスティック回帰モデルは[2022年Womensの2nd solution](https://www.kaggle.com/competitions/ncaaw-march-mania-2021/discussion/230705)参照(538ratingは今年は使えないので、代わりにEloRatingを実装)

## Library

In [1]:
import numpy as np
import pandas as pd

import os

import xgboost as xgb
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

from scipy.interpolate import UnivariateSpline
import statsmodels.api as sm

import matplotlib.pyplot as plt
import collections

import itertools
from tqdm import tqdm

In [2]:
# Directory holding the competition CSV files. File names are appended to it by plain
# string concatenation, so the trailing slash must be kept.
DATA_PATH = '/kaggle/input/march-machine-learning-mania-2024/'

## Create submission file(2023 format)

In [3]:
# Seeds of both 2024 tournaments; the numeric part of the seed string (characters 1-2) is kept as `seed`.
tourney_2024 = pd.read_csv(DATA_PATH + "2024_tourney_seeds.csv")
tourney_2024['seed'] = tourney_2024['Seed'].str[1:3].astype(int)

tourney_2024_mens = tourney_2024[tourney_2024['Tournament'] == "M"]
tourney_2024_womens = tourney_2024[tourney_2024['Tournament'] == "W"]

# Every unordered pair of teams inside each tournament (men's and women's teams are never mixed).
comb_mens = pd.DataFrame(list(itertools.combinations(tourney_2024_mens['TeamID'], 2)))
comb_womens = pd.DataFrame(list(itertools.combinations(tourney_2024_womens['TeamID'], 2)))
comb_merged = pd.concat([comb_mens, comb_womens])

# T1 is always the lower TeamID of the pair, T2 the higher one.
_pair_ids = comb_merged[[0, 1]].values
comb_merged['T1_TeamID'] = _pair_ids.min(axis=1)
comb_merged['T2_TeamID'] = _pair_ids.max(axis=1)
comb_merged = (
    comb_merged
    .sort_values(['T1_TeamID', 'T2_TeamID'])[['T1_TeamID', 'T2_TeamID']]
    .reset_index(drop=True)
)

comb_merged['Season'] = 2024
comb_merged['ID'] = (
    comb_merged['Season'].astype(str)
    + '_' + comb_merged['T1_TeamID'].astype(str)
    + '_' + comb_merged['T2_TeamID'].astype(str)
)

sample_sub = comb_merged[['ID', 'Season', 'T1_TeamID', 'T2_TeamID']]
# The split below relies on the TeamID ranges used in this cell: <= 2000 for men's rows, >= 3000 for women's rows.
sample_sub_mens = sample_sub[sample_sub['T1_TeamID'] <= 2000]
sample_sub_womens = sample_sub[sample_sub['T1_TeamID'] >= 3000]

sample_sub.tail()

Unnamed: 0,ID,Season,T1_TeamID,T2_TeamID
4027,2024_3439_3453,2024,3439,3453
4028,2024_3439_3465,2024,3439,3465
4029,2024_3452_3453,2024,3452,3453
4030,2024_3452_3465,2024,3452,3465
4031,2024_3453_3465,2024,3453,3465


## Logistic Regression Model

### Load Data

In [4]:
# Training tables for the logistic-regression model (one per tournament).
train_df_mens, train_df_womens = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/ncaa-2024/train_mens.csv",
                     "/kaggle/input/ncaa-2024/train_womens.csv")
)

# Per-team, per-season feature tables used to build the 2024 prediction rows.
feature_mens, feature_womens = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/ncaa-2024/features_mens.csv",
                     "/kaggle/input/ncaa-2024/features_womens.csv")
)

### Create Feature

In [5]:
# 2024 men's team features, prefixed twice so they can be joined to either side of a matchup.
_mens_2024 = feature_mens[feature_mens['Season'] == 2024][['TeamID', 'Seed', 'Rating', 'win_rate', 'gap_avg']]
feature_mens_t1 = _mens_2024.add_prefix('T1_')
feature_mens_t2 = _mens_2024.add_prefix('T2_')

pred_lr_mens = (
    sample_sub_mens
    .merge(feature_mens_t1, on='T1_TeamID', how='left')
    .merge(feature_mens_t2, on='T2_TeamID', how='left')
)

# Model inputs are T1-minus-T2 differences of each team-level feature.
for _diff_col, _t1_col, _t2_col in (
    ('diff_seed', 'T1_Seed', 'T2_Seed'),
    ('diff_rating', 'T1_Rating', 'T2_Rating'),
    ('diff_win_rate', 'T1_win_rate', 'T2_win_rate'),
    ('diff_gap_avg', 'T1_gap_avg', 'T2_gap_avg'),
):
    pred_lr_mens[_diff_col] = pred_lr_mens[_t1_col] - pred_lr_mens[_t2_col]

In [6]:
# 2024 women's team features, prefixed twice so they can be joined to either side of a matchup.
_womens_2024 = feature_womens[feature_womens['Season'] == 2024][['TeamID', 'Seed', 'Rating', 'win_rate', 'gap_avg']]
feature_womens_t1 = _womens_2024.add_prefix('T1_')
feature_womens_t2 = _womens_2024.add_prefix('T2_')

pred_lr_womens = (
    sample_sub_womens
    .merge(feature_womens_t1, on='T1_TeamID', how='left')
    .merge(feature_womens_t2, on='T2_TeamID', how='left')
)

# Model inputs are T1-minus-T2 differences of each team-level feature.
for _diff_col, _t1_col, _t2_col in (
    ('diff_seed', 'T1_Seed', 'T2_Seed'),
    ('diff_rating', 'T1_Rating', 'T2_Rating'),
    ('diff_win_rate', 'T1_win_rate', 'T2_win_rate'),
    ('diff_gap_avg', 'T1_gap_avg', 'T2_gap_avg'),
):
    pred_lr_womens[_diff_col] = pred_lr_womens[_t1_col] - pred_lr_womens[_t2_col]

### Modeling

#### fit

In [7]:
# Difference features fed to the logistic regression, and the name of the label column.
feat_cols = ['diff_seed', 'diff_rating', 'diff_win_rate', 'diff_gap_avg']
target_col = "target"

In [8]:
# One logistic regression per tournament, both with scikit-learn's default settings.
X_mens, y_mens = train_df_mens[feat_cols], train_df_mens[target_col]
X_womens, y_womens = train_df_womens[feat_cols], train_df_womens[target_col]

lr_mens = LogisticRegression().fit(X_mens, y_mens)
lr_womens = LogisticRegression()
lr_womens.fit(X_womens, y_womens)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

#### predict

In [9]:
# Probability of the positive class (column 1 of predict_proba) for every 2024 matchup.
X_test_mens, X_test_womens = pred_lr_mens[feat_cols], pred_lr_womens[feat_cols]

pred_lr_mens["Pred_lr"] = lr_mens.predict_proba(X_test_mens)[:, 1]
pred_lr_womens["Pred_lr"] = lr_womens.predict_proba(X_test_womens)[:, 1]

pred_lr = pd.concat([pred_lr_mens, pred_lr_womens])

## XGB model

### Load Data

In [10]:
def _concat_csvs(*file_names):
    """Read the given files from DATA_PATH and stack them into one frame with a fresh index."""
    return pd.concat(
        [pd.read_csv(DATA_PATH + file_name) for file_name in file_names],
        ignore_index=True,
    )


# Men's and women's files are stacked into a single table for each data set.
tourney_results = _concat_csvs("MNCAATourneyDetailedResults.csv", "WNCAATourneyDetailedResults.csv")
seeds = _concat_csvs("MNCAATourneySeeds.csv", "WNCAATourneySeeds.csv")
regular_results = _concat_csvs("MRegularSeasonDetailedResults.csv", "WRegularSeasonDetailedResults.csv")

### Create Feature

In [11]:
def prepare_data(df):
    """Turn winner/loser game rows into a symmetric T1/T2 table.

    Every game appears twice in the result: once with the winner as T1 and the
    loser as T2, and once with the roles swapped (the location is flipped for
    the swapped copy, so home becomes away and vice versa).

    Unlike the previous implementation, the input frame is left untouched, so
    the function can safely be called more than once on the same frame.

    Parameters:
    - df (pd.DataFrame): detailed results with 'Season', 'DayNum', 'WLoc',
      'NumOT' and the 'W*' / 'L*' team-id, score and box-score columns listed
      in `swapped_order` below.

    Returns:
    - pd.DataFrame: 2 * len(df) rows with 'T1_*' / 'T2_*' columns, an integer
      'location' column (1 = T1 at home, -1 = T1 away, 0 = neutral) and
      'PointDiff' = T1_Score - T2_Score.

    NOTE(review): a second, unrelated `prepare_data(seeds, preds)` is defined
    later in this notebook and shadows this one once its cell has run.
    """
    # Column order of the swapped copy: loser columns take the positions the
    # winner columns have in the raw file, so both halves line up after renaming.
    swapped_order = [
        'Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT',
        'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
        'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
    ]

    # 'WLoc' is renamed first so the W/L -> T1_/T2_ substitution below cannot touch it.
    winner_view = df.rename(columns={'WLoc': 'location'})
    winner_view.columns = [c.replace('W', 'T1_').replace('L', 'T2_') for c in winner_view.columns]

    loser_view = df[swapped_order].copy()
    flipped = {'H': 'A', 'A': 'H'}
    # From the loser's point of view a home win is an away game and vice versa.
    loser_view['WLoc'] = df['WLoc'].map(lambda loc: flipped.get(loc, loc))
    loser_view = loser_view.rename(columns={'WLoc': 'location'})
    loser_view.columns = [c.replace('L', 'T1_').replace('W', 'T2_') for c in loser_view.columns]

    output = pd.concat([winner_view, loser_view]).reset_index(drop=True)
    output['location'] = output['location'].map({'N': 0, 'H': 1, 'A': -1}).astype(int)
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']

    return output

In [12]:
# Symmetric T1/T2 views of the regular-season and tournament games.
regular_data, tourney_data = (
    prepare_data(results) for results in (regular_results, tourney_results)
)

In [13]:
# Columns aggregated per team-season: the team's own box-score stats (T1_*), the stats of
# its opponents (T2_*) and the scoring margin.
# NOTE(review): the two sides are not symmetric (T1 has PF but no Blk, T2 has Blk but no PF) — confirm this is intended.
boxscore_cols = [
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_OR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_PF', 
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_OR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk',  
        'PointDiff']

# Aggregations applied to each of the columns above (only the mean).
funcs = [np.mean]

In [14]:
# Per (Season, team) averages of the box-score columns over the symmetric regular-season table.
season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs).reset_index()
# Flatten the (column, aggregation) MultiIndex into single names such as 'T1_FGMmean'.
season_statistics.columns = [''.join(col).strip() for col in season_statistics.columns.values]

# Own stats lose their 'T1_' prefix and opponent stats become 'opponent_*'; the side prefix
# is then added back. 'Season' (first column) is the join key and must stay unprefixed, so
# the new labels are built explicitly instead of patching `columns.values` in place.
_stat_names = [
    name.replace("T1_", "").replace("T2_", "opponent_")
    for name in season_statistics.columns[1:]
]

season_statistics_T1 = season_statistics.copy()
season_statistics_T2 = season_statistics.copy()
season_statistics_T1.columns = ["Season"] + ["T1_" + name for name in _stat_names]
season_statistics_T2.columns = ["Season"] + ["T2_" + name for name in _stat_names]

In [15]:
# Win ratio over the games played after day 118, computed once from each side of the matchup.
_late_games = regular_data[regular_data['DayNum'] > 118].reset_index(drop=True)

last14days_stats_T1 = (
    _late_games
    .assign(win=(_late_games['PointDiff'] > 0).astype(int))
    .groupby(['Season', 'T1_TeamID'])['win']
    .mean()
    .reset_index(name='T1_win_ratio_14d')
)

# From T2's point of view a negative point difference is a win.
last14days_stats_T2 = (
    _late_games
    .assign(win=(_late_games['PointDiff'] < 0).astype(int))
    .groupby(['Season', 'T2_TeamID'])['win']
    .mean()
    .reset_index(name='T2_win_ratio_14d')
)

In [16]:
# Regular-season games with team ids as strings (they are used as categorical terms in the GLM formula).
_id_cols = ['T1_TeamID', 'T2_TeamID']
regular_season_effects = regular_data[['Season', 'T1_TeamID', 'T2_TeamID', 'PointDiff']].copy()
for _id_col in _id_cols:
    regular_season_effects[_id_col] = regular_season_effects[_id_col].astype(str)
regular_season_effects['win'] = (regular_season_effects['PointDiff'] > 0).astype(int)

# All ordered pairs of seeded teams within a season.
_seeded = seeds[['Season', 'TeamID']]
march_madness = _seeded.merge(_seeded, on='Season')
march_madness.columns = ['Season', 'T1_TeamID', 'T2_TeamID']
for _id_col in _id_cols:
    march_madness[_id_col] = march_madness[_id_col].astype(str)

# Keep only the games in which both teams are seeded that season.
regular_season_effects = regular_season_effects.merge(march_madness, on=['Season', 'T1_TeamID', 'T2_TeamID'])

def team_quality(season):
    """Fit a binomial GLM of `win` on the T1 and T2 team ids for one season.

    Parameters:
    - season (int): season whose rows of `regular_season_effects` are used.

    Returns:
    - pd.DataFrame: columns 'TeamID' (int), 'quality' (the T1 coefficient of
      that team) and 'Season'.

    NOTE(review): the 2024 qualities shown in this notebook's output are of
    the order 1e15, which suggests the fit may not be converging — confirm.
    """
    season_games = regular_season_effects.loc[regular_season_effects.Season == season, :]
    fitted = sm.GLM.from_formula(
        formula='win~-1+T1_TeamID+T2_TeamID',
        data=season_games,
        family=sm.families.Binomial(),
    ).fit()

    quality = pd.DataFrame({'TeamID': fitted.params.index, 'quality': fitted.params.values})
    quality = quality[['TeamID', 'quality']]
    quality['Season'] = season

    # Only the T1 terms are kept; their labels look like 'T1_TeamID[1101]',
    # so characters 10-13 hold the four-digit team id.
    quality = quality[quality['TeamID'].str.contains('T1_')].reset_index(drop=True)
    quality['TeamID'] = quality['TeamID'].str[10:14].astype(int)
    return quality

# Team quality for every season from 2010 to 2024; 2020 is skipped, as in the original list.
_quality_seasons = [season for season in range(2010, 2025) if season != 2020]
glm_quality = pd.concat(
    [team_quality(season) for season in _quality_seasons]
).reset_index(drop=True)

# One copy per side of the matchup, with side-specific column names.
glm_quality_T1 = glm_quality.rename(columns={'TeamID': 'T1_TeamID', 'quality': 'T1_quality'})
glm_quality_T2 = glm_quality.rename(columns={'TeamID': 'T2_TeamID', 'quality': 'T2_quality'})

  n_endog_mu = self._clean((1. - endog) / (1. - mu))
  t = np.exp(-z)
  endog_mu = self._clean(endog / mu)
  endog_mu = self._clean(endog / mu)
  n_endog_mu = self._clean((1. - endog) / (1. - mu))


In [17]:
# Numeric seed = characters 1-2 of the seed string (the leading region letter is dropped).
seeds['seed'] = seeds['Seed'].str[1:3].astype(int)

# One copy per side of the matchup, with side-specific column names.
_seed_cols = seeds[['Season', 'TeamID', 'seed']]
seeds_T1 = _seed_cols.rename(columns={'TeamID': 'T1_TeamID', 'seed': 'T1_seed'})
seeds_T2 = _seed_cols.rename(columns={'TeamID': 'T2_TeamID', 'seed': 'T2_seed'})

In [18]:
# Training table: one row per (symmetric) tournament game, enriched with team-level features.
tourney_data = tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID', 'T2_Score']]

# Left-join each feature table on the season and the matching side's team id.
for _feature_table, _team_key in (
    (season_statistics_T1, 'T1_TeamID'),
    (season_statistics_T2, 'T2_TeamID'),
    (last14days_stats_T1, 'T1_TeamID'),
    (last14days_stats_T2, 'T2_TeamID'),
    (glm_quality_T1, 'T1_TeamID'),
    (glm_quality_T2, 'T2_TeamID'),
    (seeds_T1, 'T1_TeamID'),
    (seeds_T2, 'T2_TeamID'),
):
    tourney_data = tourney_data.merge(_feature_table, on=['Season', _team_key], how='left')

tourney_data["Seed_diff"] = tourney_data["T1_seed"] - tourney_data["T2_seed"]
tourney_data["Score_diff"] = tourney_data["T1_Score"] - tourney_data["T2_Score"]

In [19]:
# Prediction table: every possible 2024 matchup with the same team-level features as the training table.
pred_xgb = sample_sub
for _feature_table, _team_key in (
    (season_statistics_T1, 'T1_TeamID'),
    (season_statistics_T2, 'T2_TeamID'),
    (glm_quality_T1, 'T1_TeamID'),
    (glm_quality_T2, 'T2_TeamID'),
    (seeds_T1, 'T1_TeamID'),
    (seeds_T2, 'T2_TeamID'),
    (last14days_stats_T1, 'T1_TeamID'),
    (last14days_stats_T2, 'T2_TeamID'),
):
    pred_xgb = pred_xgb.merge(_feature_table, on=['Season', _team_key], how='left')

pred_xgb["Seed_diff"] = pred_xgb["T1_seed"] - pred_xgb["T2_seed"]

### Modeling

#### fit

In [20]:
# Regression target: score margin of each tournament game from T1's point of view.
y = tourney_data['T1_Score'].sub(tourney_data['T2_Score'])
y.describe()

count    4284.000000
mean        0.000000
std        17.442357
min       -89.000000
25%       -11.000000
50%         0.000000
75%        11.000000
max        89.000000
dtype: float64

In [21]:
# Model inputs: every column after the two join keys of each feature table, plus the derived ones.
_feature_tables = (
    season_statistics_T1,
    season_statistics_T2,
    seeds_T1,
    seeds_T2,
    last14days_stats_T1,
    last14days_stats_T2,
)
features = [column for table in _feature_tables for column in table.columns[2:]]
features += ["Seed_diff", "T1_quality", "T2_quality"]

len(features)

45

In [22]:
# Training matrix (columns in `features` order) wrapped together with the score-margin labels.
X = tourney_data[features].values
dtrain = xgb.DMatrix(data=X, label=y)

In [23]:
def cauchyobj(preds, dtrain):
    """Custom XGBoost objective: gradient and hessian of a Cauchy loss with scale 5000.

    Parameters:
    - preds: current predictions.
    - dtrain: object exposing `get_label()` with the true values.

    Returns:
    - tuple: (grad, hess), element-wise first and second derivatives of the
      loss with respect to the predictions.
    """
    scale = 5000
    residual = preds - dtrain.get_label()
    grad = residual / (residual**2 / scale**2 + 1)
    hess = -scale**2 * (residual**2 - scale**2) / (residual**2 + scale**2)**2
    return grad, hess

In [24]:
# Booster settings shared by every xgb.cv / xgb.train call below. No 'objective' is set
# here; the cross-validation call passes the custom `cauchyobj` instead.
# NOTE(review): 'silent' is presumably only recognised by older XGBoost releases — confirm for the installed version.
param = {
    'eval_metric': 'mae',
    'booster': 'gbtree',
    'eta': 0.02,
    'subsample': 0.35,
    'colsample_bytree': 0.7,
    'num_parallel_tree': 10,
    'min_child_weight': 40,
    'gamma': 10,
    'max_depth': 3,
    'silent': 1,
}

print(param)

{'eval_metric': 'mae', 'booster': 'gbtree', 'eta': 0.02, 'subsample': 0.35, 'colsample_bytree': 0.7, 'num_parallel_tree': 10, 'min_child_weight': 40, 'gamma': 10, 'max_depth': 3, 'silent': 1}


In [25]:
# Repeated 5-fold cross-validation; each repeat shuffles the folds with a different seed.
xgb_cv = []
repeat_cv = 10 # recommend 10

for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    fold_splitter = KFold(n_splits=5, shuffle=True, random_state=i)
    cv_history = xgb.cv(
        params=param,
        dtrain=dtrain,
        obj=cauchyobj,
        num_boost_round=3000,
        folds=fold_splitter,
        early_stopping_rounds=25,
        verbose_eval=50,
    )
    xgb_cv.append(cv_history)

Fold repeater 0
[0]	train-mae:13.5169+0.0211428	test-mae:13.5187+0.0871531
[50]	train-mae:10.425+0.0436016	test-mae:10.5398+0.169861
[100]	train-mae:9.64251+0.0573294	test-mae:9.86613+0.205629
[150]	train-mae:9.35185+0.0613288	test-mae:9.67568+0.217672
[200]	train-mae:9.18349+0.0614544	test-mae:9.60045+0.224254
[250]	train-mae:9.05733+0.0617206	test-mae:9.5641+0.222099
[300]	train-mae:8.94805+0.0637345	test-mae:9.54415+0.220025
[350]	train-mae:8.84952+0.0654957	test-mae:9.53297+0.21965
[400]	train-mae:8.7593+0.0662267	test-mae:9.52661+0.220383
[450]	train-mae:8.67176+0.0674157	test-mae:9.52188+0.220214
[500]	train-mae:8.58891+0.0682832	test-mae:9.51686+0.223219
[550]	train-mae:8.51005+0.0679349	test-mae:9.51302+0.224102
[600]	train-mae:8.43496+0.0679175	test-mae:9.51051+0.222907
[650]	train-mae:8.36204+0.0672853	test-mae:9.50887+0.225985
Fold repeater 1
[0]	train-mae:13.5153+0.0460636	test-mae:13.5162+0.183217
[50]	train-mae:10.4224+0.0433368	test-mae:10.542+0.210342
[100]	train-mae:9.

In [26]:
# Per repeat: position of the lowest mean validation MAE in the CV history, and that MAE.
# NOTE(review): argmin is a 0-based position, so the matching number of rounds is presumably position + 1 — confirm.
_mae_curves = [history['test-mae-mean'].values for history in xgb_cv]
iteration_counts = [np.argmin(curve) for curve in _mae_curves]
val_mae = [np.min(curve) for curve in _mae_curves]
iteration_counts, val_mae

([651, 530, 601, 545, 509, 548, 588, 567, 481, 394],
 [9.508835000000001,
  9.5474056,
  9.5145176,
  9.549838,
  9.5087038,
  9.5313616,
  9.4965436,
  9.481563,
  9.5719914,
  9.5301718])

In [27]:
# Out-of-fold score-margin predictions for every CV repeat, using the same folds as xgb.cv.
oof_preds = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    # Float buffer for the predictions (the target itself holds integer margins).
    preds = pd.Series(np.zeros(len(y)), index=y.index)
    kfold = KFold(n_splits = 5, shuffle = True, random_state = i)
    for train_index, val_index in kfold.split(X, y):
        # KFold yields positions, hence the positional .iloc access on the labels.
        dtrain_i = xgb.DMatrix(X[train_index], label = y.iloc[train_index])
        dval_i = xgb.DMatrix(X[val_index], label = y.iloc[val_index])
        model = xgb.train(
              params = param,
              dtrain = dtrain_i,
              # Same custom objective as in xgb.cv, so that the round counts selected
              # there are applied to models trained under the same loss.
              obj = cauchyobj,
              num_boost_round = iteration_counts[i],
              verbose_eval = 50
        )
        preds.iloc[val_index] = model.predict(dval_i)
    # Margins are clipped to [-30, 30] before the spline calibration.
    oof_preds.append(np.clip(preds, -30, 30))

Fold repeater 0
Fold repeater 1
Fold repeater 2
Fold repeater 3
Fold repeater 4
Fold repeater 5
Fold repeater 6
Fold repeater 7
Fold repeater 8
Fold repeater 9


In [28]:
# One spline per CV repeat, mapping a predicted score margin to a win probability.
val_cv = []
spline_model = []

_t1_won = np.where(y > 0, 1, 0)
for i in range(repeat_cv):
    # Sort by predicted margin; dict() keeps one outcome (the last seen) per distinct prediction.
    datdict = dict(sorted(zip(oof_preds[i], _t1_won), key=lambda pair: pair[0]))
    spline_model.append(UnivariateSpline(list(datdict.keys()), list(datdict.values())))
    # Calibrated out-of-fold probabilities, clipped away from 0 and 1.
    spline_fit = np.clip(spline_model[i](oof_preds[i]), 0.025, 0.975)

#### predict

In [29]:
# Feature matrix for every possible 2024 matchup, columns in `features` order.
Xsub = pred_xgb[features].values
dtest = xgb.DMatrix(data=Xsub)

In [30]:
# Final models: one per CV repeat, trained on all tournament games with 5% more rounds
# than the count selected by that repeat's cross-validation.
sub_models = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    sub_models.append(
        xgb.train(
          params = param,
          dtrain = dtrain,
          # Same custom objective as in xgb.cv, so the selected round counts and the
          # spline calibration apply to models trained under the same loss.
          obj = cauchyobj,
          num_boost_round = int(iteration_counts[i] * 1.05),
          verbose_eval = 50
        )
    )

Fold repeater 0
Fold repeater 1
Fold repeater 2
Fold repeater 3
Fold repeater 4
Fold repeater 5
Fold repeater 6
Fold repeater 7
Fold repeater 8
Fold repeater 9


In [31]:
# Each repeat: predicted margin (clipped to [-30, 30]) -> that repeat's spline -> probability
# (clipped to [0.025, 0.975]). The final XGB prediction is the mean over the repeats.
sub_preds = []
for i in range(repeat_cv):
    clipped_margin = np.clip(sub_models[i].predict(dtest), -30, 30)
    sub_preds.append(np.clip(spline_model[i](clipped_margin), 0.025, 0.975))

pred_xgb["Pred_xgb"] = pd.DataFrame(sub_preds).mean(axis=0)

## Ensemble

In [32]:
# Show the first logistic-regression prediction rows before blending.
pred_lr.head()

Unnamed: 0,ID,Season,T1_TeamID,T2_TeamID,T1_Seed,T1_Rating,T1_win_rate,T1_gap_avg,T2_Seed,T2_Rating,T2_win_rate,T2_gap_avg,diff_seed,diff_rating,diff_win_rate,diff_gap_avg,Pred_lr
0,2024_1103_1104,2024,1103,1104,14.0,53.536134,0.6875,5.53125,4.0,61.954087,0.65625,9.6875,10.0,-8.417953,0.03125,-4.15625,0.183548
1,2024_1103_1112,2024,1103,1112,14.0,53.536134,0.6875,5.53125,2.0,62.355458,0.757576,15.69697,12.0,-8.819323,-0.070076,-10.16572,0.142055
2,2024_1103_1120,2024,1103,1120,14.0,53.536134,0.6875,5.53125,4.0,63.285762,0.794118,15.294118,10.0,-9.749627,-0.106618,-9.762868,0.124682
3,2024_1103_1124,2024,1103,1124,14.0,53.536134,0.6875,5.53125,3.0,62.778918,0.6875,8.8125,11.0,-9.242784,0.0,-3.28125,0.175248
4,2024_1103_1129,2024,1103,1129,14.0,53.536134,0.6875,5.53125,10.0,58.720829,0.666667,6.4,4.0,-5.184694,0.020833,-0.86875,0.296161


In [33]:
# Show the first XGB prediction rows (features plus 'Pred_xgb') before blending.
pred_xgb.head()

Unnamed: 0,ID,Season,T1_TeamID,T2_TeamID,T1_FGMmean,T1_FGAmean,T1_FGM3mean,T1_FGA3mean,T1_ORmean,T1_Astmean,T1_TOmean,T1_Stlmean,T1_PFmean,T1_opponent_FGMmean,T1_opponent_FGAmean,T1_opponent_FGM3mean,T1_opponent_FGA3mean,T1_opponent_ORmean,T1_opponent_Astmean,T1_opponent_TOmean,T1_opponent_Stlmean,T1_opponent_Blkmean,T1_PointDiffmean,T2_FGMmean,T2_FGAmean,T2_FGM3mean,T2_FGA3mean,T2_ORmean,T2_Astmean,T2_TOmean,T2_Stlmean,T2_PFmean,T2_opponent_FGMmean,T2_opponent_FGAmean,T2_opponent_FGM3mean,T2_opponent_FGA3mean,T2_opponent_ORmean,T2_opponent_Astmean,T2_opponent_TOmean,T2_opponent_Stlmean,T2_opponent_Blkmean,T2_PointDiffmean,T1_quality,T2_quality,T1_seed,T2_seed,T1_win_ratio_14d,T2_win_ratio_14d,Seed_diff,Pred_xgb
0,2024_1103_1104,2024,1103,1104,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,10.8125,5.34375,16.96875,24.53125,56.9375,6.28125,20.9375,7.46875,11.125,10.3125,6.0,2.5625,5.53125,30.90625,64.75,11.0625,30.28125,11.03125,15.90625,11.8125,7.25,19.875,27.625,62.6875,7.34375,23.03125,9.6875,12.40625,11.28125,7.40625,4.3125,9.6875,3920893000000000.0,4691567000000000.0,14,4,0.6,0.333333,10,0.203458
1,2024_1103_1112,2024,1103,1112,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,10.8125,5.34375,16.96875,24.53125,56.9375,6.28125,20.9375,7.46875,11.125,10.3125,6.0,2.5625,5.53125,31.545455,64.515152,7.818182,21.060606,11.545455,18.575758,11.757576,8.272727,16.333333,26.424242,62.484848,7.969697,23.848485,7.151515,13.909091,12.848485,7.030303,3.757576,15.69697,3920893000000000.0,6270864000000000.0,14,2,0.6,0.5,12,0.025055
2,2024_1103_1120,2024,1103,1120,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,10.8125,5.34375,16.96875,24.53125,56.9375,6.28125,20.9375,7.46875,11.125,10.3125,6.0,2.5625,5.53125,28.911765,60.823529,8.029412,22.794118,9.794118,17.911765,10.147059,7.411765,19.205882,22.352941,58.117647,5.764706,19.352941,10.029412,9.558824,12.0,6.558824,3.235294,15.294118,3920893000000000.0,5584455000000000.0,14,4,0.6,1.0,10,0.148093
3,2024_1103_1124,2024,1103,1124,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,10.8125,5.34375,16.96875,24.53125,56.9375,6.28125,20.9375,7.46875,11.125,10.3125,6.0,2.5625,5.53125,27.25,56.75,8.5625,21.75,9.40625,14.6875,11.375,6.375,16.8125,25.8125,56.90625,6.78125,20.21875,7.5625,13.65625,10.90625,7.0,3.4375,8.8125,3920893000000000.0,4903973000000000.0,14,3,0.6,0.5,11,0.148767
4,2024_1103_1129,2024,1103,1129,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,10.8125,5.34375,16.96875,24.53125,56.9375,6.28125,20.9375,7.46875,11.125,10.3125,6.0,2.5625,5.53125,25.533333,57.133333,7.933333,22.766667,9.533333,11.966667,10.4,5.766667,16.833333,24.266667,54.933333,6.2,20.266667,6.433333,11.533333,10.633333,5.733333,3.9,6.4,3920893000000000.0,6012250000000000.0,14,10,0.6,0.333333,4,0.329735


In [34]:
# Blend: 20% logistic regression + 80% XGB, joined on the matchup ID.
preds = pd.merge(pred_lr.copy(), pred_xgb[['ID', 'Pred_xgb']], on="ID", how="left")
preds['Pred'] = 0.2*preds['Pred_lr'] + 0.8*preds['Pred_xgb']

# Manual override: team 1163 (Connecticut) and team 3376 (South Carolina) are forced to win
# every matchup they appear in (probability 1 as T1, 0 as T2).
for _forced_winner in (1163, 3376):
    preds['Pred'] = preds['Pred'].mask(preds['T1_TeamID'] == _forced_winner, 1)
    preds['Pred'] = preds['Pred'].mask(preds['T2_TeamID'] == _forced_winner, 0)

preds = preds[['ID', 'Pred']]
preds.tail()

Unnamed: 0,ID,Pred
4027,2024_3439_3453,0.642006
4028,2024_3439_3465,0.93726
4029,2024_3452_3453,0.61803
4030,2024_3452_3465,0.887086
4031,2024_3453_3465,0.811165


## Simulate

https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets/notebook

In [35]:
# 2024 bracket slots whose name contains 'R' (presumably the regular rounds, without play-in slots — confirm).
# NOTE(review): the men's slot file is also used for the women's simulation further down — confirm the
# two brackets share the same structure.
round_slots = pd.read_csv(DATA_PATH + 'MNCAATourneySlots.csv')
round_slots = round_slots[round_slots['Season'] == 2024]
round_slots = round_slots[round_slots['Slot'].str.contains('R')]

# NOTE: this rebinds `seeds`, which held the historical seeds used by the feature cells above;
# those cells need the loading cell re-run before they can be executed again.
seeds = pd.read_csv(DATA_PATH + '2024_tourney_seeds.csv')
seeds_m = seeds[seeds['Tournament'] == 'M']
seeds_w = seeds[seeds['Tournament'] == 'W']

# Split 'Season_T1_T2' ids into [season, team1, team2] lists. Values that are already split
# are left alone, so re-running this cell no longer turns the ids into NaN, and assign()
# avoids writing into a column-slice of the ensemble frame.
preds = preds.assign(
    ID=preds['ID'].map(lambda match_id: match_id.split('_') if isinstance(match_id, str) else match_id)
)

In [36]:
def prepare_data(seeds, preds):
    """Build the lookup tables used by the bracket simulation.

    NOTE(review): an unrelated `prepare_data(df)` is defined earlier in this
    notebook; this definition shadows it once this cell has run.

    Parameters:
    - seeds (pd.DataFrame): needs the columns 'Seed' and 'TeamID'.
    - preds (pd.DataFrame): needs 'ID' (a sequence whose items 1 and 2 are the
      two team ids) and 'Pred' (probability that the first team wins).

    Returns:
    - dict: seed -> team id.
    - dict: team id -> seed.
    - dict: probas[team_a][team_b] = probability that team_a beats team_b,
      keyed by the team ids exactly as they appear in 'ID'.
    """
    seed_dict = dict(zip(seeds['Seed'], seeds['TeamID']))
    inverted_seed_dict = {team_id: seed for seed, team_id in seed_dict.items()}

    probas_dict = {}
    for id_parts, proba in zip(preds['ID'], preds['Pred']):
        first, second = id_parts[1], id_parts[2]
        # Store both directions so the simulation can look up either ordering.
        probas_dict.setdefault(first, {})[second] = proba
        probas_dict.setdefault(second, {})[first] = 1 - proba

    return seed_dict, inverted_seed_dict, probas_dict


def simulate(round_slots, seeds, inverted_seeds, probas, random_values, sim=True):
    '''
    Plays every slot of the bracket once, in the order given by round_slots.

    The winner of each slot is written back into `seeds` under the slot name,
    so later rounds can refer to earlier slots as their Strong/Weak seed.

    Parameters:
    - round_slots: DataFrame with the columns Slot, StrongSeed and WeakSeed.
    - seeds (dict): Maps seed (and, once played, slot) names to team IDs. Updated in place.
    - inverted_seeds (dict): Maps team IDs back to their original seed.
    - probas (dict): probas[str(a)][str(b)] is the probability that team a beats team b.
    - random_values (array-like): One precomputed random number per slot.
    - sim (boolean): If True the winner is drawn at random, otherwise the more likely team wins.

    Returns:
    - list: Original seed of the winner of each slot.
    - list: Slot names, in the same order.
    '''
    winners, slots = [], []

    games = zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed, random_values)
    for slot, strong, weak, random_val in games:
        team1 = seeds[strong]
        team2 = seeds[weak]

        # Probability that team1 beats team2
        proba = probas[str(team1)][str(team2)]

        if not sim:
            # Deterministic mode: the team with the higher probability advances
            winner = (team1, team2)[np.argmax([proba, 1-proba])]
        elif random_val < proba:
            winner = team1
        else:
            winner = team2

        # Make the winner available to the following rounds under the slot name
        seeds[slot] = winner
        winners.append(winner)
        slots.append(slot)

    # Report winners by their original seed rather than by team ID
    return [inverted_seeds[w] for w in winners], slots


def run_simulation(brackets=1, seeds=None, preds=None, round_slots=None, sim=True, random_state=None):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - sim (boolean): Simulates matches if True. Chooses team with higher probability as winner otherwise.
    - random_state (int or None): Seed for the random draws. With the default None the draws
      come from NumPy's global random state, exactly as before; passing an integer makes the
      simulated brackets reproducible.

    Returns:
    - pd.DataFrame: One row per simulated game with the columns 'Bracket' (1-based bracket
      number), 'Slot' and 'Team' (original seed of the winner).
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict, probas_dict = prepare_data(seeds, preds)

    # Precompute one random value per (bracket, game)
    shape = (brackets, len(round_slots))
    if random_state is None:
        random_values = np.random.random(size=shape)
    else:
        random_values = np.random.RandomState(random_state).random_sample(size=shape)

    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets+1)):
        # simulate() records slot winners inside the seed mapping it receives, so every
        # bracket gets its own copy and cannot see winners left over from the previous one.
        r, s = simulate(round_slots, dict(seed_dict), inverted_seed_dict, probas_dict, random_values[b-1], sim)

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [37]:
# Simulate the same number of brackets for each tournament and stack them, men's first.
n_brackets = 100000

result_m = run_simulation(brackets=n_brackets, seeds=seeds_m, preds=preds, round_slots=round_slots, sim=True)
result_m['Tournament'] = 'M'

result_w = run_simulation(brackets=n_brackets, seeds=seeds_w, preds=preds, round_slots=round_slots, sim=True)
result_w['Tournament'] = 'W'

submission = pd.concat([result_m, result_w], ignore_index=True)
submission.index.name = 'RowId'

100%|██████████| 100000/100000 [00:17<00:00, 5650.34it/s]
100%|██████████| 100000/100000 [00:18<00:00, 5474.68it/s]


In [38]:
# Men's tournament: number of simulated brackets in which each seed wins slot R6CH (top 20 seeds).
submission.query('Slot == "R6CH" & Tournament == "M"').groupby('Team').count().sort_values('Bracket', ascending=False).head(20)

Unnamed: 0_level_0,Bracket,Slot,Tournament
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
W01,100000,100000,100000


In [39]:
# Women's tournament: number of simulated brackets in which each seed wins slot R6CH (top 20 seeds).
submission.query('Slot == "R6CH" & Tournament == "W"').groupby('Team').count().sort_values('Bracket', ascending=False).head(20)

Unnamed: 0_level_0,Bracket,Slot,Tournament
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
W01,100000,100000,100000


In [40]:
# Write all simulated brackets to disk; the index is written as the first CSV column.
submission.to_csv('submission.csv')
# Display the frame as the cell output.
submission

Unnamed: 0_level_0,Bracket,Slot,Team,Tournament
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,R1W1,W01,M
1,1,R1W2,W02,M
2,1,R1W3,W03,M
3,1,R1W4,W04,M
4,1,R1W5,W05,M
5,1,R1W6,W11,M
6,1,R1W7,W07,M
7,1,R1W8,W09,M
8,1,R1X1,X01,M
9,1,R1X2,X02,M
