In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

op.logging.set_verbosity(op.logging.WARNING)

In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

## Historically based prediction

In [4]:
def build_results(gender):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))
    
    return pd.concat(csvs)

In [5]:
results_m = build_results('M')
results_w = build_results('W')

display(results_m)
display(results_w)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
185860,2024,100,1424,67,1201,65,A,0
185861,2024,100,1429,84,1461,76,A,0
185862,2024,100,1454,71,1156,68,A,0
185863,2024,100,1459,73,1273,60,A,0


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0
...,...,...,...,...,...,...,...,...
130230,2024,100,3409,74,3427,70,H,0
130231,2024,100,3424,67,3161,64,H,0
130232,2024,100,3444,65,3296,47,A,0
130233,2024,100,3457,54,3347,51,H,1


In [6]:
def winner(ids):
    id, wId, lId = ids

    return int(id == wId)

def opponent(x):
    winInt, wId, lId = x
    win = not winInt
    
    return wId if win else lId

def score_diff(x):
    winInt, wScore, lScore = x
    win = not winInt
    
    return (wScore - lScore) if win else (lScore - wScore)

def build_season_results(df):
    season_results = df
    season_results['TeamID'] = season_results[['WTeamID', 'LTeamID']].values.tolist()
    season_results = season_results.explode('TeamID')
    season_results['Win'] = season_results[['TeamID', 'WTeamID', 'LTeamID']].apply(winner, axis=1)
    season_results['Defeat'] = season_results['Win'].apply(lambda x: 1 - x)
    season_results['Games'] = season_results['Win'] + season_results['Defeat']
    season_results['ScoreDiff'] = season_results[['Win', 'WScore', 'LScore']].apply(score_diff, axis=1)
    season_results['OTeamID'] = season_results[['Win', 'WTeamID', 'LTeamID']].apply(opponent, axis=1)
    season_results['Home'] = season_results['WLoc'].apply(lambda x: int(x[0] == 'H'))
    season_results = season_results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc'], axis=1)
    season_results = season_results.groupby(by=['TeamID', 'OTeamID']).sum()
    season_results['WinRatio'] = season_results['Win'] / season_results['Games']
    season_results = season_results.drop(['Win', 'Defeat'], axis=1)

    return season_results

In [7]:
season_results_m = build_season_results(results_m)
season_results_w = build_season_results(results_w)

display(season_results_m)
display(season_results_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,NumOT,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1101,1102,0,1,-4,0,1.0
1101,1115,0,1,-8,0,1.0
1101,1116,0,2,23,2,0.0
1101,1117,0,2,-7,2,0.5
1101,1122,0,1,-8,0,1.0
...,...,...,...,...,...,...
1478,1384,0,1,-37,1,1.0
1478,1437,0,1,26,1,0.0
1478,1447,0,1,23,1,0.0
1478,1467,0,2,20,1,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,NumOT,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3101,3102,0,1,-36,1,1.0
3101,3106,0,1,-11,1,1.0
3101,3114,0,1,-5,0,1.0
3101,3116,0,2,26,1,0.0
3101,3117,0,1,-12,0,1.0
...,...,...,...,...,...,...
3478,3425,0,1,51,1,0.0
3478,3433,0,1,23,1,0.0
3478,3447,0,1,-11,1,1.0
3478,3467,0,2,-22,1,1.0


In [8]:
def build_teams(gender):
    teams = CSV["{}Teams".format(gender)]
    teams = teams.drop('TeamName', axis=1)
    teams = teams.set_index('TeamID')
    
    return teams

In [9]:
teams_m = build_teams('M')
teams_w = build_teams('W') # FIXME: Maybe useless since there is no data aside from TeamName.

display(teams_m)
display(teams_w)

Unnamed: 0_level_0,FirstD1Season,LastD1Season
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1
1101,2014,2024
1102,1985,2024
1103,1985,2024
1104,1985,2024
1105,2000,2024
...,...,...
1474,2023,2024
1475,2023,2024
1476,2023,2024
1477,2023,2024


3101
3102
3103
3104
3105
...
3474
3475
3476
3477
3478


In [10]:
def clean_seeds(seed):
    res = seed[1:]

    if len(res) > 2:
        res = res[:-1]

    return int(res)

def build_seeds(gender):
    seeds = CSV["{}NCAATourneySeeds".format(gender)] 
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)
    seeds = seeds.drop('Season', axis=1)
    seeds = seeds.groupby(by='TeamID').mean()
    
    return seeds

In [11]:
seeds_m = build_seeds('M')
seeds_w = build_seeds('W')

display(seeds_m)
display(seeds_w)

Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
1101,14.500000
1102,12.000000
1103,13.600000
1104,5.894737
1105,16.000000
...,...
1459,12.200000
1460,15.000000
1461,10.800000
1462,8.074074


Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
3101,16.000000
3103,13.000000
3104,6.000000
3106,15.333333
3107,14.285714
...,...
3458,7.000000
3460,13.333333
3461,12.500000
3462,5.888889


In [12]:
def build_rankings(gender):
    rankings = CSV["{}MasseyOrdinals".format(gender)]
    rankings = rankings.drop(['SystemName', 'RankingDayNum'], axis=1)
    rankings = rankings.groupby(by='TeamID').mean()
    rankings = rankings.drop('Season', axis=1)

    return rankings

In [13]:
rankings_m = build_rankings('M')

rankings_m

Unnamed: 0_level_0,OrdinalRank
TeamID,Unnamed: 1_level_1
1101,231.948429
1102,183.409557
1103,111.840219
1104,55.914665
1105,319.892673
...,...
1474,224.839731
1475,284.956931
1476,326.853180
1477,292.496635


In [14]:
def build_history(season_results, seeds, teams, rankings=None):
    history = season_results.join(teams, on='TeamID').join(seeds, on='TeamID')
    
    if rankings is not None:
        history = history.join(rankings, on='TeamID')
    
    return history

In [15]:
history_m = build_history(season_results_m, seeds_m, teams_m, rankings_m)
history_w = build_history(season_results_w, seeds_w, teams_w)

display(history_m)
display(history_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,NumOT,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Seed,OrdinalRank
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1101,1102,0,1,-4,0,1.0,2014,2024,14.5,231.948429
1101,1115,0,1,-8,0,1.0,2014,2024,14.5,231.948429
1101,1116,0,2,23,2,0.0,2014,2024,14.5,231.948429
1101,1117,0,2,-7,2,0.5,2014,2024,14.5,231.948429
1101,1122,0,1,-8,0,1.0,2014,2024,14.5,231.948429
...,...,...,...,...,...,...,...,...,...,...
1478,1384,0,1,-37,1,1.0,2024,2024,,322.377966
1478,1437,0,1,26,1,0.0,2024,2024,,322.377966
1478,1447,0,1,23,1,0.0,2024,2024,,322.377966
1478,1467,0,2,20,1,0.0,2024,2024,,322.377966


Unnamed: 0_level_0,Unnamed: 1_level_0,NumOT,Games,ScoreDiff,Home,WinRatio,Seed
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3101,3102,0,1,-36,1,1.0,16.0
3101,3106,0,1,-11,1,1.0,16.0
3101,3114,0,1,-5,0,1.0,16.0
3101,3116,0,2,26,1,0.0,16.0
3101,3117,0,1,-12,0,1.0,16.0
...,...,...,...,...,...,...,...
3478,3425,0,1,51,1,0.0,
3478,3433,0,1,23,1,0.0,
3478,3447,0,1,-11,1,1.0,
3478,3467,0,2,-22,1,1.0,


In [16]:
def score_dataset(lgbm_params, X, y):
    reg   = lgb.LGBMRegressor(**lgbm_params)
    score = cross_val_score(reg, X, y)
    score = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': 'cpu',
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, show_progress_bar=True)

    return study.best_params

In [17]:
def build_x_y(df):
    target_column = 'WinRatio'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [18]:
X_m, y_m = build_x_y(history_m)
X_w, y_w = build_x_y(history_w)

In [19]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [20]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    reg_test = lgb.LGBMRegressor(**params)
    reg_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(reg_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(reg_test.score(X_train, y_train)))

In [21]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007911 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 865
[LightGBM] [Info] Number of data points in the train set: 41021, number of used features: 8
[LightGBM] [Info] Start training from score 0.498742
LightGBM Model accuracy score: 0.9133
LightGBM Model accuracy score [train]: 0.9170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 495
[LightGBM] [Info] Number of data points in the train set: 30773, number of used features: 5
[LightGBM] [Info] Start training from score 0.497568
LightGBM Model accuracy score: 0.9311
LightGBM Model accuracy score [train]: 0.9325


# Prediction

In [22]:
def build_wins(X, y, params):
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)

    wins = X
    wins['WinRatio'] = reg.predict(X)
    wins = wins[['WinRatio']]

    return wins

In [23]:
wins_m = build_wins(X_m, y_m, params_m)
wins_w = build_wins(X_w, y_w, params_w)

display(wins_m)
display(wins_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 864
[LightGBM] [Info] Number of data points in the train set: 58602, number of used features: 8
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 43962, number of used features: 5
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,0.918382
1101,1115,0.918382
1101,1116,0.213628
1101,1117,0.535945
1101,1122,0.918382
...,...,...
1478,1384,0.917569
1478,1437,0.075027
1478,1447,0.075027
1478,1467,0.150782


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,0.932730
3101,3106,0.932730
3101,3114,0.932730
3101,3116,0.089506
3101,3117,0.932730
...,...,...
3478,3425,0.066036
3478,3433,0.066036
3478,3447,0.932730
3478,3467,0.908840


In [24]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    slots = slots[slots['Season'] == 2023]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [25]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [26]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [27]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [28]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, wins, debug):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - wins (DF): DF that includes wins prediction per matchup.
    - debug (dict): Debug info.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]
        winner = None
        
        try:
            team_1_prob = wins.loc[team_1, team_2].WinRatio
            
            debug['found'] += 1
            winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])
        except KeyError:
            debug['miss'] += 1
            winner = np.random.choice([team_1, team_2])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        debug['count'] += 1
        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, wins, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []
    
    debug = {
        'count': 0,
        'miss': 0,
        'found': 0
    }

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, wins, debug)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)
        
    print("Found %: {}%".format(debug['found'] * 100.0 / debug['count']))
    print("Not found %: {}%".format(debug['miss'] * 100.0 / debug['count']))
    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [29]:
n_brackets = 5000
result_m = run_simulation(seeds_2024_m, slots_m, wins_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, wins_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 5000/5000 [01:07<00:00, 73.94it/s]


Found %: 89.12984126984126%
Not found %: 10.87015873015873%


100%|██████████| 5000/5000 [01:09<00:00, 72.45it/s]


Found %: 88.10349206349207%
Not found %: 11.896507936507936%


In [30]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
629995,W,5000,R4Y1,Y06
629996,W,5000,R4Z1,Z02
629997,W,5000,R5WX,W03
629998,W,5000,R5YZ,Z02


In [31]:
submission.to_csv('submission.csv')

# Resources
- https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.