<a href="https://www.kaggle.com/code/tcordeu/march-madness-2024?scriptVersionId=167012350" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.stats import linregress
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

op.logging.set_verbosity(op.logging.WARNING)



In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

In [4]:
def device():
    from tensorflow.python.client import device_lib

    return 'gpu' if len(list(filter(lambda x: x.device_type == 'GPU', device_lib.list_local_devices()))) > 0 else 'cpu'

In [5]:
print("Device: {}".format(device()))

2024-03-14 15:22:02.368003: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 15:22:02.368095: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 15:22:02.484044: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: gpu


# Build DF

## Feature Engineering

In [6]:
def build_results(gender):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))
    
    return pd.concat(csvs)

In [7]:
results_m = build_results('M')
results_w = build_results('W')

display(results_m)
display(results_w)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
186547,2024,114,1454,75,1237,70,A,0
186548,2024,114,1455,74,1412,66,A,0
186549,2024,114,1459,91,1359,69,H,0
186550,2024,114,1462,91,1177,58,H,0


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0
...,...,...,...,...,...,...,...,...
130890,2024,114,3409,76,3396,67,A,0
130891,2024,114,3424,63,3129,57,H,0
130892,2024,114,3433,69,3348,59,A,0
130893,2024,114,3453,70,3236,61,A,0


In [8]:
def build_pythagorean_expectation(results):
    py = results[['Season', 'WTeamID', 'LTeamID', 'WScore', 'LScore']].copy()
    py = py.rename(columns={'WTeamID': 'TeamID', 'LTeamID': 'OTeamID', 'WScore': 'Score', 'LScore': 'OScore'})
    py = py.groupby(['Season', 'TeamID', 'OTeamID']).sum()
    py['PyWin'] = (py['Score'] ** 13.91) / (py['Score'] ** 13.91 + py['OScore'] ** 13.91)
    py = py[['PyWin']].groupby(['TeamID', 'OTeamID']).mean()
    
    return py

In [9]:
py_m = build_pythagorean_expectation(results_m)
py_w = build_pythagorean_expectation(results_w)

display(py_m)
display(py_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,PyWin
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,0.716604
1101,1115,0.847376
1101,1117,0.971166
1101,1122,0.812382
1101,1132,0.692894
...,...,...
1478,1171,0.995795
1478,1192,0.903658
1478,1254,0.904757
1478,1384,0.999050


Unnamed: 0_level_0,Unnamed: 1_level_0,PyWin
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,0.999755
3101,3106,0.892309
3101,3114,0.749480
3101,3117,0.902943
3101,3146,0.843471
...,...,...
3478,3262,0.610342
3478,3357,0.723060
3478,3447,0.954434
3478,3467,0.918014


In [10]:
def build_teams(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams.drop('TeamName', axis=1)
    teams = teams.set_index('TeamID')
    
    return teams

In [11]:
teams_m = build_teams('M')
teams_w = build_teams('W') # FIXME: Maybe useless since there is no data aside from TeamName.

display(teams_m)
display(teams_w)

Unnamed: 0_level_0,FirstD1Season,LastD1Season
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1
1101,2014,2024
1102,1985,2024
1103,1985,2024
1104,1985,2024
1105,2000,2024
...,...,...
1474,2023,2024
1475,2023,2024
1476,2023,2024
1477,2023,2024


3101
3102
3103
3104
3105
...
3474
3475
3476
3477
3478


In [12]:
def calculate_elo(teams, data, initial_rating=2000, k=140, alpha=None):
    '''
    Calculate Elo ratings for each team based on match data.

    Parameters:
    - teams (array-like): Containing Team-IDs.
    - data (pd.DataFrame): DataFrame with all matches in chronological order.
    - initial_rating (float): Initial rating of an unranked team (default: 2000).
    - k (float): K-factor, determining the impact of each match on team ratings (default: 140).
    - alpha (float or None): Tuning parameter for the multiplier for the margin of victory. No multiplier if None.

    Returns: 
    - list: Historical ratings of the winning team (WTeam).
    - list: Historical ratings of the losing team (LTeam).
    '''
    
    # Dictionary to keep track of current ratings for each team
    team_dict = {}
    for team in teams:
        team_dict[team] = initial_rating
        
    # Lists to store ratings for each team in each game
    r1, r2 = [], []
    margin_of_victory = 1

    # Iterate through the game data
    for wteam, lteam, ws, ls  in tqdm(zip(data.WTeamID, data.LTeamID, data.WScore, data.LScore), total=len(data)):
        # Append current ratings for teams to lists
        r1.append(team_dict[wteam])
        r2.append(team_dict[lteam])

        # Calculate expected outcomes based on Elo ratings
        rateW = 1 / (1 + 10 ** ((team_dict[lteam] - team_dict[wteam]) / initial_rating))
        rateL = 1 / (1 + 10 ** ((team_dict[wteam] - team_dict[lteam]) / initial_rating))
        
        if alpha:
            margin_of_victory = (ws - ls)/alpha

        # Update ratings for winning and losing teams
        team_dict[wteam] += k * margin_of_victory * (1 - rateW)
        team_dict[lteam] += k * margin_of_victory * (0 - rateL)

        # Ensure that ratings do not go below 1
        if team_dict[lteam] < 1:
            team_dict[lteam] = 1
        
    return r1, r2

def create_elo_data(teams, data, initial_rating=2000, k=140, alpha=None):
    '''
    Create a DataFrame with summary statistics of Elo ratings for teams based on historical match data.

    Parameters:
    - teams (array-like): Containing Team-IDs.
    - data (pd.DataFrame): DataFrame with all matches in chronological order.
    - initial_rating (float): Initial rating of an unranked team (default: 2000).
    - k (float): K-factor, determining the impact of each match on team ratings (default: 140).

    Returns: 
    - DataFrame: Summary statistics of Elo ratings for teams throughout a season.
    '''
    
    r1, r2 = calculate_elo(teams, data, initial_rating, k, alpha)
    
    # Concatenate arrays vertically
    seasons = np.concatenate([data.Season, data.Season])
    days = np.concatenate([data.DayNum, data.DayNum])
    teams = np.concatenate([data.WTeamID, data.LTeamID])
    tourney = np.concatenate([data.tourney, data.tourney])
    ratings = np.concatenate([r1, r2])
    # Create a DataFrame
    rating_df = pd.DataFrame({
        'Season': seasons,
        'DayNum': days,
        'TeamID': teams,
        'Rating': ratings,
        'Tourney': tourney
    })

    # Sort DataFrame and remove tournament data
    rating_df.sort_values(['TeamID', 'Season', 'DayNum'], inplace=True)
    rating_df = rating_df[rating_df['Tourney'] == 0]
    grouped = rating_df.groupby(['TeamID', 'Season'])
    results = grouped['Rating'].agg(['mean', 'median', 'std', 'min', 'max', 'last'])
    results.columns = ['Rating_Mean', 'Rating_Median', 'Rating_Std', 'Rating_Min', 'Rating_Max', 'Rating_Last']
    results['Rating_Trend'] = grouped.apply(lambda x: linregress(range(len(x)), x['Rating']).slope, include_groups=False)
    results.reset_index(inplace=True)
    
    return results

In [13]:
def build_elo(gender, results, teams):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))

    tourneys = results.copy()
    tourneys['tourney'] = 0
    tourneys.loc[len(csvs[0]):, 'tourney'] = 1
    tourneys = tourneys.sort_values(['Season', 'DayNum'])
    
    return create_elo_data(teams.reset_index().TeamID, tourneys).drop('Season', axis=1).groupby('TeamID').mean()

In [14]:
elo_m = build_elo('M', results_m, teams_m)
elo_w = build_elo('W', results_w, teams_w)

display(elo_m)
display(elo_w)

100%|██████████| 189003/189003 [00:00<00:00, 307641.93it/s]
  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
100%|██████████| 132478/132478 [00:00<00:00, 308266.60it/s]
  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


Unnamed: 0_level_0,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1101,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628
1102,2678.124707,2686.604784,214.613224,2561.394128,2763.590190,2561.394128,-43.897197
1103,2750.753871,2747.839316,139.884844,2711.145350,2786.167517,2745.404935,-19.813962
1104,3752.543112,3749.593246,59.858238,3704.294710,3804.856073,3804.856073,52.887944
1105,1108.014030,1108.014030,,1108.014030,1108.014030,1108.014030,
...,...,...,...,...,...,...,...
1460,2521.630842,2521.630842,36.888624,2515.109793,2528.151891,2528.151891,52.168392
1461,2996.400014,2993.142619,74.402542,2956.520610,3036.892349,3035.401649,64.841331
1462,3673.402601,3672.555650,55.840404,3635.467714,3712.211378,3709.733564,55.285432
1463,2585.593927,2586.981613,73.491924,2542.662895,2630.514606,2588.214276,59.662786


Unnamed: 0_level_0,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3101,2082.192135,2082.192135,,2082.192135,2082.192135,2082.192135,
3102,1984.508622,1988.790357,50.106277,1922.191344,2067.621037,1925.042554,-5.185493
3103,2398.798202,2411.812358,76.239688,2320.285649,2447.995900,2396.330833,-9.812588
3104,3343.228764,3303.620011,139.048957,3268.246483,3529.050165,3529.050165,61.916965
3106,1703.142594,1706.371227,54.592245,1687.099354,1725.899390,1687.099354,1.732950
...,...,...,...,...,...,...,...
3460,2693.141781,2685.545374,99.721223,2641.067568,2762.861693,2678.396161,45.100111
3461,2702.648620,2699.856369,82.784696,2658.883327,2745.437872,2678.542297,-26.319625
3462,3642.380183,3641.255421,57.088424,3611.791013,3676.888721,3669.153178,43.484173
3463,1964.640460,1997.180859,78.850412,1816.791878,2070.000000,1898.964063,-19.185057


In [15]:
def winner(ids):
    id, wId, lId = ids

    return int(id == wId)

def opponent(x):
    winInt, wId, lId = x
    win = not winInt
    
    return wId if win else lId

def score_diff(x):
    winInt, wScore, lScore = x
    win = not winInt
    
    return (wScore - lScore) if win else (lScore - wScore)

def build_season_results(df):
    season_results = df
    season_results['TeamID'] = season_results[['WTeamID', 'LTeamID']].values.tolist()
    season_results = season_results.explode('TeamID')
    season_results['Win'] = season_results[['TeamID', 'WTeamID', 'LTeamID']].apply(winner, axis=1)
    season_results['Defeat'] = season_results['Win'].apply(lambda x: 1 - x)
    season_results['Games'] = season_results['Win'] + season_results['Defeat']
    season_results['ScoreDiff'] = season_results[['Win', 'WScore', 'LScore']].apply(score_diff, axis=1)
    season_results['OTeamID'] = season_results[['Win', 'WTeamID', 'LTeamID']].apply(opponent, axis=1)
    season_results['Home'] = season_results['WLoc'].apply(lambda x: int(x[0] == 'H'))
    season_results = season_results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT'], axis=1)
    season_results = season_results.groupby(by=['TeamID', 'OTeamID']).sum()
    season_results['WinRatio'] = season_results['Win'] / season_results['Games']
    season_results = season_results.drop(['Win', 'Defeat'], axis=1)

    return season_results

In [16]:
season_results_m = build_season_results(results_m)
season_results_w = build_season_results(results_w)

display(season_results_m)
display(season_results_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1101,1102,1,-4,0,1.0
1101,1115,1,-8,0,1.0
1101,1116,2,23,2,0.0
1101,1117,2,-7,2,0.5
1101,1122,1,-8,0,1.0
...,...,...,...,...,...
1478,1384,1,-37,1,1.0
1478,1437,1,26,1,0.0
1478,1447,1,23,1,0.0
1478,1467,2,20,1,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3101,3102,1,-36,1,1.0
3101,3106,1,-11,1,1.0
3101,3114,1,-5,0,1.0
3101,3116,2,26,1,0.0
3101,3117,1,-12,0,1.0
...,...,...,...,...,...
3478,3425,1,51,1,0.0
3478,3433,1,23,1,0.0
3478,3447,1,-11,1,1.0
3478,3467,2,-22,1,1.0


In [17]:
def build_rpi(results):
    win_pct = results.copy()[['WinRatio']]
    win_pct = win_pct.groupby('TeamID').mean()
    win_pct['WP'] = win_pct['WinRatio'] * 100
    win_pct = win_pct.drop('WinRatio', axis=1)
    
    rpi = results.copy().reset_index()
    rpi = pd.merge(rpi, win_pct, on=['TeamID'])
    rpi = pd.merge(rpi, win_pct, left_on=['OTeamID'], right_on=['TeamID'], suffixes=('_T', '_O'))
    
    wp_oo = rpi[['TeamID', 'WP_O']].groupby('TeamID').mean()
    wp_oo = wp_oo.rename(columns={'WP_O': 'WP_OO'})
    
    rpi = pd.merge(rpi, wp_oo, left_on=['OTeamID'], right_on=['TeamID'])

    rpi['RPI'] = (rpi['WP_T'] * 0.25) + (rpi['WP_O'] * 0.50) + (rpi['WP_OO'] * 0.25)
    
    return rpi[['TeamID', 'OTeamID', 'RPI']].set_index(['TeamID', 'OTeamID'])

In [18]:
rpi_m = build_rpi(season_results_m)
rpi_w = build_rpi(season_results_w)

display(rpi_m)
display(rpi_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,RPI
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,45.734030
1101,1115,30.591411
1101,1116,61.795097
1101,1117,45.884441
1101,1122,41.329277
...,...,...
1478,1384,32.692573
1478,1437,59.590598
1478,1447,38.022702
1478,1467,36.846500


Unnamed: 0_level_0,Unnamed: 1_level_0,RPI
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,36.701758
3101,3106,36.314599
3101,3114,52.287270
3101,3116,64.288236
3101,3117,48.623450
...,...,...
3478,3425,57.041476
3478,3433,50.098405
3478,3447,32.217247
3478,3467,38.531362


In [19]:
def clean_seeds(seed):
    res = seed[1:]

    if len(res) > 2:
        res = res[:-1]

    return int(res)

def build_seeds(gender):
    seeds = CSV["{}NCAATourneySeeds".format(gender)] 
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)
    seeds = seeds.drop('Season', axis=1)
    seeds = seeds.groupby(by='TeamID').mean()
    
    return seeds

In [20]:
seeds_m = build_seeds('M')
seeds_w = build_seeds('W')

display(seeds_m)
display(seeds_w)

Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
1101,14.500000
1102,12.000000
1103,13.600000
1104,5.894737
1105,16.000000
...,...
1459,12.200000
1460,15.000000
1461,10.800000
1462,8.074074


Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
3101,16.000000
3103,13.000000
3104,6.000000
3106,15.333333
3107,14.285714
...,...
3458,7.000000
3460,13.333333
3461,12.500000
3462,5.888889


In [21]:
def build_rankings(gender):
    rankings = CSV["{}MasseyOrdinals".format(gender)]
    rankings = rankings.drop(['SystemName', 'RankingDayNum'], axis=1)
    rankings = rankings.groupby(by='TeamID').mean()
    rankings = rankings.drop('Season', axis=1)

    return rankings

In [22]:
rankings_m = build_rankings('M')

rankings_m

Unnamed: 0_level_0,OrdinalRank
TeamID,Unnamed: 1_level_1
1101,231.982452
1102,183.893379
1103,111.715467
1104,55.569038
1105,319.997840
...,...
1474,227.867379
1475,288.027638
1476,328.846591
1477,294.518844


In [23]:
def build_history(season_results, seeds, teams, elo, rpi, py, rankings=None):
    history = season_results.join(teams, on='TeamID').join(seeds, on='TeamID').join(elo, on='TeamID').join(rpi, on=['TeamID', 'OTeamID']).join(py, on=['TeamID', 'OTeamID'])
    history = history.reset_index()
    history = pd.merge(history, rpi.reset_index().rename(columns={'TeamID': 'OTeamID', 'OTeamID': 'TeamID'}), on=['TeamID', 'OTeamID'], suffixes=('_T', '_O'))
    history = pd.merge(history, py.reset_index().rename(columns={'TeamID': 'OTeamID', 'OTeamID': 'TeamID'}), on=['TeamID', 'OTeamID'], suffixes=('_T', '_O'))
    history = pd.merge(history, seeds, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
    history['PyWinDiff'] = history['PyWin_T'] - history['PyWin_O']
    history['RPIDiff'] = history['RPI_T'] - history['RPI_O']
    history['SeedDiff'] = history['Seed_T'] - history['Seed_O']
    history = history.drop(['PyWin_T', 'PyWin_O', 'RPI_T', 'RPI_O', 'Seed_T', 'Seed_O'], axis=1)

    if rankings is not None:
        history = history.join(rankings, on='TeamID')
        history = pd.merge(history, rankings, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
        history['RankingsDiff'] = history['OrdinalRank_T'] - history['OrdinalRank_O']
        history = history.drop(['OrdinalRank_T', 'OrdinalRank_O'], axis=1)
    
    return history.set_index(['TeamID', 'OTeamID']).fillna(0)

In [24]:
history_m = build_history(season_results_m, seeds_m, teams_m, elo_m, rpi_m, py_m, rankings_m)
history_w = build_history(season_results_w, seeds_w, teams_w, elo_w, rpi_w, py_w)

display(history_m)
display(history_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff,RankingsDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1101,1116,2,23,2,0.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.000000,9.868280,8.239130,172.308232
1101,1117,2,-7,2,0.5,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.042283,1.778062,-0.500000,21.451895
1101,1129,1,44,1,0.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.000000,5.064541,2.250000,135.022894
1101,1149,2,-2,0,0.5,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.069522,-3.807412,-0.500000,-35.708129
1101,1161,1,8,1,0.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.000000,6.668335,4.666667,119.296315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1336,1,17,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.669366,0.000000,219.122509
1478,1360,1,9,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.703788,0.000000,134.127209
1478,1364,1,24,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.121269,0.000000,168.999021
1478,1437,1,26,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,14.971577,0.000000,290.879073


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3101,3116,2,26,1,0.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.000000,7.371887,8.375000
3101,3124,2,102,2,0.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.000000,10.647478,13.333333
3101,3146,15,-16,7,0.533333,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.002199,-3.264970,2.000000
3101,3194,1,4,0,0.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.000000,-3.549650,0.000000
3101,3195,2,55,1,0.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.000000,5.894032,4.444444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3357,2,14,0,0.500000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,-0.257831,1.149985,0.000000
3478,3384,1,3,1,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,-1.107988,0.000000
3478,3392,1,21,0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,3.097844,0.000000
3478,3425,1,51,1,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,9.916086,0.000000


In [25]:
def build_avg(history):
    agg = {}
    for col in history.columns:
        if col == 'Games' or col == 'Home':
            agg[col] = 'sum'
        else:
            agg[col] = 'mean'
    
    avg = history.groupby('TeamID').agg(agg)
    
    return avg

In [26]:
avg_m = build_avg(history_m)
avg_w = build_avg(history_w)

display(avg_m)
display(avg_w)

Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff,RankingsDiff
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1101,182,16.000000,114,0.144307,2014.0,2024.0,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.013082,3.975460,3.384439,81.486596
1102,912,55.653465,524,0.229784,1985.0,2024.0,2678.124707,2686.604784,214.613224,2561.394128,2763.590190,2561.394128,-43.897197,0.004713,2.608911,0.987400,35.315066
1103,865,-8.676768,533,0.330468,1985.0,2024.0,2750.753871,2747.839316,139.884844,2711.145350,2786.167517,2745.404935,-19.813962,0.010724,0.123504,2.804630,-28.473009
1104,1051,-18.180000,579,0.429062,1985.0,2024.0,3752.543112,3749.593246,59.858238,3704.294710,3804.856073,3804.856073,52.887944,0.036262,-2.320685,-2.718759,-43.720735
1105,593,44.505495,381,0.089783,2000.0,2024.0,1108.014030,1108.014030,0.000000,1108.014030,1108.014030,1108.014030,0.000000,-0.020029,9.764284,5.048498,165.540780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,38,13.380952,30,0.134921,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.001598,3.547069,0.000000,51.608630
1475,30,12.666667,22,0.194444,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.045329,7.993615,0.000000,136.077577
1476,38,19.190476,28,0.123016,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.004840,8.622056,0.000000,150.261638
1477,41,18.631579,23,0.148246,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.009193,5.179128,0.000000,111.646465


Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3101,163,1.166667,96,0.260901,2082.192135,2082.192135,0.000000,2082.192135,2082.192135,2082.192135,0.000000,0.027921,0.633818,4.383573
3102,598,117.087500,325,0.073226,1984.508622,1988.790357,50.106277,1922.191344,2067.621037,1925.042554,-5.185493,-0.023290,8.562612,0.000000
3103,637,55.714286,357,0.162935,2398.798202,2411.812358,76.239688,2320.285649,2447.995900,2396.330833,-9.812588,-0.002217,3.686001,2.584338
3104,607,43.319444,312,0.343062,3343.228764,3303.620011,139.048957,3268.246483,3529.050165,3529.050165,61.916965,-0.020828,-1.068784,-2.364225
3105,546,73.951613,342,0.171490,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.008178,5.579357,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,31,38.625000,19,0.020833,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.019527,9.696418,0.000000
3475,26,14.000000,14,0.227273,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.022439,2.682614,0.000000
3476,29,28.866667,15,0.088889,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.021125,8.933940,0.000000
3477,40,18.444444,26,0.198148,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.023146,6.092759,0.000000


In [27]:
def build_matchups(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams[['TeamID']]
    teams = pd.merge(teams, teams, how='cross')
    teams = teams.rename(columns={'TeamID_x': 'TeamID', 'TeamID_y': 'OTeamID'})
    teams = teams[teams['TeamID'] != teams['OTeamID']]
    teams = teams.set_index(['TeamID', 'OTeamID'])

    return teams

In [28]:
matchups_m = build_matchups('M')
matchups_w = build_matchups('W')

display(matchups_m)
display(matchups_w)

TeamID,OTeamID
1101,1102
1101,1103
1101,1104
1101,1105
1101,1106
...,...
1478,1473
1478,1474
1478,1475
1478,1476


TeamID,OTeamID
3101,3102
3101,3103
3101,3104
3101,3105
3101,3106
...,...
3478,3473
3478,3474
3478,3475
3478,3476


In [29]:
def build_df(history, matchups, avg):
    df = pd.merge(matchups, history, on=['TeamID', 'OTeamID'], how='left')
    df = df.fillna(avg).fillna(0)

    if 'FirstD1Season' in df.columns:
        df['FirstD1Season'] = df['FirstD1Season'].astype(int)
        df['LastD1Season'] = df['LastD1Season'].astype(int)
    
    return df

In [30]:
df_m = build_df(history_m, matchups_m, avg_m)
df_w = build_df(history_w, matchups_w, avg_w)

display(df_m)
display(df_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff,RankingsDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1101,1102,182.0,16.0,114.0,0.144307,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.013082,3.975460,3.384439,81.486596
1101,1103,182.0,16.0,114.0,0.144307,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.013082,3.975460,3.384439,81.486596
1101,1104,182.0,16.0,114.0,0.144307,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.013082,3.975460,3.384439,81.486596
1101,1105,182.0,16.0,114.0,0.144307,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.013082,3.975460,3.384439,81.486596
1101,1106,182.0,16.0,114.0,0.144307,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,0.013082,3.975460,3.384439,81.486596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,13.0,14.0,11.0,0.090909,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000823,6.574928,0.000000,140.023693
1478,1474,13.0,14.0,11.0,0.090909,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000823,6.574928,0.000000,140.023693
1478,1475,13.0,14.0,11.0,0.090909,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000823,6.574928,0.000000,140.023693
1478,1476,13.0,14.0,11.0,0.090909,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000823,6.574928,0.000000,140.023693


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3101,3102,163.0,1.166667,96.0,0.260901,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.027921,0.633818,4.383573
3101,3103,163.0,1.166667,96.0,0.260901,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.027921,0.633818,4.383573
3101,3104,163.0,1.166667,96.0,0.260901,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.027921,0.633818,4.383573
3101,3105,163.0,1.166667,96.0,0.260901,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.027921,0.633818,4.383573
3101,3106,163.0,1.166667,96.0,0.260901,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.027921,0.633818,4.383573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,13.0,22.250000,9.0,0.041667,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,-0.021486,4.644007,0.000000
3478,3474,13.0,22.250000,9.0,0.041667,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,-0.021486,4.644007,0.000000
3478,3475,13.0,22.250000,9.0,0.041667,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,-0.021486,4.644007,0.000000
3478,3476,13.0,22.250000,9.0,0.041667,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,-0.021486,4.644007,0.000000


## Feature analysis

In [31]:
corr_m = df_m.corr()
corr_m.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff,RankingsDiff
Games,1.0,-0.094032,0.993543,0.209849,-0.391695,0.213164,0.370731,0.370729,0.162138,0.367961,0.37202,0.369015,0.159539,0.084136,-0.25312,-0.085239,-0.203067
ScoreDiff,-0.094032,1.0,-0.057168,-0.496397,0.072978,-0.043669,-0.335933,-0.336271,0.100293,-0.344431,-0.325488,-0.340627,-0.222102,-0.402734,0.477989,0.336921,0.468786
Home,0.993543,-0.057168,1.0,0.165486,-0.379523,0.210726,0.31842,0.318377,0.176465,0.314966,0.320745,0.316227,0.131537,0.060861,-0.197938,-0.039822,-0.149834
WinRatio,0.209849,-0.496397,0.165486,1.0,-0.25171,0.132071,0.522213,0.522532,-0.043725,0.527091,0.514597,0.525287,0.274965,0.238109,-0.712751,-0.510147,-0.655299
FirstD1Season,-0.391695,0.072978,-0.379523,-0.25171,1.0,0.019527,-0.62942,-0.6286,-0.566708,-0.607875,-0.651119,-0.61723,-0.186541,-0.086885,0.288418,0.049853,0.31309
LastD1Season,0.213164,-0.043669,0.210726,0.132071,0.019527,1.0,0.199466,0.200371,-0.082981,0.216299,0.178025,0.211726,0.221572,0.063081,-0.182282,0.075249,0.058367
Rating_Mean,0.370731,-0.335933,0.31842,0.522213,-0.62942,0.199466,1.0,0.999958,0.178308,0.998491,0.997882,0.998178,0.483841,0.248152,-0.674101,-0.329119,-0.643667
Rating_Median,0.370729,-0.336271,0.318377,0.522532,-0.6286,0.200371,0.999958,1.0,0.176076,0.998577,0.997554,0.99813,0.484838,0.248218,-0.675052,-0.328338,-0.644108
Rating_Std,0.162138,0.100293,0.176465,-0.043725,-0.566708,-0.082981,0.178308,0.176076,1.0,0.139042,0.223419,0.15728,-0.118812,-0.037212,0.07489,0.150002,0.03018
Rating_Min,0.367961,-0.344431,0.314966,0.527091,-0.607875,0.216299,0.998491,0.998577,0.139042,1.0,0.992973,0.998174,0.500527,0.251874,-0.68202,-0.325583,-0.647999


In [32]:
corr_w = df_w.corr()
corr_w.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,PyWinDiff,RPIDiff,SeedDiff
Games,1.0,-0.051579,0.994083,0.222302,0.403085,0.402914,0.225957,0.39603,0.40953,0.400023,0.127527,0.057638,-0.176113,-0.051612
ScoreDiff,-0.051579,1.0,-0.026056,-0.519261,-0.372588,-0.37242,0.052005,-0.380998,-0.363658,-0.378485,-0.302226,-0.427475,0.53302,0.270127
Home,0.994083,-0.026056,1.0,0.190706,0.363474,0.363288,0.233325,0.356085,0.370498,0.360377,0.096985,0.042261,-0.130977,-0.016104
WinRatio,0.222302,-0.519261,0.190706,1.0,0.569907,0.569578,0.03137,0.572475,0.566255,0.574708,0.382928,0.244818,-0.674399,-0.416884
Rating_Mean,0.403085,-0.372588,0.363474,0.569907,1.0,0.999967,0.245801,0.999073,0.998081,0.997561,0.506668,0.258521,-0.61715,-0.236663
Rating_Median,0.402914,-0.37242,0.363288,0.569578,0.999967,1.0,0.243418,0.999111,0.997775,0.99729,0.506094,0.258605,-0.616704,-0.23558
Rating_Std,0.225957,0.052005,0.233325,0.03137,0.245801,0.243418,1.0,0.219347,0.287163,0.253683,-0.044652,-0.041247,0.003096,0.196283
Rating_Min,0.39603,-0.380998,0.356085,0.572475,0.999073,0.999111,0.219347,1.0,0.994958,0.996941,0.517314,0.263372,-0.623369,-0.232185
Rating_Max,0.40953,-0.363658,0.370498,0.566255,0.998081,0.997775,0.287163,0.994958,1.0,0.997111,0.499393,0.252257,-0.610294,-0.242456
Rating_Last,0.400023,-0.378485,0.360377,0.574708,0.997561,0.99729,0.253683,0.996941,0.997111,1.0,0.539717,0.260599,-0.62705,-0.24472


In [33]:
corr_m = df_m.corr()['WinRatio'].sort_values(ascending=False)
high_corr_m = corr_m[[abs(corr_m) > 0.1 for corr_m in corr_m]]

corr_w = df_w.corr()['WinRatio'].sort_values(ascending=False)
high_corr_w = corr_w[[abs(corr_w) > 0.1 for corr_w in corr_w]]

display(high_corr_m)
display(high_corr_w)

WinRatio         1.000000
Rating_Min       0.527091
Rating_Last      0.525287
Rating_Median    0.522532
Rating_Mean      0.522213
Rating_Max       0.514597
Rating_Trend     0.274965
PyWinDiff        0.238109
Games            0.209849
Home             0.165486
LastD1Season     0.132071
FirstD1Season   -0.251710
ScoreDiff       -0.496397
SeedDiff        -0.510147
RankingsDiff    -0.655299
RPIDiff         -0.712751
Name: WinRatio, dtype: float64

WinRatio         1.000000
Rating_Last      0.574708
Rating_Min       0.572475
Rating_Mean      0.569907
Rating_Median    0.569578
Rating_Max       0.566255
Rating_Trend     0.382928
PyWinDiff        0.244818
Games            0.222302
Home             0.190706
SeedDiff        -0.416884
ScoreDiff       -0.519261
RPIDiff         -0.674399
Name: WinRatio, dtype: float64

# Training

In [34]:
def score_dataset(lgbm_params, X, y):
    reg   = lgb.LGBMRegressor(**lgbm_params)
    score = cross_val_score(reg, X, y)
    score = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': device(),
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, n_jobs=-1, show_progress_bar=True)

    return study.best_params

In [35]:
def build_x_y(df):
    target_column = 'WinRatio'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [36]:
X_m, y_m = build_x_y(df_m)
X_w, y_w = build_x_y(df_w)

In [37]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]

In [38]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    reg_test = lgb.LGBMRegressor(**params)
    reg_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(reg_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(reg_test.score(X_train, y_train)))

In [39]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3508
[LightGBM] [Info] Number of data points in the train set: 99754, number of used features: 16
[LightGBM] [Info] Start training from score 0.255470
LightGBM Model accuracy score: 0.9223
LightGBM Model accuracy score [train]: 0.9221
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3172
[LightGBM] [Info] Number of data points in the train set: 98700, number of used features: 13
[LightGBM] [Info] Start training from score 0.239142
LightGBM Model accuracy score: 0.9260
LightGBM Model accuracy score [train]: 0.9280


# Prediction

In [40]:
def build_wins(X, y, params):
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)

    wins = X
    wins['WinRatio'] = reg.predict(X)
    wins = wins[['WinRatio']]

    return wins

In [41]:
wins_m = build_wins(X_m, y_m, params_m)
wins_w = build_wins(X_w, y_w, params_w)

display(wins_m)
display(wins_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3564
[LightGBM] [Info] Number of data points in the train set: 142506, number of used features: 16
[LightGBM] [Info] Start training from score 0.255905
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3237
[LightGBM] [Info] Number of data points in the train set: 141000, number of used features: 13
[LightGBM] [Info] Start training from score 0.239162


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,0.193718
1101,1103,0.193718
1101,1104,0.193718
1101,1105,0.193718
1101,1106,0.193718
...,...,...
1478,1473,0.117010
1478,1474,0.117010
1478,1475,0.117010
1478,1476,0.117010


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,0.284642
3101,3103,0.284642
3101,3104,0.284642
3101,3105,0.284642
3101,3106,0.284642
...,...,...
3478,3473,0.095976
3478,3474,0.095976
3478,3475,0.095976
3478,3476,0.095976


In [42]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    year = np.max(slots['Season'])
    
    slots = slots[slots['Season'] == year]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [43]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [44]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [45]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [46]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, wins):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - wins (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]

        team_1_prob = wins.loc[team_1, team_2].WinRatio
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, wins, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, wins)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [47]:
n_brackets = 100000
result_m = run_simulation(seeds_2024_m, slots_m, wins_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, wins_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 100000/100000 [15:53<00:00, 104.87it/s]
100%|██████████| 100000/100000 [15:48<00:00, 105.48it/s]


In [48]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W15
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
12599995,W,100000,R4Y1,Y12
12599996,W,100000,R4Z1,Z08
12599997,W,100000,R5WX,X12
12599998,W,100000,R5YZ,Z08


In [49]:
submission.to_csv('submission.csv')

# Resources
- General guidance: https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2 by _toshimelonhead_.
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.
- ELO rating calculation based on: https://www.kaggle.com/code/lennarthaupts/calculate-elo-ratings by _Lennart Haupts_.
- GPU suport based on: https://www.kaggle.com/code/albertespin/programmatically-check-if-gpu-is-enabled-in-kaggle by _Albert Espín_.