<a href="https://www.kaggle.com/code/tcordeu/march-madness-2024?scriptVersionId=167347050" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
from itertools import chain
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.stats import linregress
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

op.logging.set_verbosity(op.logging.WARNING)

In [2]:
DATA_DIRS = ['/kaggle/input/march-machine-learning-mania-2024', '/kaggle/input/ncaa-men-538-team-ratings', '/kaggle/input/ncaa-women-538-team-ratings', '/kaggle/input/wncaa-basketball-espn-ratings']

In [3]:
CSV = {}

for path in list(chain(*map(lambda x: glob.glob(x + '/*.csv'), DATA_DIRS))):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')

In [4]:
CSV['538ratingsM'] = CSV.pop('538ratingsMen')
CSV['538ratingsW'] = CSV.pop('538ratingsWomen')
CSV['ESPNW'] = CSV.pop('wncaa_espn')

In [5]:
def device():
    from tensorflow.python.client import device_lib

    return 'gpu' if len(list(filter(lambda x: x.device_type == 'GPU', device_lib.list_local_devices()))) > 0 else 'cpu'

In [6]:
print("Device: {}".format(device()))

2024-03-16 18:36:48.951884: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-16 18:36:48.951993: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-16 18:36:49.087398: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: cpu


# Build DF

## Feature Engineering

In [7]:
def build_results(gender):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))
    
    return pd.concat(csvs)

In [8]:
results_m = build_results('M')
results_w = build_results('W')

display(results_m)
display(results_w)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
187168,2024,128,1437,58,1177,57,N,0
187169,2024,128,1448,72,1323,59,N,0
187170,2024,128,1455,88,1349,81,N,0
187171,2024,128,1462,76,1139,72,N,0


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0
...,...,...,...,...,...,...,...,...
131515,2024,128,3405,72,3444,61,N,0
131516,2024,128,3413,81,3168,56,N,0
131517,2024,128,3424,66,3361,49,H,0
131518,2024,128,3426,67,3381,53,N,0


In [9]:
def build_teams(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams.drop('TeamName', axis=1)
    teams = teams.set_index('TeamID')
    
    return teams

In [10]:
teams_m = build_teams('M')
teams_w = build_teams('W') # FIXME: Maybe useless since there is no data aside from TeamName.

display(teams_m)
display(teams_w)

Unnamed: 0_level_0,FirstD1Season,LastD1Season
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1
1101,2014,2024
1102,1985,2024
1103,1985,2024
1104,1985,2024
1105,2000,2024
...,...,...
1474,2023,2024
1475,2023,2024
1476,2023,2024
1477,2023,2024


3101
3102
3103
3104
3105
...
3474
3475
3476
3477
3478


In [11]:
def build_espn_votes(gender):
    votes = CSV["ESPN{}".format(gender)].copy()
    votes = votes[['Season', 'TeamID', 'Votes']]

    total_votes = votes[['Season', 'Votes']].groupby('Season').sum().rename(columns={'Votes': 'TotalVotes'}).reset_index()

    votes = pd.merge(votes, total_votes, on='Season')
    votes['ESPNVotePct'] = (votes['Votes'] / votes['TotalVotes']) * 100
    votes = votes[['TeamID', 'ESPNVotePct']].groupby('TeamID').mean()
    
    return votes

In [12]:
espn_votes_w = build_espn_votes('W')

display(espn_votes_w)

Unnamed: 0_level_0,ESPNVotePct
TeamID,Unnamed: 1_level_1
3107,0.067308
3112,2.809553
3113,2.067484
3114,0.052885
3116,2.788462
...,...
3443,0.038462
3449,4.567308
3450,0.365385
3452,1.089744


In [13]:
def calculate_elo(teams, data, initial_rating=2000, k=140, alpha=None):
    '''
    Calculate Elo ratings for each team based on match data.

    Parameters:
    - teams (array-like): Containing Team-IDs.
    - data (pd.DataFrame): DataFrame with all matches in chronological order.
    - initial_rating (float): Initial rating of an unranked team (default: 2000).
    - k (float): K-factor, determining the impact of each match on team ratings (default: 140).
    - alpha (float or None): Tuning parameter for the multiplier for the margin of victory. No multiplier if None.

    Returns: 
    - list: Historical ratings of the winning team (WTeam).
    - list: Historical ratings of the losing team (LTeam).
    '''
    
    # Dictionary to keep track of current ratings for each team
    team_dict = {}
    for team in teams:
        team_dict[team] = initial_rating
        
    # Lists to store ratings for each team in each game
    r1, r2 = [], []
    margin_of_victory = 1

    # Iterate through the game data
    for wteam, lteam, ws, ls  in tqdm(zip(data.WTeamID, data.LTeamID, data.WScore, data.LScore), total=len(data)):
        # Append current ratings for teams to lists
        r1.append(team_dict[wteam])
        r2.append(team_dict[lteam])

        # Calculate expected outcomes based on Elo ratings
        rateW = 1 / (1 + 10 ** ((team_dict[lteam] - team_dict[wteam]) / initial_rating))
        rateL = 1 / (1 + 10 ** ((team_dict[wteam] - team_dict[lteam]) / initial_rating))
        
        if alpha:
            margin_of_victory = (ws - ls)/alpha

        # Update ratings for winning and losing teams
        team_dict[wteam] += k * margin_of_victory * (1 - rateW)
        team_dict[lteam] += k * margin_of_victory * (0 - rateL)

        # Ensure that ratings do not go below 1
        if team_dict[lteam] < 1:
            team_dict[lteam] = 1
        
    return r1, r2

def create_elo_data(teams, data, initial_rating=2000, k=140, alpha=None):
    '''
    Create a DataFrame with summary statistics of Elo ratings for teams based on historical match data.

    Parameters:
    - teams (array-like): Containing Team-IDs.
    - data (pd.DataFrame): DataFrame with all matches in chronological order.
    - initial_rating (float): Initial rating of an unranked team (default: 2000).
    - k (float): K-factor, determining the impact of each match on team ratings (default: 140).

    Returns: 
    - DataFrame: Summary statistics of Elo ratings for teams throughout a season.
    '''
    
    r1, r2 = calculate_elo(teams, data, initial_rating, k, alpha)
    
    # Concatenate arrays vertically
    seasons = np.concatenate([data.Season, data.Season])
    days = np.concatenate([data.DayNum, data.DayNum])
    teams = np.concatenate([data.WTeamID, data.LTeamID])
    tourney = np.concatenate([data.tourney, data.tourney])
    ratings = np.concatenate([r1, r2])
    # Create a DataFrame
    rating_df = pd.DataFrame({
        'Season': seasons,
        'DayNum': days,
        'TeamID': teams,
        'Rating': ratings,
        'Tourney': tourney
    })

    # Sort DataFrame and remove tournament data
    rating_df.sort_values(['TeamID', 'Season', 'DayNum'], inplace=True)
    rating_df = rating_df[rating_df['Tourney'] == 0]
    grouped = rating_df.groupby(['TeamID', 'Season'])
    results = grouped['Rating'].agg(['mean', 'median', 'std', 'min', 'max', 'last'])
    results.columns = ['Rating_Mean', 'Rating_Median', 'Rating_Std', 'Rating_Min', 'Rating_Max', 'Rating_Last']
    results['Rating_Trend'] = grouped.apply(lambda x: linregress(range(len(x)), x['Rating']).slope, include_groups=False)
    results.reset_index(inplace=True)
    
    return results

In [14]:
def build_elo(gender, results, teams):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))

    tourneys = results.copy()
    tourneys['tourney'] = 0
    tourneys.loc[len(csvs[0]):, 'tourney'] = 1
    tourneys = tourneys.sort_values(['Season', 'DayNum'])
    
    return create_elo_data(teams.reset_index().TeamID, tourneys).drop('Season', axis=1).groupby('TeamID').mean()

In [15]:
elo_m = build_elo('M', results_m, teams_m)
elo_w = build_elo('W', results_w, teams_w)

display(elo_m)
display(elo_w)

100%|██████████| 189624/189624 [00:00<00:00, 540109.33it/s]
  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
100%|██████████| 133103/133103 [00:00<00:00, 536348.00it/s]
  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


Unnamed: 0_level_0,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1101,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628
1102,2678.124707,2686.604784,214.613224,2561.394128,2763.590190,2561.394128,-43.897197
1103,2750.753871,2747.839316,139.884844,2711.145350,2786.167517,2745.404935,-19.813962
1104,3752.543112,3749.593246,59.858238,3704.294710,3804.856073,3804.856073,52.887944
1105,1108.014030,1108.014030,,1108.014030,1108.014030,1108.014030,
...,...,...,...,...,...,...,...
1460,2521.630842,2521.630842,36.888624,2515.109793,2528.151891,2528.151891,52.168392
1461,2996.400014,2993.142619,74.402542,2956.520610,3036.892349,3035.401649,64.841331
1462,3673.402601,3672.555650,55.840404,3635.467714,3712.211378,3709.733564,55.285432
1463,2585.593927,2586.981613,73.491924,2542.662895,2630.514606,2588.214276,59.662786


Unnamed: 0_level_0,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3101,2082.192135,2082.192135,,2082.192135,2082.192135,2082.192135,
3102,1984.508622,1988.790357,50.106277,1922.191344,2067.621037,1925.042554,-5.185493
3103,2398.798202,2411.812358,76.239688,2320.285649,2447.995900,2396.330833,-9.812588
3104,3343.228764,3303.620011,139.048957,3268.246483,3529.050165,3529.050165,61.916965
3106,1703.142594,1706.371227,54.592245,1687.099354,1725.899390,1687.099354,1.732950
...,...,...,...,...,...,...,...
3460,2693.141781,2685.545374,99.721223,2641.067568,2762.861693,2678.396161,45.100111
3461,2702.648620,2699.856369,82.784696,2658.883327,2745.437872,2678.542297,-26.319625
3462,3642.380183,3641.255421,57.088424,3611.791013,3676.888721,3669.153178,43.484173
3463,1964.640460,1997.180859,78.850412,1816.791878,2070.000000,1898.964063,-19.185057


In [16]:
def winner(ids):
    id, wId, lId = ids

    return int(id == wId)

def opponent(x):
    winInt, wId, lId = x
    win = not winInt
    
    return wId if win else lId

def score_diff(x):
    winInt, wScore, lScore = x
    win = not winInt
    
    return (wScore - lScore) if win else (lScore - wScore)

def build_season_results(df):
    season_results = df
    season_results['TeamID'] = season_results[['WTeamID', 'LTeamID']].values.tolist()
    season_results = season_results.explode('TeamID')
    season_results['Win'] = season_results[['TeamID', 'WTeamID', 'LTeamID']].apply(winner, axis=1)
    season_results['Defeat'] = season_results['Win'].apply(lambda x: 1 - x)
    season_results['Games'] = season_results['Win'] + season_results['Defeat']
    season_results['ScoreDiff'] = season_results[['Win', 'WScore', 'LScore']].apply(score_diff, axis=1)
    season_results['OTeamID'] = season_results[['Win', 'WTeamID', 'LTeamID']].apply(opponent, axis=1)
    season_results['Home'] = season_results['WLoc'].apply(lambda x: int(x[0] == 'H'))
    season_results = season_results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT'], axis=1)
    season_results = season_results.groupby(by=['TeamID', 'OTeamID']).sum()
    season_results['WinRatio'] = season_results['Win'] / season_results['Games']
    season_results = season_results.drop(['Win', 'Defeat'], axis=1)

    return season_results

In [17]:
season_results_m = build_season_results(results_m)
season_results_w = build_season_results(results_w)

display(season_results_m)
display(season_results_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1101,1102,1,-4,0,1.0
1101,1115,1,-8,0,1.0
1101,1116,2,23,2,0.0
1101,1117,2,-7,2,0.5
1101,1122,1,-8,0,1.0
...,...,...,...,...,...
1478,1384,2,-53,1,1.0
1478,1437,1,26,1,0.0
1478,1447,2,9,2,0.5
1478,1467,3,30,2,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3101,3102,1,-36,1,1.0
3101,3106,1,-11,1,1.0
3101,3114,1,-5,0,1.0
3101,3116,2,26,1,0.0
3101,3117,1,-12,0,1.0
...,...,...,...,...,...
3478,3425,1,51,1,0.0
3478,3433,1,23,1,0.0
3478,3447,2,-20,1,1.0
3478,3467,2,-22,1,1.0


In [18]:
def build_rpi(results):
    win_pct = results.copy()[['WinRatio']]
    win_pct = win_pct.groupby('TeamID').mean()
    win_pct['WP'] = win_pct['WinRatio'] * 100
    win_pct = win_pct.drop('WinRatio', axis=1)
    
    rpi = results.copy().reset_index()
    rpi = pd.merge(rpi, win_pct, on=['TeamID'])
    rpi = pd.merge(rpi, win_pct, left_on=['OTeamID'], right_on=['TeamID'], suffixes=('_T', '_O'))
    
    wp_oo = rpi[['TeamID', 'WP_O']].groupby('TeamID').mean()
    wp_oo = wp_oo.rename(columns={'WP_O': 'WP_OO'})
    
    rpi = pd.merge(rpi, wp_oo, left_on=['OTeamID'], right_on=['TeamID'])

    rpi['RPI'] = (rpi['WP_T'] * 0.25) + (rpi['WP_O'] * 0.50) + (rpi['WP_OO'] * 0.25)
    
    return rpi[['TeamID', 'OTeamID', 'RPI']].set_index(['TeamID', 'OTeamID'])

In [19]:
rpi_m = build_rpi(season_results_m)
rpi_w = build_rpi(season_results_w)

display(rpi_m)
display(rpi_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,RPI
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,45.828795
1101,1115,30.683841
1101,1116,61.888066
1101,1117,45.986394
1101,1122,41.513382
...,...,...
1478,1384,33.572319
1478,1437,60.463877
1478,1447,38.732833
1478,1467,37.660099


Unnamed: 0_level_0,Unnamed: 1_level_0,RPI
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,36.733523
3101,3106,36.316747
3101,3114,52.313580
3101,3116,64.300687
3101,3117,48.628259
...,...,...
3478,3425,57.659592
3478,3433,50.693507
3478,3447,32.834127
3478,3467,39.032746


In [20]:
def clean_seeds(seed):
    res = seed[1:]

    if len(res) > 2:
        res = res[:-1]

    return int(res)

def build_seeds(gender):
    seeds = CSV["{}NCAATourneySeeds".format(gender)] 
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)
    seeds = seeds.drop('Season', axis=1)
    seeds = seeds.groupby(by='TeamID').mean()
    
    return seeds

In [21]:
seeds_m = build_seeds('M')
seeds_w = build_seeds('W')

display(seeds_m)
display(seeds_w)

Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
1101,14.500000
1102,12.000000
1103,13.600000
1104,5.894737
1105,16.000000
...,...
1459,12.200000
1460,15.000000
1461,10.800000
1462,8.074074


Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
3101,16.000000
3103,13.000000
3104,6.000000
3106,15.333333
3107,14.285714
...,...
3458,7.000000
3460,13.333333
3461,12.500000
3462,5.888889


In [22]:
def build_rankings(gender):
    rankings = CSV["{}MasseyOrdinals".format(gender)]
    rankings = rankings.drop(['SystemName', 'RankingDayNum'], axis=1)
    rankings = rankings.groupby(by='TeamID').mean()
    rankings = rankings.drop('Season', axis=1)

    return rankings

In [23]:
rankings_m = build_rankings('M')

rankings_m

Unnamed: 0_level_0,OrdinalRank
TeamID,Unnamed: 1_level_1
1101,231.697488
1102,184.409115
1103,111.700294
1104,55.244180
1105,320.080397
...,...
1474,229.931136
1475,290.941176
1476,330.658983
1477,296.072941


In [24]:
def build_rating(gender):
    ranks = CSV["538ratings{}".format(gender)].copy()
    ranks = ranks[['TeamID', '538rating']].groupby('TeamID').mean()
    
    return ranks

In [25]:
rating_m = build_rating('M')
rating_w = build_rating('W')

display(rating_m)
display(rating_w)

Unnamed: 0_level_0,538rating
TeamID,Unnamed: 1_level_1
1101,74.6850
1103,76.6900
1104,87.2125
1111,70.2800
1112,89.1620
...,...
1459,84.3800
1460,74.2800
1461,78.5200
1462,85.8275


Unnamed: 0_level_0,538rating
TeamID,Unnamed: 1_level_1
3101,65.110000
3104,83.565000
3106,59.870000
3107,73.460000
3110,70.405000
...,...
3450,83.646667
3452,86.200000
3453,85.216667
3460,75.225000


In [26]:
def build_history(season_results, seeds, teams, elo, rpi, rating, rankings=None, votes=None):
    history = season_results.join(teams, on='TeamID').join(seeds, on='TeamID').join(elo, on='TeamID').join(rpi, on=['TeamID', 'OTeamID']).join(rating, on=['TeamID'])
    history = history.reset_index()
    history = pd.merge(history, rpi.reset_index().rename(columns={'TeamID': 'OTeamID', 'OTeamID': 'TeamID'}), on=['TeamID', 'OTeamID'], suffixes=('_T', '_O'))
    history = pd.merge(history, seeds, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
    history = pd.merge(history, rating, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
    history['RPIDiff'] = history['RPI_T'] - history['RPI_O']
    history['SeedDiff'] = history['Seed_T'] - history['Seed_O']
    history['538ratingDiff'] = history['538rating_T'] - history['538rating_O']
    history = history.drop(['538rating_T', '538rating_O', 'RPI_T', 'RPI_O', 'Seed_T', 'Seed_O'], axis=1)

    if rankings is not None:
        history = history.join(rankings, on='TeamID')
        history = pd.merge(history, rankings, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
        history['RankingsDiff'] = history['OrdinalRank_T'] - history['OrdinalRank_O']
        history = history.drop(['OrdinalRank_T', 'OrdinalRank_O'], axis=1)
        
    if votes is not None:
        history = history.join(votes, on='TeamID')
        history = pd.merge(history, votes, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
        history['ESPNVotesDiff'] = history['ESPNVotePct_T'] - history['ESPNVotePct_O']
        history = history.drop(['ESPNVotePct_T', 'ESPNVotePct_O'], axis=1)
    
    return history.set_index(['TeamID', 'OTeamID']).fillna(0)

In [27]:
history_m = build_history(season_results_m, seeds_m, teams_m, elo_m, rpi_m, rating_m, rankings=rankings_m)
history_w = build_history(season_results_w, seeds_w, teams_w, elo_w, rpi_w, rating_w, votes=espn_votes_w)

display(history_m)
display(history_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,RankingsDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1101,1116,2,23,2,0.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,9.768574,8.239130,-10.237,171.700251
1101,1122,1,-8,0,1.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,-0.583711,0.100000,5.855,11.552984
1101,1129,1,44,1,0.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,4.972194,2.250000,-7.855,135.170200
1101,1161,1,8,1,0.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,6.574040,4.666667,-6.965,119.527677
1101,1167,2,-16,1,1.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,-2.845438,-0.500000,-0.285,-4.144314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1207,1,37,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,12.613225,0.000000,0.000,244.996728
1478,1254,2,-1,2,0.5,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.639693,0.000000,0.000,63.105397
1478,1336,1,17,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.818909,0.000000,0.000,217.467475
1478,1364,1,24,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.260115,0.000000,0.000,166.886715


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,ESPNVotesDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3101,3114,1,-5,0,1.0,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.741691,3.666667,-8.810000,0.0
3101,3116,2,26,1,0.0,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,7.343209,8.375000,-21.395000,0.0
3101,3124,2,102,2,0.0,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,10.618084,13.333333,-35.054286,0.0
3101,3195,2,55,1,0.0,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,5.868209,4.444444,-15.340000,0.0
3101,3239,1,-4,0,1.0,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-3.453943,0.000000,-4.880000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3477,3450,1,61,1,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,7.797197,0.000000,0.000000,0.0
3478,3138,1,-5,0,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,4.097226,0.000000,0.000000,0.0
3478,3160,1,59,1,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,10.718671,0.000000,0.000000,0.0
3478,3161,1,20,1,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,7.544704,0.000000,0.000000,0.0


In [28]:
def build_avg(history):
    agg = {}
    for col in history.columns:
        if col == 'Games' or col == 'Home':
            agg[col] = 'sum'
        else:
            agg[col] = 'mean'
    
    avg = history.groupby('TeamID').agg(agg)
    
    return avg

In [29]:
avg_m = build_avg(history_m)
avg_w = build_avg(history_w)

display(avg_m)
display(avg_w)

Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,RankingsDiff
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1101,97,18.512195,63,0.217615,2014.0,2024.0,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,4.858284,4.383671,-5.473512,101.622932
1102,637,46.912500,361,0.376270,1985.0,2024.0,2678.124707,2686.604784,214.613224,2561.394128,2763.590190,2561.394128,-43.897197,3.128270,1.547475,0.000000,52.120318
1103,495,0.212766,301,0.478470,1985.0,2024.0,2750.753871,2747.839316,139.884844,2711.145350,2786.167517,2745.404935,-19.813962,0.099596,2.871796,-2.588560,-23.222170
1104,1031,-20.403101,585,0.637980,1985.0,2024.0,3752.543112,3749.593246,59.858238,3704.294710,3804.856073,3804.856073,52.887944,-3.397025,-3.434336,5.916939,-56.313887
1105,264,50.218182,187,0.098507,2000.0,2024.0,1108.014030,1108.014030,0.000000,1108.014030,1108.014030,1108.014030,0.000000,11.428478,6.280480,0.000000,201.580764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,31,7.352941,21,0.374510,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.100534,0.000000,0.000000,45.542172
1475,19,12.700000,13,0.100000,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.683721,0.000000,0.000000,160.532626
1476,19,22.583333,13,0.145833,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.094094,0.000000,0.000000,176.718037
1477,21,19.785714,13,0.196429,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.488094,0.000000,0.000000,154.344163


Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,ESPNVotesDiff
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3101,37,44.250000,22,0.179825,2082.192135,2082.192135,0.000000,2082.192135,2082.192135,2082.192135,0.000000,4.775623,6.929696,-17.609982,0.0
3102,227,172.760000,124,0.020426,1984.508622,1988.790357,50.106277,1922.191344,2067.621037,1925.042554,-5.185493,12.603872,0.000000,0.000000,0.0
3103,166,41.515152,89,0.178272,2398.798202,2411.812358,76.239688,2320.285649,2447.995900,2396.330833,-9.812588,6.636945,4.936887,0.000000,0.0
3104,457,49.016667,226,0.512994,3343.228764,3303.620011,139.048957,3268.246483,3529.050165,3529.050165,61.916965,-0.083045,-1.984626,-1.011743,0.0
3105,46,54.913043,38,0.203557,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.091433,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,13,42.875000,9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,13.927912,0.000000,0.000000,0.0
3475,5,-1.000000,3,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.636454,0.000000,0.000000,0.0
3476,6,36.750000,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,12.391015,0.000000,0.000000,0.0
3477,5,34.400000,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11.103849,0.000000,0.000000,0.0


In [30]:
def build_matchups(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams[['TeamID']]
    teams = pd.merge(teams, teams, how='cross')
    teams = teams.rename(columns={'TeamID_x': 'TeamID', 'TeamID_y': 'OTeamID'})
    teams = teams[teams['TeamID'] != teams['OTeamID']]
    teams = teams.set_index(['TeamID', 'OTeamID'])

    return teams

In [31]:
matchups_m = build_matchups('M')
matchups_w = build_matchups('W')

display(matchups_m)
display(matchups_w)

TeamID,OTeamID
1101,1102
1101,1103
1101,1104
1101,1105
1101,1106
...,...
1478,1473
1478,1474
1478,1475
1478,1476


TeamID,OTeamID
3101,3102
3101,3103
3101,3104
3101,3105
3101,3106
...,...
3478,3473
3478,3474
3478,3475
3478,3476


In [32]:
def build_df(history, matchups, avg):
    df = pd.merge(matchups, history, on=['TeamID', 'OTeamID'], how='left')
    df = df.fillna(avg).fillna(0)

    if 'FirstD1Season' in df.columns:
        df['FirstD1Season'] = df['FirstD1Season'].astype(int)
        df['LastD1Season'] = df['LastD1Season'].astype(int)
    
    return df

In [33]:
df_m = build_df(history_m, matchups_m, avg_m)
df_w = build_df(history_w, matchups_w, avg_w)

display(df_m)
display(df_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,RankingsDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1101,1102,97.0,18.512195,63.0,0.217615,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,4.858284,4.383671,-5.473512,101.622932
1101,1103,97.0,18.512195,63.0,0.217615,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,4.858284,4.383671,-5.473512,101.622932
1101,1104,97.0,18.512195,63.0,0.217615,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,4.858284,4.383671,-5.473512,101.622932
1101,1105,97.0,18.512195,63.0,0.217615,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,4.858284,4.383671,-5.473512,101.622932
1101,1106,97.0,18.512195,63.0,0.217615,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,4.858284,4.383671,-5.473512,101.622932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,9.0,13.500000,7.0,0.194444,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.329756,0.000000,0.000000,169.674444
1478,1474,9.0,13.500000,7.0,0.194444,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.329756,0.000000,0.000000,169.674444
1478,1475,9.0,13.500000,7.0,0.194444,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.329756,0.000000,0.000000,169.674444
1478,1476,9.0,13.500000,7.0,0.194444,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.329756,0.000000,0.000000,169.674444


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,ESPNVotesDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3101,3102,37.0,44.25,22.0,0.179825,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,4.775623,6.929696,-17.609982,0.0
3101,3103,37.0,44.25,22.0,0.179825,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,4.775623,6.929696,-17.609982,0.0
3101,3104,37.0,44.25,22.0,0.179825,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,4.775623,6.929696,-17.609982,0.0
3101,3105,37.0,44.25,22.0,0.179825,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,4.775623,6.929696,-17.609982,0.0
3101,3106,37.0,44.25,22.0,0.179825,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,4.775623,6.929696,-17.609982,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,4.0,21.25,2.0,0.250000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,6.848341,0.000000,0.000000,0.0
3478,3474,4.0,21.25,2.0,0.250000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,6.848341,0.000000,0.000000,0.0
3478,3475,4.0,21.25,2.0,0.250000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,6.848341,0.000000,0.000000,0.0
3478,3476,4.0,21.25,2.0,0.250000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,6.848341,0.000000,0.000000,0.0


## Feature analysis

In [34]:
corr_m = df_m.corr()
corr_m.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,RankingsDiff
Games,1.0,-0.152044,0.992511,0.315939,-0.361741,0.200677,0.449706,0.449692,0.087123,0.44881,0.4486,0.44842,0.182303,-0.379889,-0.243231,0.220321,-0.329291
ScoreDiff,-0.152044,1.0,-0.111687,-0.51599,0.078332,-0.038539,-0.367806,-0.368189,0.105963,-0.376143,-0.35732,-0.372382,-0.239284,0.522121,0.39821,-0.292925,0.514094
Home,0.992511,-0.111687,1.0,0.269622,-0.360084,0.204712,0.403921,0.403821,0.107104,0.402142,0.404096,0.401999,0.152677,-0.323311,-0.194537,0.18864,-0.271376
WinRatio,0.315939,-0.51599,0.269622,1.0,-0.264111,0.135855,0.591858,0.592342,-0.059113,0.597178,0.58305,0.594942,0.325266,-0.806832,-0.61962,0.465995,-0.75255
FirstD1Season,-0.361741,0.078332,-0.360084,-0.264111,1.0,0.019527,-0.62942,-0.6286,-0.566708,-0.607875,-0.651119,-0.61723,-0.186541,0.295364,0.063989,-0.149052,0.317905
LastD1Season,0.200677,-0.038539,0.204712,0.135855,0.019527,1.0,0.199466,0.200371,-0.082981,0.216299,0.178025,0.211726,0.221572,-0.188814,0.067381,-0.016876,0.059045
Rating_Mean,0.449706,-0.367806,0.403921,0.591858,-0.62942,0.199466,1.0,0.999958,0.178308,0.998491,0.997882,0.998178,0.483841,-0.718172,-0.389077,0.326447,-0.689785
Rating_Median,0.449692,-0.368189,0.403821,0.592342,-0.6286,0.200371,0.999958,1.0,0.176076,0.998577,0.997554,0.99813,0.484838,-0.719096,-0.388271,0.325786,-0.690184
Rating_Std,0.087123,0.105963,0.107104,-0.059113,-0.566708,-0.082981,0.178308,0.176076,1.0,0.139042,0.223419,0.15728,-0.118812,0.096521,0.173936,-0.05337,0.057015
Rating_Min,0.44881,-0.376143,0.402142,0.597178,-0.607875,0.216299,0.998491,0.998577,0.139042,1.0,0.992973,0.998174,0.500527,-0.726581,-0.386408,0.32296,-0.694609


In [35]:
corr_w = df_w.corr()
corr_w.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,538ratingDiff,ESPNVotesDiff
Games,1.0,-0.274397,0.993897,0.628081,0.652517,0.652035,0.031292,0.652845,0.650887,0.656142,0.394111,-0.625543,-0.404127,0.272767,0.177279
ScoreDiff,-0.274397,1.0,-0.244013,-0.547654,-0.407459,-0.407336,0.086368,-0.415949,-0.397784,-0.412478,-0.324794,0.575749,0.287264,-0.228952,-0.285776
Home,0.993897,-0.244013,1.0,0.597008,0.633632,0.633132,0.044907,0.633344,0.632857,0.636977,0.371831,-0.587008,-0.376919,0.249797,0.163758
WinRatio,0.628081,-0.547654,0.597008,1.0,0.71924,0.718766,0.022463,0.723291,0.714496,0.7263,0.516221,-0.792305,-0.455304,0.313867,0.243213
Rating_Mean,0.652517,-0.407459,0.633632,0.71924,1.0,0.999967,0.245801,0.999073,0.998081,0.997561,0.506668,-0.687844,-0.168721,0.11469,0.120674
Rating_Median,0.652035,-0.407336,0.633132,0.718766,0.999967,1.0,0.243418,0.999111,0.997775,0.99729,0.506094,-0.687318,-0.167603,0.114269,0.119957
Rating_Std,0.031292,0.086368,0.044907,0.022463,0.245801,0.243418,1.0,0.219347,0.287163,0.253683,-0.044652,0.027218,0.2707,-0.143745,-0.078644
Rating_Min,0.652845,-0.415949,0.633344,0.723291,0.999073,0.999111,0.219347,1.0,0.994958,0.996941,0.517314,-0.694876,-0.16318,0.10833,0.118862
Rating_Max,0.650887,-0.397784,0.632857,0.714496,0.998081,0.997775,0.287163,0.994958,1.0,0.997111,0.499393,-0.680005,-0.174914,0.120724,0.123731
Rating_Last,0.656142,-0.412478,0.636977,0.7263,0.997561,0.99729,0.253683,0.996941,0.997111,1.0,0.539717,-0.698351,-0.175703,0.116206,0.121733


In [36]:
corr_m = df_m.corr()['WinRatio'].sort_values(ascending=False)
high_corr_m = corr_m[[abs(corr_m) > 0.1 for corr_m in corr_m]]

corr_w = df_w.corr()['WinRatio'].sort_values(ascending=False)
high_corr_w = corr_w[[abs(corr_w) > 0.1 for corr_w in corr_w]]

display(high_corr_m)
display(high_corr_w)

WinRatio         1.000000
Rating_Min       0.597178
Rating_Last      0.594942
Rating_Median    0.592342
Rating_Mean      0.591858
Rating_Max       0.583050
538ratingDiff    0.465995
Rating_Trend     0.325266
Games            0.315939
Home             0.269622
LastD1Season     0.135855
FirstD1Season   -0.264111
ScoreDiff       -0.515990
SeedDiff        -0.619620
RankingsDiff    -0.752550
RPIDiff         -0.806832
Name: WinRatio, dtype: float64

WinRatio         1.000000
Rating_Last      0.726300
Rating_Min       0.723291
Rating_Mean      0.719240
Rating_Median    0.718766
Rating_Max       0.714496
Games            0.628081
Home             0.597008
Rating_Trend     0.516221
538ratingDiff    0.313867
ESPNVotesDiff    0.243213
SeedDiff        -0.455304
ScoreDiff       -0.547654
RPIDiff         -0.792305
Name: WinRatio, dtype: float64

# Training

In [37]:
def score_dataset(lgbm_params, X, y):
    reg   = lgb.LGBMRegressor(**lgbm_params)
    score = cross_val_score(reg, X, y)
    score = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': device(),
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, n_jobs=-1, show_progress_bar=True)

    return study.best_params

In [38]:
def build_x_y(df):
    target_column = 'WinRatio'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [39]:
X_m, y_m = build_x_y(df_m)
X_w, y_w = build_x_y(df_w)

In [40]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [41]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    reg_test = lgb.LGBMRegressor(**params)
    reg_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(reg_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(reg_test.score(X_train, y_train)))

In [42]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018984 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3537
[LightGBM] [Info] Number of data points in the train set: 99754, number of used features: 16
[LightGBM] [Info] Start training from score 0.368845
LightGBM Model accuracy score: 0.9184
LightGBM Model accuracy score [train]: 0.9188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018794 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3441
[LightGBM] [Info] Number of data points in the train set: 98700, number of used features: 14
[LightGBM] [Info] Start training from score 0.249685
LightGBM Model accuracy score: 0.9274
LightGBM Model accuracy score [train]: 0.9291


# Prediction

In [43]:
def build_wins(X, y, params):
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)

    wins = X
    wins['WinRatio'] = reg.predict(X)
    wins = wins[['WinRatio']]

    return wins

In [44]:
wins_m = build_wins(X_m, y_m, params_m)
wins_w = build_wins(X_w, y_w, params_w)

display(wins_m)
display(wins_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3595
[LightGBM] [Info] Number of data points in the train set: 142506, number of used features: 16
[LightGBM] [Info] Start training from score 0.369612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024972 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3498
[LightGBM] [Info] Number of data points in the train set: 141000, number of used features: 14
[LightGBM] [Info] Start training from score 0.248981


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,0.292387
1101,1103,0.292387
1101,1104,0.292387
1101,1105,0.292387
1101,1106,0.292387
...,...,...
1478,1473,0.220048
1478,1474,0.220048
1478,1475,0.220048
1478,1476,0.220048


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,0.194663
3101,3103,0.194663
3101,3104,0.194663
3101,3105,0.194663
3101,3106,0.194663
...,...,...
3478,3473,0.236440
3478,3474,0.236440
3478,3475,0.236440
3478,3476,0.236440


In [45]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    year = np.max(slots['Season'])
    
    slots = slots[slots['Season'] == year]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [46]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15
2387,2023,R1W3,W03,W14
2388,2023,R1W4,W04,W13
2389,2023,R1W5,W05,W12
...,...,...,...,...
2443,2023,R4Y1,R3Y1,R3Y2
2444,2023,R4Z1,R3Z1,R3Z2
2445,2023,R5WX,R4W1,R4X1
2446,2023,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1579,2023,R1W1,W01,W16
1580,2023,R1W2,W02,W15
1581,2023,R1W3,W03,W14
1582,2023,R1W4,W04,W13
1583,2023,R1W5,W05,W12
...,...,...,...,...
1637,2023,R4Y1,R3Y1,R3Y2
1638,2023,R4Z1,R3Z1,R3Z2
1639,2023,R5WX,R4W1,R4X1
1640,2023,R5YZ,R4Y1,R4Z1


In [47]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [48]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
59,M,Z12,1433
60,M,Z13,1233
61,M,Z14,1213
62,M,Z15,1421


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3268
66,W,W03,3323
67,W,W04,3417
68,W,W05,3328
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


In [49]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, wins):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - wins (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]

        team_1_prob = wins.loc[team_1, team_2].WinRatio
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, wins, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, wins)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [50]:
n_brackets = 100000
result_m = run_simulation(seeds_2024_m, slots_m, wins_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, wins_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 100000/100000 [12:48<00:00, 130.14it/s]
100%|██████████| 100000/100000 [12:53<00:00, 129.31it/s]


In [51]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
12599995,W,100000,R4Y1,Y06
12599996,W,100000,R4Z1,Z02
12599997,W,100000,R5WX,W02
12599998,W,100000,R5YZ,Z02


In [52]:
submission.to_csv('submission.csv')

# Resources
- General guidance: https://www.kaggle.com/code/toshimelonhead/ncaa-march-madness-sabermetric-spin-v2 by _toshimelonhead_.
- Simulation based on: https://www.kaggle.com/code/lennarthaupts/simulate-n-brackets by _Lennart Haupts_.
- ELO rating calculation based on: https://www.kaggle.com/code/lennarthaupts/calculate-elo-ratings by _Lennart Haupts_.
- GPU support based on: https://www.kaggle.com/code/albertespin/programmatically-check-if-gpu-is-enabled-in-kaggle by _Albert Espín_.
- 538 Ratings: https://www.kaggle.com/datasets/raddar/ncaa-men-538-team-ratings and https://www.kaggle.com/datasets/raddar/ncaa-women-538-team-ratings by _Raddar_.
- ESPN WNCAA votes: https://www.kaggle.com/datasets/raddar/wncaa-basketball-espn-ratings by _Raddar_.