In [1]:
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.stats import linregress
from tqdm import tqdm

import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd

op.logging.set_verbosity(op.logging.WARNING)



In [2]:
DATA_DIR = '/kaggle/input/march-machine-learning-mania-2024'

In [3]:
CSV = {}

for path in glob.glob(DATA_DIR + "/*.csv"):
    CSV[os.path.basename(path).split('.')[0]] = pd.read_csv(path, encoding='cp1252')


In [4]:
def device():
    from tensorflow.python.client import device_lib

    return 'gpu' if len(list(filter(lambda x: x.device_type == 'GPU', device_lib.list_local_devices()))) > 0 else 'cpu'

In [5]:
print("Device: {}".format(device()))

2024-03-20 15:30:06.594871: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-20 15:30:06.594982: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-20 15:30:06.739795: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: gpu


In [6]:
def build_results(gender):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))
    
    return pd.concat(csvs)

In [7]:
results_m = build_results('M')
results_w = build_results('W')

display(results_m)
display(results_w)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
187284,2024,132,1120,86,1196,67,N,0
187285,2024,132,1182,57,1433,51,N,0
187286,2024,132,1228,93,1458,87,N,0
187287,2024,132,1412,85,1396,69,N,0


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1998,137,3104,94,3422,46,H,0
1,1998,137,3112,75,3365,63,H,0
2,1998,137,3163,93,3193,52,H,0
3,1998,137,3198,59,3266,45,H,0
4,1998,137,3203,74,3208,72,A,0
...,...,...,...,...,...,...,...,...
131582,2024,131,3465,75,3372,74,N,0
131583,2024,132,3179,76,3283,75,N,0
131584,2024,132,3180,68,3392,60,N,0
131585,2024,132,3221,61,3131,55,H,0


In [8]:
def build_teams(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams.drop('TeamName', axis=1)
    teams = teams.set_index('TeamID')
    
    return teams

In [9]:
teams_m = build_teams('M')
teams_w = build_teams('W') # FIXME: Maybe useless since there is no data aside from TeamName.

display(teams_m)
display(teams_w)

Unnamed: 0_level_0,FirstD1Season,LastD1Season
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1
1101,2014,2024
1102,1985,2024
1103,1985,2024
1104,1985,2024
1105,2000,2024
...,...,...
1474,2023,2024
1475,2023,2024
1476,2023,2024
1477,2023,2024


3101
3102
3103
3104
3105
...
3474
3475
3476
3477
3478


In [10]:
def calculate_elo(teams, data, initial_rating=2000, k=140, alpha=None):
    '''
    Calculate Elo ratings for each team based on match data.

    Parameters:
    - teams (array-like): チームの ID リスト
    - data (pd.DataFrame): 試合データの DataFrame (列には WTeamID, LTeamID, WScore, LScore などがある)
    - initial_rating (float): 初期レーティング (ランクがない場合の初期値、default: 2000).
    - k (float): K ファクター (1 試合あたりのレーティング変動への影響度、default: 140).
    - alpha (float or None):  勝敗点差の重み付け係数　(Noneの場合は重み付けなし)

    Returns: 
    - list: Historical ratings of the winning team (WTeam).
    - list: Historical ratings of the losing team (LTeam).
    '''
    
    # Dictionary to keep track of current ratings for each team
    team_dict = {}
    for team in teams:
        team_dict[team] = initial_rating
        
    # Lists to store ratings for each team in each game
    r1, r2 = [], []
    margin_of_victory = 1

    # Iterate through the game data
    for wteam, lteam, ws, ls  in tqdm(zip(data.WTeamID, data.LTeamID, data.WScore, data.LScore), total=len(data)):
        # Append current ratings for teams to lists
        r1.append(team_dict[wteam])
        r2.append(team_dict[lteam])

        # Calculate expected outcomes based on Elo ratings
        rateW = 1 / (1 + 10 ** ((team_dict[lteam] - team_dict[wteam]) / initial_rating))
        rateL = 1 / (1 + 10 ** ((team_dict[wteam] - team_dict[lteam]) / initial_rating))
        
        if alpha:
            margin_of_victory = (ws - ls)/alpha

        # Update ratings for winning and losing teams
        team_dict[wteam] += k * margin_of_victory * (1 - rateW)
        team_dict[lteam] += k * margin_of_victory * (0 - rateL)

        # Ensure that ratings do not go below 1
        if team_dict[lteam] < 1:
            team_dict[lteam] = 1
        
    return r1, r2

def create_elo_data(teams, data, initial_rating=2000, k=140, alpha=None):
    '''
    Create a DataFrame with summary statistics of Elo ratings for teams based on historical match data.

    Parameters:
    - teams (array-like): チームの ID リスト
    - data (pd.DataFrame): 試合データの DataFrame (列には WTeamID, LTeamID, WScore, LScore などがある)
    - initial_rating (float): 初期レーティング (ランクがない場合の初期値)
    - k (float): K ファクター (1 試合あたりのレーティング変動への影響度、default: 140).

    Returns: 
    - DataFrame: Summary statistics of Elo ratings for teams throughout a season.
    '''
    
    r1, r2 = calculate_elo(teams, data, initial_rating, k, alpha)
    
    # Concatenate arrays vertically
    seasons = np.concatenate([data.Season, data.Season])
    days = np.concatenate([data.DayNum, data.DayNum])
    teams = np.concatenate([data.WTeamID, data.LTeamID])
    tourney = np.concatenate([data.tourney, data.tourney])
    ratings = np.concatenate([r1, r2])
    # Create a DataFrame
    rating_df = pd.DataFrame({
        'Season': seasons,
        'DayNum': days,
        'TeamID': teams,
        'Rating': ratings,
        'Tourney': tourney
    })

    # Sort DataFrame and remove tournament data
    rating_df.sort_values(['TeamID', 'Season', 'DayNum'], inplace=True)
    rating_df = rating_df[rating_df['Tourney'] == 0]
    grouped = rating_df.groupby(['TeamID', 'Season'])
    results = grouped['Rating'].agg(['mean', 'median', 'std', 'min', 'max', 'last'])
    results.columns = ['Rating_Mean', 'Rating_Median', 'Rating_Std', 'Rating_Min', 'Rating_Max', 'Rating_Last']
    results['Rating_Trend'] = grouped.apply(lambda x: linregress(range(len(x)), x['Rating']).slope, include_groups=False)
    results.reset_index(inplace=True)
    
    return results

In [11]:
def build_elo(gender, results, teams):
    csv_names = ['NCAATourneyCompactResults', 'RegularSeasonCompactResults']
    csv_names = list(map(lambda x: gender + x, csv_names))
    csvs      = list(map(lambda x: CSV[x], csv_names))

    tourneys = results.copy()
    tourneys['tourney'] = 0
    tourneys.loc[len(csvs[0]):, 'tourney'] = 1
    tourneys = tourneys.sort_values(['Season', 'DayNum'])
    
    return create_elo_data(teams.reset_index().TeamID, tourneys).drop('Season', axis=1).groupby('TeamID').mean()

In [12]:
elo_m = build_elo('M', results_m, teams_m)
elo_w = build_elo('W', results_w, teams_w)

display(elo_m)
display(elo_w)

100%|██████████| 189740/189740 [00:00<00:00, 309975.57it/s]
  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
100%|██████████| 133170/133170 [00:00<00:00, 320428.07it/s]
  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


Unnamed: 0_level_0,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1101,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628
1102,2678.124707,2686.604784,214.613224,2561.394128,2763.590190,2561.394128,-43.897197
1103,2750.753871,2747.839316,139.884844,2711.145350,2786.167517,2745.404935,-19.813962
1104,3752.543112,3749.593246,59.858238,3704.294710,3804.856073,3804.856073,52.887944
1105,1108.014030,1108.014030,,1108.014030,1108.014030,1108.014030,
...,...,...,...,...,...,...,...
1460,2521.630842,2521.630842,36.888624,2515.109793,2528.151891,2528.151891,52.168392
1461,2996.400014,2993.142619,74.402542,2956.520610,3036.892349,3035.401649,64.841331
1462,3673.402601,3672.555650,55.840404,3635.467714,3712.211378,3709.733564,55.285432
1463,2585.593927,2586.981613,73.491924,2542.662895,2630.514606,2588.214276,59.662786


Unnamed: 0_level_0,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3101,2082.192135,2082.192135,,2082.192135,2082.192135,2082.192135,
3102,1984.508622,1988.790357,50.106277,1922.191344,2067.621037,1925.042554,-5.185493
3103,2398.798202,2411.812358,76.239688,2320.285649,2447.995900,2396.330833,-9.812588
3104,3343.228764,3303.620011,139.048957,3268.246483,3529.050165,3529.050165,61.916965
3106,1703.142594,1706.371227,54.592245,1687.099354,1725.899390,1687.099354,1.732950
...,...,...,...,...,...,...,...
3460,2693.141781,2685.545374,99.721223,2641.067568,2762.861693,2678.396161,45.100111
3461,2702.648620,2699.856369,82.784696,2658.883327,2745.437872,2678.542297,-26.319625
3462,3642.380183,3641.255421,57.088424,3611.791013,3676.888721,3669.153178,43.484173
3463,1964.640460,1997.180859,78.850412,1816.791878,2070.000000,1898.964063,-19.185057


In [13]:
def winner(ids):
    id, wId, lId = ids

    return int(id == wId)

def opponent(x):
    winInt, wId, lId = x
    win = not winInt
    
    return wId if win else lId

def score_diff(x):
    winInt, wScore, lScore = x
    win = not winInt
    
    return (wScore - lScore) if win else (lScore - wScore)


In [14]:
def build_season_results(df):
    season_results = df
    season_results['TeamID'] = season_results[['WTeamID', 'LTeamID']].values.tolist()
    season_results = season_results.explode('TeamID')
    season_results['Win'] = season_results[['TeamID', 'WTeamID', 'LTeamID']].apply(winner, axis=1)
    season_results['Defeat'] = season_results['Win'].apply(lambda x: 1 - x)
    season_results['Games'] = season_results['Win'] + season_results['Defeat']
    season_results['ScoreDiff'] = season_results[['Win', 'WScore', 'LScore']].apply(score_diff, axis=1)
    season_results['OTeamID'] = season_results[['Win', 'WTeamID', 'LTeamID']].apply(opponent, axis=1)
    season_results['Home'] = season_results['WLoc'].apply(lambda x: int(x[0] == 'H'))
    season_results = season_results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT'], axis=1)
    season_results = season_results.groupby(by=['TeamID', 'OTeamID']).sum()
    season_results['WinRatio'] = season_results['Win'] / season_results['Games']
    season_results = season_results.drop(['Win', 'Defeat'], axis=1)

    return season_results

In [15]:
season_results_m = build_season_results(results_m)
season_results_w = build_season_results(results_w)

display(season_results_m)
display(season_results_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1101,1102,1,-4,0,1.0
1101,1115,1,-8,0,1.0
1101,1116,2,23,2,0.0
1101,1117,2,-7,2,0.5
1101,1122,1,-8,0,1.0
...,...,...,...,...,...
1478,1384,2,-53,1,1.0
1478,1437,1,26,1,0.0
1478,1447,2,9,2,0.5
1478,1467,3,30,2,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3101,3102,1,-36,1,1.0
3101,3106,1,-11,1,1.0
3101,3114,1,-5,0,1.0
3101,3116,2,26,1,0.0
3101,3117,1,-12,0,1.0
...,...,...,...,...,...
3478,3425,1,51,1,0.0
3478,3433,1,23,1,0.0
3478,3447,2,-20,1,1.0
3478,3467,2,-22,1,1.0


In [16]:
def build_rpi(results):
    win_pct = results.copy()[['WinRatio']]
    win_pct = win_pct.groupby('TeamID').mean()
    win_pct['WP'] = win_pct['WinRatio'] * 100
    win_pct = win_pct.drop('WinRatio', axis=1)
    
    rpi = results.copy().reset_index()
    rpi = pd.merge(rpi, win_pct, on=['TeamID'])
    rpi = pd.merge(rpi, win_pct, left_on=['OTeamID'], right_on=['TeamID'], suffixes=('_T', '_O'))
    
    wp_oo = rpi[['TeamID', 'WP_O']].groupby('TeamID').mean()
    wp_oo = wp_oo.rename(columns={'WP_O': 'WP_OO'})
    
    rpi = pd.merge(rpi, wp_oo, left_on=['OTeamID'], right_on=['TeamID'])

    rpi['RPI'] = (rpi['WP_T'] * 0.25) + (rpi['WP_O'] * 0.50) + (rpi['WP_OO'] * 0.25)
    
    return rpi[['TeamID', 'OTeamID', 'RPI']].set_index(['TeamID', 'OTeamID'])


In [17]:
rpi_m = build_rpi(season_results_m)
rpi_w = build_rpi(season_results_w)

display(rpi_m)
display(rpi_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,RPI
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,45.828458
1101,1115,30.684414
1101,1116,61.884873
1101,1117,45.986946
1101,1122,41.512802
...,...,...
1478,1384,33.573010
1478,1437,60.459914
1478,1447,38.733471
1478,1467,37.660161


Unnamed: 0_level_0,Unnamed: 1_level_0,RPI
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,36.732579
3101,3106,36.316760
3101,3114,52.314382
3101,3116,64.301155
3101,3117,48.628775
...,...,...
3478,3425,57.460211
3478,3433,50.493780
3478,3447,32.634013
3478,3467,38.772264


In [18]:
def clean_seeds(seed):
    res = seed[1:]

    if len(res) > 2:
        res = res[:-1]

    return int(res)

def build_seeds(gender):
    seeds = CSV["{}NCAATourneySeeds".format(gender)] 
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)
    seeds = seeds.drop('Season', axis=1)
    seeds = seeds.groupby(by='TeamID').mean()
    
    return seeds

In [19]:
seeds_m = build_seeds('M')
seeds_w = build_seeds('W')

display(seeds_m)
display(seeds_w)

Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
1101,14.500000
1102,12.000000
1103,13.666667
1104,5.800000
1105,16.000000
...,...
1459,12.200000
1460,15.000000
1461,10.800000
1462,8.074074


Unnamed: 0_level_0,Seed
TeamID,Unnamed: 1_level_1
3101,16.000000
3103,13.000000
3104,6.400000
3106,15.333333
3107,14.285714
...,...
3460,13.333333
3461,12.500000
3462,5.888889
3464,13.500000


In [20]:
def build_rankings(gender):
    rankings = CSV["{}MasseyOrdinals_thruSeason2024_day128".format(gender)]
    rankings = rankings.drop(['SystemName', 'RankingDayNum'], axis=1)
    rankings = rankings.groupby(by='TeamID').mean()
    rankings = rankings.drop('Season', axis=1)

    return rankings

In [21]:
rankings_m = build_rankings('M')

rankings_m

Unnamed: 0_level_0,OrdinalRank
TeamID,Unnamed: 1_level_1
1101,231.697488
1102,184.409115
1103,111.700294
1104,55.244180
1105,320.080397
...,...
1474,229.931136
1475,290.941176
1476,330.658983
1477,296.072941


In [22]:
def build_history(season_results, seeds, teams, elo, rpi, rankings=None):
    history = season_results.join(teams, on='TeamID').join(seeds, on='TeamID').join(elo, on='TeamID').join(rpi, on=['TeamID', 'OTeamID'])
    history = history.reset_index()
    history = pd.merge(history, rpi.reset_index().rename(columns={'TeamID': 'OTeamID', 'OTeamID': 'TeamID'}), on=['TeamID', 'OTeamID'], suffixes=('_T', '_O'))
    history = pd.merge(history, seeds, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
    history['RPIDiff'] = history['RPI_T'] - history['RPI_O']
    history['SeedDiff'] = history['Seed_T'] - history['Seed_O']
    history = history.drop(['RPI_T', 'RPI_O', 'Seed_T', 'Seed_O'], axis=1)

    if rankings is not None:
        history = history.join(rankings, on='TeamID')
        history = pd.merge(history, rankings, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
        history['RankingsDiff'] = history['OrdinalRank_T'] - history['OrdinalRank_O']
        history = history.drop(['OrdinalRank_T', 'OrdinalRank_O'], axis=1)
    
    return history.set_index(['TeamID', 'OTeamID']).fillna(0)


In [23]:
history_m = build_history(season_results_m, seeds_m, teams_m, elo_m, rpi_m, rankings_m)
history_w = build_history(season_results_w, seeds_w, teams_w, elo_w, rpi_w)

display(history_m)
display(history_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,RankingsDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1101,1102,1,-4,0,1.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,1.288411,2.50000,47.288373
1101,1115,1,-8,0,1.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,-5.313000,-1.50000,-86.081311
1101,1116,2,23,2,0.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,9.768298,8.23913,171.700251
1101,1117,2,-7,2,0.5,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,1.686482,-0.50000,21.644468
1101,1122,1,-8,0,1.0,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,-0.583154,0.10000,11.552984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1360,1,9,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.850165,0.00000,132.113963
1478,1364,1,24,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.261240,0.00000,166.886715
1478,1384,2,-53,1,1.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005356,0.00000,43.080533
1478,1437,1,26,1,0.0,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,14.108753,0.00000,289.065047


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3101,3106,1,-11,1,1.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-6.781776,0.666667
3101,3114,1,-5,0,1.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,0.744372,3.666667
3101,3116,2,26,1,0.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,7.345556,8.375000
3101,3124,2,102,2,0.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,10.620274,13.227273
3101,3146,15,-16,7,0.533333,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-3.223206,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3357,3,35,1,0.333333,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.843656,0.000000
3478,3384,2,-29,2,0.500000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,-1.601386,0.000000
3478,3392,1,21,0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,2.805564,0.000000
3478,3425,1,51,1,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,9.538866,0.000000


In [24]:
def build_avg(history):
    agg = {}
    for col in history.columns:
        if col == 'Games' or col == 'Home':
            agg[col] = 'sum'
        else:
            agg[col] = 'mean'
    
    avg = history.groupby('TeamID').agg(agg)
    
    return avg

In [25]:
avg_m = build_avg(history_m)
avg_w = build_avg(history_w)

display(avg_m)
display(avg_w)

Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,RankingsDiff
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1101,206,10.260274,122,0.345951,2014.0,2024.0,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,2.920261,2.854736,64.912457
1102,986,32.237762,571,0.448868,1985.0,2024.0,2678.124707,2686.604784,214.613224,2561.394128,2763.590190,2561.394128,-43.897197,1.283941,0.227426,16.066182
1103,964,-15.184211,596,0.561764,1985.0,2024.0,2750.753871,2747.839316,139.884844,2711.145350,2786.167517,2745.404935,-19.813962,-1.640124,1.819546,-57.453900
1104,1253,-28.137755,728,0.708621,1985.0,2024.0,3752.543112,3749.593246,59.858238,3704.294710,3804.856073,3804.856073,52.887944,-5.166689,-4.865758,-90.521333
1105,651,40.947368,414,0.123476,2000.0,2024.0,1108.014030,1108.014030,0.000000,1108.014030,1108.014030,1108.014030,0.000000,9.596492,4.911349,163.456069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,51,7.928571,35,0.340476,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.769840,0.000000,39.919485
1475,33,12.187500,22,0.233333,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.042671,0.000000,133.143434
1476,40,18.772727,29,0.159091,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.643990,0.000000,149.336492
1477,49,15.695652,25,0.304348,2023.0,2024.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.037645,0.000000,115.562310


Unnamed: 0_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3101,206,-6.666667,117,0.487205,2082.192135,2082.192135,0.000000,2082.192135,2082.192135,2082.192135,0.000000,-0.304536,3.591174
3102,624,96.875000,341,0.196698,1984.508622,1988.790357,50.106277,1922.191344,2067.621037,1925.042554,-5.185493,7.636443,0.000000
3103,694,36.403846,384,0.375529,2398.798202,2411.812358,76.239688,2320.285649,2447.995900,2396.330833,-9.812588,2.320563,1.718256
3104,729,5.838028,377,0.660339,3343.228764,3303.620011,139.048957,3268.246483,3529.050165,3529.050165,61.916965,-3.806303,-4.201373
3105,563,60.760000,348,0.288430,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.189256,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,34,31.842105,20,0.122807,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.687261,0.000000
3475,35,2.428571,19,0.410714,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.839249,0.000000
3476,32,27.750000,17,0.140625,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.112778,0.000000
3477,51,19.450000,30,0.228333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.795127,0.000000


In [26]:
def build_matchups(gender):
    teams = CSV["{}Teams".format(gender)].copy()
    teams = teams[['TeamID']]
    teams = pd.merge(teams, teams, how='cross')
    teams = teams.rename(columns={'TeamID_x': 'TeamID', 'TeamID_y': 'OTeamID'})
    teams = teams[teams['TeamID'] != teams['OTeamID']]
    teams = teams.set_index(['TeamID', 'OTeamID'])

    return teams

In [27]:
matchups_m = build_matchups('M')
matchups_w = build_matchups('W')

display(matchups_m)
display(matchups_w)

TeamID,OTeamID
1101,1102
1101,1103
1101,1104
1101,1105
1101,1106
...,...
1478,1473
1478,1474
1478,1475
1478,1476


TeamID,OTeamID
3101,3102
3101,3103
3101,3104
3101,3105
3101,3106
...,...
3478,3473
3478,3474
3478,3475
3478,3476


In [28]:
def build_df(history, matchups, avg):
    df = pd.merge(matchups, history, on=['TeamID', 'OTeamID'], how='left')
    df = df.fillna(avg).fillna(0)

    if 'FirstD1Season' in df.columns:
        df['FirstD1Season'] = df['FirstD1Season'].astype(int)
        df['LastD1Season'] = df['LastD1Season'].astype(int)
    
    return df

In [29]:
df_m = build_df(history_m, matchups_m, avg_m)
df_w = build_df(history_w, matchups_w, avg_w)

display(df_m)
display(df_w)

Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,RankingsDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1101,1102,1.0,-4.000000,0.0,1.000000,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,1.288411,2.500000,47.288373
1101,1103,206.0,10.260274,122.0,0.345951,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,2.920261,2.854736,64.912457
1101,1104,206.0,10.260274,122.0,0.345951,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,2.920261,2.854736,64.912457
1101,1105,206.0,10.260274,122.0,0.345951,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,2.920261,2.854736,64.912457
1101,1106,206.0,10.260274,122.0,0.345951,2014,2024,2236.806245,2236.806245,90.261918,2204.893838,2268.718652,2268.718652,127.649628,2.920261,2.854736,64.912457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1473,20.0,3.500000,15.0,0.333333,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.565014,0.000000,119.210892
1478,1474,20.0,3.500000,15.0,0.333333,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.565014,0.000000,119.210892
1478,1475,20.0,3.500000,15.0,0.333333,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.565014,0.000000,119.210892
1478,1476,20.0,3.500000,15.0,0.333333,2024,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.565014,0.000000,119.210892


Unnamed: 0_level_0,Unnamed: 1_level_0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff
TeamID,OTeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3101,3102,206.0,-6.666667,117.0,0.487205,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-0.304536,3.591174
3101,3103,206.0,-6.666667,117.0,0.487205,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-0.304536,3.591174
3101,3104,206.0,-6.666667,117.0,0.487205,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-0.304536,3.591174
3101,3105,206.0,-6.666667,117.0,0.487205,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-0.304536,3.591174
3101,3106,1.0,-11.000000,1.0,1.000000,2082.192135,2082.192135,0.0,2082.192135,2082.192135,2082.192135,0.0,-6.781776,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3478,3473,18.0,15.714286,12.0,0.202381,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,3.849500,0.000000
3478,3474,18.0,15.714286,12.0,0.202381,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,3.849500,0.000000
3478,3475,18.0,15.714286,12.0,0.202381,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,3.849500,0.000000
3478,3476,18.0,15.714286,12.0,0.202381,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,3.849500,0.000000


In [30]:
corr_m = df_m.corr()
corr_m.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,FirstD1Season,LastD1Season,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff,RankingsDiff
Games,1.0,-0.036121,0.996622,0.088552,-0.283065,0.164016,0.218153,0.218112,0.144127,0.215271,0.220241,0.216341,0.087925,-0.11455,-0.030288,-0.087976
ScoreDiff,-0.036121,1.0,-0.019894,-0.489685,0.086247,-0.057871,-0.338398,-0.33892,0.087362,-0.34635,-0.328026,-0.342424,-0.233237,0.500118,0.402582,0.495142
Home,0.996622,-0.019894,1.0,0.069671,-0.277249,0.161445,0.193721,0.19364,0.149394,0.190418,0.196444,0.191586,0.071232,-0.089964,-0.01099,-0.063291
WinRatio,0.088552,-0.489685,0.069671,1.0,-0.216265,0.133849,0.524536,0.525139,-0.062001,0.530117,0.51558,0.527435,0.295852,-0.768027,-0.627851,-0.725583
FirstD1Season,-0.283065,0.086247,-0.277249,-0.216265,1.0,0.019527,-0.62942,-0.6286,-0.566708,-0.607875,-0.651119,-0.61723,-0.186541,0.278956,0.117401,0.297161
LastD1Season,0.164016,-0.057871,0.161445,0.133849,0.019527,1.0,0.199466,0.200371,-0.082981,0.216299,0.178025,0.211726,0.221572,-0.173126,0.019098,0.007238
Rating_Mean,0.218153,-0.338398,0.193721,0.524536,-0.62942,0.199466,1.0,0.999958,0.178308,0.998491,0.997882,0.998178,0.483841,-0.674846,-0.457319,-0.655921
Rating_Median,0.218112,-0.33892,0.19364,0.525139,-0.6286,0.200371,0.999958,1.0,0.176076,0.998577,0.997554,0.99813,0.484838,-0.675676,-0.456847,-0.656357
Rating_Std,0.144127,0.087362,0.149394,-0.062001,-0.566708,-0.082981,0.178308,0.176076,1.0,0.139042,0.223419,0.15728,-0.118812,0.087279,0.150868,0.056375
Rating_Min,0.215271,-0.34635,0.190418,0.530117,-0.607875,0.216299,0.998491,0.998577,0.139042,1.0,0.992973,0.998174,0.500527,-0.682531,-0.457471,-0.661187


In [31]:
corr_w = df_w.corr()
corr_w.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Games,ScoreDiff,Home,WinRatio,Rating_Mean,Rating_Median,Rating_Std,Rating_Min,Rating_Max,Rating_Last,Rating_Trend,RPIDiff,SeedDiff
Games,1.0,-0.048586,0.995923,0.162481,0.302945,0.302828,0.185533,0.297,0.308391,0.299777,0.075863,-0.127086,-0.064229
ScoreDiff,-0.048586,1.0,-0.032034,-0.517523,-0.385462,-0.385281,0.027806,-0.392744,-0.377735,-0.391705,-0.325142,0.558571,0.370113
Home,0.995923,-0.032034,1.0,0.144165,0.280853,0.280718,0.187917,0.27473,0.28667,0.277877,0.058318,-0.102901,-0.046382
WinRatio,0.162481,-0.517523,0.144165,1.0,0.599356,0.598962,0.056286,0.602045,0.595959,0.60497,0.427834,-0.755011,-0.571182
Rating_Mean,0.302945,-0.385462,0.280853,0.599356,1.0,0.999967,0.245801,0.999073,0.998081,0.997561,0.506668,-0.650055,-0.42465
Rating_Median,0.302828,-0.385281,0.280718,0.598962,0.999967,1.0,0.243418,0.999111,0.997775,0.99729,0.506094,-0.649584,-0.423741
Rating_Std,0.185533,0.027806,0.187917,0.056286,0.245801,0.243418,1.0,0.219347,0.287163,0.253683,-0.044652,0.00819,0.148865
Rating_Min,0.297,-0.392744,0.27473,0.602045,0.999073,0.999111,0.219347,1.0,0.994958,0.996941,0.517314,-0.656059,-0.423283
Rating_Max,0.308391,-0.377735,0.28667,0.595959,0.998081,0.997775,0.287163,0.994958,1.0,0.997111,0.499393,-0.643327,-0.426588
Rating_Last,0.299777,-0.391705,0.277877,0.60497,0.997561,0.99729,0.253683,0.996941,0.997111,1.0,0.539717,-0.659256,-0.432428


In [32]:
corr_m = df_m.corr()['WinRatio'].sort_values(ascending=False)
high_corr_m = corr_m[[abs(corr_m) > 0.1 for corr_m in corr_m]]

corr_w = df_w.corr()['WinRatio'].sort_values(ascending=False)
high_corr_w = corr_w[[abs(corr_w) > 0.1 for corr_w in corr_w]]

display(high_corr_m)
display(high_corr_w)

WinRatio         1.000000
Rating_Min       0.530117
Rating_Last      0.527435
Rating_Median    0.525139
Rating_Mean      0.524536
Rating_Max       0.515580
Rating_Trend     0.295852
LastD1Season     0.133849
FirstD1Season   -0.216265
ScoreDiff       -0.489685
SeedDiff        -0.627851
RankingsDiff    -0.725583
RPIDiff         -0.768027
Name: WinRatio, dtype: float64

WinRatio         1.000000
Rating_Last      0.604970
Rating_Min       0.602045
Rating_Mean      0.599356
Rating_Median    0.598962
Rating_Max       0.595959
Rating_Trend     0.427834
Games            0.162481
Home             0.144165
ScoreDiff       -0.517523
SeedDiff        -0.571182
RPIDiff         -0.755011
Name: WinRatio, dtype: float64

In [33]:
def score_dataset(lgbm_params, X, y):
    reg   = lgb.LGBMRegressor(**lgbm_params)
    score = cross_val_score(reg, X, y)
    score = -1 * score.mean() + score.std()

    return score

def objective(trial, X, y):
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'device_type': device(),
        'verbose': -1
    }

    return score_dataset(params, X, y)

def study(X, y):
    study = op.create_study()
    study.optimize(lambda trial: objective(trial, X, y), n_trials=100, n_jobs=-1, show_progress_bar=True)

    return study.best_params


In [34]:
def build_x_y(df):
    target_column = 'WinRatio'
    feature_columns = df.columns.tolist()
    feature_columns.remove(target_column)
    
    return df[feature_columns], df[target_column]

In [35]:
X_m, y_m = build_x_y(df_m)
X_w, y_w = build_x_y(df_w)

In [36]:
params_m = study(X_m, y_m)
params_w = study(X_w, y_w)

  0%|          | 0/100 [00:00<?, ?it/s]



  0%|          | 0/100 [00:00<?, ?it/s]

In [37]:
def accuracy(X, y, params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    reg_test = lgb.LGBMRegressor(**params)
    reg_test.fit(X_train, y_train)

    print('LightGBM Model accuracy score: {0:0.4f}'.format(reg_test.score(X_test, y_test)))
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(reg_test.score(X_train, y_train)))

In [38]:
accuracy(X_m, y_m, params_m)
accuracy(X_w, y_w, params_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3257
[LightGBM] [Info] Number of data points in the train set: 99754, number of used features: 15
[LightGBM] [Info] Start training from score 0.437588
LightGBM Model accuracy score: 0.9323
LightGBM Model accuracy score [train]: 0.9327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2956
[LightGBM] [Info] Number of data points in the train set: 98700, number of used features: 12
[LightGBM] [Info] Start training from score 0.412207
LightGBM Model accuracy score: 0.9351
LightGBM Model accuracy score [train]: 0.9351


In [39]:
def build_wins(X, y, params):
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)

    wins = X
    wins['WinRatio'] = reg.predict(X)
    wins = wins[['WinRatio']]

    return wins

In [40]:
wins_m = build_wins(X_m, y_m, params_m)
wins_w = build_wins(X_w, y_w, params_w)

display(wins_m)
display(wins_w)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3324
[LightGBM] [Info] Number of data points in the train set: 142506, number of used features: 15
[LightGBM] [Info] Start training from score 0.437847
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3021
[LightGBM] [Info] Number of data points in the train set: 141000, number of used features: 12
[LightGBM] [Info] Start training from score 0.412089


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
1101,1102,0.923675
1101,1103,0.376781
1101,1104,0.376781
1101,1105,0.376781
1101,1106,0.376781
...,...,...
1478,1473,0.333175
1478,1474,0.333175
1478,1475,0.333175
1478,1476,0.333175


Unnamed: 0_level_0,Unnamed: 1_level_0,WinRatio
TeamID,OTeamID,Unnamed: 2_level_1
3101,3102,0.484781
3101,3103,0.484781
3101,3104,0.484781
3101,3105,0.484781
3101,3106,0.903229
...,...,...
3478,3473,0.273281
3478,3474,0.273281
3478,3475,0.273281
3478,3476,0.273281


In [41]:
def build_slots(gender):
    slots = CSV["{}NCAATourneySlots".format(gender)]
    year = np.max(slots['Season'])
    
    slots = slots[slots['Season'] == year]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

In [42]:
slots_m = build_slots('M')
slots_w = build_slots('W')

display(slots_m)
display(slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2452,2024,R1W1,W01,W16
2453,2024,R1W2,W02,W15
2454,2024,R1W3,W03,W14
2455,2024,R1W4,W04,W13
2456,2024,R1W5,W05,W12
...,...,...,...,...
2510,2024,R4Y1,R3Y1,R3Y2
2511,2024,R4Z1,R3Z1,R3Z2
2512,2024,R5WX,R4W1,R4X1
2513,2024,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1646,2024,R1W1,W01,W16
1647,2024,R1W2,W02,W15
1648,2024,R1W3,W03,W14
1649,2024,R1W4,W04,W13
1650,2024,R1W5,W05,W12
...,...,...,...,...
1704,2024,R4Y1,R3Y1,R3Y2
1705,2024,R4Z1,R3Z1,R3Z2
1706,2024,R5WX,R4W1,R4X1
1707,2024,R5YZ,R4Y1,R4Z1


In [43]:
def build_seeds_2024():
    seeds_2024 = CSV['2024_tourney_seeds']

    return seeds_2024[seeds_2024['Tournament'] == 'M'], seeds_2024[seeds_2024['Tournament'] == 'W']

In [44]:
seeds_2024_m, seeds_2024_w = build_seeds_2024()

display(seeds_2024_m)
display(seeds_2024_w)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1163
1,M,W02,1235
2,M,W03,1228
3,M,W04,1120
4,M,W05,1361
...,...,...,...
59,M,Z12,1241
60,M,Z13,1436
61,M,Z14,1324
62,M,Z15,1443


Unnamed: 0,Tournament,Seed,TeamID
64,W,W01,3376
65,W,W02,3323
66,W,W03,3333
67,W,W04,3231
68,W,W05,3328
...,...,...,...
123,W,Z12,3162
124,W,Z13,3267
125,W,Z14,3238
126,W,Z15,3263


In [45]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, wins):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - wins (DF): DF that includes wins prediction per matchup.
    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]

        team_1_prob = wins.loc[team_1, team_2].WinRatio
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, wins, brackets):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - wins (DF): DF that includes wins prediction per matchup.
    - brackets (int): Number of brackets to simulate.
    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, wins)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [46]:
n_brackets = 100000
result_m = run_simulation(seeds_2024_m, slots_m, wins_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')
result_w = run_simulation(seeds_2024_w, slots_w, wins_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 100000/100000 [15:05<00:00, 110.49it/s]
100%|██████████| 100000/100000 [15:00<00:00, 111.05it/s]


In [47]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W15
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W12
...,...,...,...,...
12599995,W,100000,R4Y1,Y03
12599996,W,100000,R4Z1,Z03
12599997,W,100000,R5WX,W11
12599998,W,100000,R5YZ,Y03


In [48]:
submission.to_csv('submission.csv')