In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss

## Data Prep

In [2]:
# Load the pairwise predictions; the 'ID' column is not used in this recap.
pdf = pd.read_csv('ncaaw_risky_full.csv').drop(columns='ID')

In [3]:
directory = 'WDataFiles_Stage2'
# Keep only the 2022 seeds; once filtered, the Season column carries no information.
df_seeds = (
    pd.read_csv(f'{directory}/WNCAATourneySeeds.csv')
    .loc[lambda seeds: seeds['Season'] == 2022]
    .drop(columns='Season')
    .reset_index(drop=True)
)

In [4]:
# Lookup from the second column of df_seeds to the first
# (presumably team ID -> seed string; confirm against the CSV schema).
seeds_dict = dict(zip(df_seeds.iloc[:, 1], df_seeds.iloc[:, 0]))

In [5]:
# The first character of every seed string is its region letter.
regions = df_seeds['Seed'].str[0].unique()
regions

array(['W', 'X', 'Y', 'Z'], dtype=object)

In [6]:
# Attach each side's seed by looking its team ID up in seeds_dict.
for seed_col, id_col in (('ASeed', 'ATeamID'), ('BSeed', 'BTeamID')):
    pdf[seed_col] = pdf[id_col].map(seeds_dict)
pdf.head()

Unnamed: 0,Pred,ATeam,ATeamID,BTeam,BTeamID,ASeed,BSeed
0,0.483309,Albany (NY),3107,American,3110,Z16,Z14
1,0.011496,Albany (NY),3107,Arizona,3112,Z16,Y04
2,0.080946,Albany (NY),3107,Arkansas,3116,Z16,X10
3,0.000845,Albany (NY),3107,Baylor,3124,Z16,Z02
4,0.248177,Albany (NY),3107,Belmont,3125,Z16,Z12


## Round-by-Round Recap

### First Four

In [7]:
first_four_eliminations = ['Incarnate Word', 'DePaul', "Mount St. Mary's", 'Florida State']
# Discard every pairing in which either side is one of the listed teams.
pdf = pdf[~(pdf.ATeam.isin(first_four_eliminations) | pdf.BTeam.isin(first_four_eliminations))]
pdf.shape

(2016, 7)

### First Round

In [8]:
# Build the first-round pairings: within each region, seed i meets seed 17 - i.
dfs = []
for region in regions:
    for i in range(1, 9):
        # Zero-pad to two digits so the seeds read e.g. 'W01' ... 'W16'.
        up_seed = f'{region}{i:02d}'
        down_seed = f'{region}{17 - i:02d}'
        # Prefix match (not a regex substring match): some seeds carry a
        # trailing letter such as 'W16a', which must still match 'W16'.
        matchup_cond1 = pdf.ASeed.str.startswith(up_seed) & pdf.BSeed.str.startswith(down_seed)
        matchup_cond2 = pdf.BSeed.str.startswith(up_seed) & pdf.ASeed.str.startswith(down_seed)
        game = pdf[matchup_cond1 | matchup_cond2]
        # Results are later assigned by row position, so a pairing that matched
        # zero or several rows would silently shift every label after it.
        assert len(game) == 1, f'{up_seed} vs {down_seed}: expected 1 row, found {len(game)}'
        dfs.append(game)
first_round = pd.concat(dfs)[['ATeam', 'ASeed', 'BTeam', 'BSeed', 'Pred']].reset_index(drop=True)
# Res is filled in per day once the games have been played.
first_round['Res'] = np.nan
first_round

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res
0,Longwood,W16a,North Carolina State,W01,1e-05,
1,Connecticut,W02,Mercer,W15,0.99999,
2,Charlotte,W14,Indiana,W03,1e-05,
3,IUPUI,W13,Oklahoma,W04,0.353599,
4,Massachusetts,W12,Notre Dame,W05,0.362203,
5,Kentucky,W06,Princeton,W11,0.465116,
6,Florida,W10,Central Florida,W07,0.284229,
7,Kansas State,W09,Washington State,W08,0.545299,
8,Montana State,X16,Stanford,X01,1e-05,
9,Fairfield,X15,Texas,X02,1e-05,


In [9]:
# Day 1 of the first round. Games are addressed by their row index in
# `first_round`; Res is the observed label that Pred is scored against
# (presumably 1 = ATeam won -- confirm).
day_1 = first_round.copy()
day_1.loc[[23, 12, 31, 25, 18], 'Res'] = 1
day_1.loc[[22, 16, 17, 11, 14, 24, 21, 15, 9, 8, 30], 'Res'] = 0
# Keep only the games labelled above. The explicit copy detaches the filtered
# frame so that adding a column does not raise SettingWithCopyWarning.
day_1 = day_1[day_1['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_1['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_1['Res'], day_1['Pred'])
]
day_1.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
15,Georgia Tech,X09,Kansas,X08,0.673849,0.0,1.120394
12,Florida Gulf Coast,X12,Virginia Tech,X05,0.400765,1.0,0.91438
23,Miami (FL),Y08,South Florida,Y09,0.424494,1.0,0.856857
14,Arkansas,X10,Utah,X07,0.53863,0.0,0.773554
30,Mississippi,Z07,South Dakota,Z10,0.521937,0.0,0.738013
22,Colorado,Y07,Creighton,Y10,0.449913,0.0,0.597678
31,Gonzaga,Z09,Nebraska,Z08,0.55077,1.0,0.596438
21,Dayton,Y11a,Georgia,Y06,0.38099,0.0,0.479634
11,Delaware,X13,Maryland,X04,0.056461,0.0,0.058117
8,Montana State,X16,Stanford,X01,1e-05,0.0,1e-05


In [10]:
# Mean log loss over the day-1 games.
day_1['log_loss'].mean()

0.38344603067150146

In [11]:
# Day 2 of the first round. Games are addressed by their row index in
# `first_round`; Res is the observed label that Pred is scored against
# (presumably 1 = ATeam won -- confirm).
day_2 = first_round.copy()
day_2.loc[[7, 1, 28, 20, 19], 'Res'] = 1
day_2.loc[[29, 2, 0, 13, 27, 26, 5, 6, 10, 4, 3], 'Res'] = 0
# Keep only the games labelled above. The explicit copy detaches the filtered
# frame so that adding a column does not raise SettingWithCopyWarning.
day_2 = day_2[day_2['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_2['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_2['Res'], day_2['Pred'])
]
day_2.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
29,Brigham Young,Z06,Villanova,Z11,0.773373,0.0,1.484452
28,Belmont,Z12,Oregon,Z05,0.271256,1.0,1.304692
5,Kentucky,W06,Princeton,W11,0.465116,0.0,0.625706
7,Kansas State,W09,Washington State,W08,0.545299,1.0,0.606421
13,Missouri State,X11b,Ohio State,X06,0.405107,0.0,0.519373
4,Massachusetts,W12,Notre Dame,W05,0.362203,0.0,0.449735
3,IUPUI,W13,Oklahoma,W04,0.353599,0.0,0.436336
6,Florida,W10,Central Florida,W07,0.284229,0.0,0.334396
27,Buffalo,Z13,Tennessee,Z04,0.267457,0.0,0.311233
19,Arizona,Y04,Nevada-Las Vegas,Y13,0.769377,1.0,0.262174


In [12]:
# Mean log loss over the day-2 games.
day_2['log_loss'].mean()

0.40616105481090203

In [13]:
# Unweighted average of the two daily means; this equals the first-round mean
# only because both days were given the same number of labelled games (16 each).
0.5 * (day_1['log_loss'].mean() + day_2['log_loss'].mean())

0.3948035427412018

### Second Round

In [14]:
# Second-round pairings as [seed, seed] prefixes, grouped by playing day.
day_3_matchups = [['X01', 'X08'],
                  ['X02', 'X07'],
                  ['X04', 'X12'],
                  ['Y01', 'Y08'],
                  ['Y02', 'Y10'],
                  ['Y03', 'Y06'],
                  ['Z02', 'Z10'],
                  ['Z01', 'Z09']]

day_4_matchups = [['W01', 'W09'],
                  ['W02', 'W07'],
                  ['W03', 'W11'],
                  ['W04', 'W05'],
                  ['X03', 'X06'],
                  ['Y04', 'Y05'],
                  ['Z04', 'Z12'],
                  ['Z03', 'Z11']]

dfs = []
for up_seed, down_seed in day_3_matchups + day_4_matchups:
    # Prefix match (not a regex substring match): some seeds carry a trailing
    # letter such as 'X11b', which must still match 'X11'.
    matchup_cond1 = pdf.ASeed.str.startswith(up_seed) & pdf.BSeed.str.startswith(down_seed)
    matchup_cond2 = pdf.BSeed.str.startswith(up_seed) & pdf.ASeed.str.startswith(down_seed)
    game = pdf[matchup_cond1 | matchup_cond2]
    # Results are later assigned by row position, so a pairing that matched
    # zero or several rows would silently shift every label after it.
    assert len(game) == 1, f'{up_seed} vs {down_seed}: expected 1 row, found {len(game)}'
    dfs.append(game)
# Row order follows the lists above: day-3 games first, then day-4 games.
second_round = pd.concat(dfs)[['ATeam', 'ASeed', 'BTeam', 'BSeed', 'Pred']].reset_index(drop=True)
second_round['Res'] = np.nan
second_round

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res
0,Kansas,X08,Stanford,X01,1e-05,
1,Texas,X02,Utah,X07,0.83581,
2,Florida Gulf Coast,X12,Maryland,X04,0.28311,
3,Miami (FL),Y08,South Carolina,Y01,1e-05,
4,Creighton,Y10,Iowa,Y02,0.307062,
5,Georgia,Y06,Iowa State,Y03,0.337931,
6,Baylor,Z02,South Dakota,Z10,0.708143,
7,Gonzaga,Z09,Louisville,Z01,1e-05,
8,Kansas State,W09,North Carolina State,W01,1e-05,
9,Connecticut,W02,Central Florida,W07,0.772375,


In [15]:
# Day 3 = the first eight rows of `second_round`. Take an explicit copy:
# .head() hands back a slice of the parent frame, and writing results into it
# raises SettingWithCopyWarning and may leak the labels into `second_round`.
day_3 = second_round.head(8).copy()
# Res is the observed label that Pred is scored against
# (presumably 1 = ATeam won -- confirm).
day_3.loc[[4, 1], 'Res'] = 1
day_3.loc[[0, 2, 3, 5, 6, 7], 'Res'] = 0
day_3 = day_3[day_3['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_3['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_3['Res'], day_3['Pred'])
]
day_3.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
6,Baylor,Z02,South Dakota,Z10,0.708143,0.0,1.231491
4,Creighton,Y10,Iowa,Y02,0.307062,1.0,1.180707
5,Georgia,Y06,Iowa State,Y03,0.337931,0.0,0.412385
2,Florida Gulf Coast,X12,Maryland,X04,0.28311,0.0,0.332833
1,Texas,X02,Utah,X07,0.83581,1.0,0.179354
0,Kansas,X08,Stanford,X01,1e-05,0.0,1e-05
3,Miami (FL),Y08,South Carolina,Y01,1e-05,0.0,1e-05
7,Gonzaga,Z09,Louisville,Z01,1e-05,0.0,1e-05


In [16]:
# Mean log loss over the day-3 games.
day_3['log_loss'].mean()

0.4170999803828183

In [17]:
# Day 4 = the last eight rows of `second_round`. Take an explicit copy:
# .tail() hands back a slice of the parent frame, and writing results into it
# raises SettingWithCopyWarning and may leak the labels into `second_round`.
day_4 = second_round.tail(8).copy()
# Res is the observed label that Pred is scored against
# (presumably 1 = ATeam won -- confirm).
day_4.loc[[9, 10, 15, 11], 'Res'] = 1
day_4.loc[[8, 14, 12, 13], 'Res'] = 0
day_4 = day_4[day_4['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_4['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_4['Res'], day_4['Pred'])
]
day_4.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
12,Louisiana State,X03,Ohio State,X06,0.515688,0.0,0.725026
10,Indiana,W03,Princeton,W11,0.590322,1.0,0.527087
13,Arizona,Y04,North Carolina,Y05,0.398588,0.0,0.508474
11,Notre Dame,W05,Oklahoma,W04,0.624287,1.0,0.471145
14,Belmont,Z12,Tennessee,Z04,0.294378,0.0,0.348675
15,Michigan,Z03,Villanova,Z11,0.712489,1.0,0.338991
9,Connecticut,W02,Central Florida,W07,0.772375,1.0,0.258285
8,Kansas State,W09,North Carolina State,W01,1e-05,0.0,1e-05


In [18]:
# Mean log loss over the day-4 games.
day_4['log_loss'].mean()

0.39721173269950366

### Sweet Sixteen

In [19]:
# Sweet Sixteen pairings as [seed, seed] prefixes, grouped by playing day.
day_5_matchups = [['X02', 'X06'],
                  ['Y01', 'Y05'],
                  ['X01', 'X04'],
                  ['Y03', 'Y10']]

day_6_matchups = [['W01', 'W05'],
                  ['W02', 'W03'],
                  ['Z01', 'Z04'],
                  ['Z03', 'Z10']]

dfs = []
for up_seed, down_seed in day_5_matchups + day_6_matchups:
    # Prefix match (not a regex substring match) so that seeds with a trailing
    # letter suffix would still be found.
    matchup_cond1 = pdf.ASeed.str.startswith(up_seed) & pdf.BSeed.str.startswith(down_seed)
    matchup_cond2 = pdf.BSeed.str.startswith(up_seed) & pdf.ASeed.str.startswith(down_seed)
    game = pdf[matchup_cond1 | matchup_cond2]
    # Results are later assigned by row position, so a pairing that matched
    # zero or several rows would silently shift every label after it.
    assert len(game) == 1, f'{up_seed} vs {down_seed}: expected 1 row, found {len(game)}'
    dfs.append(game)
# Row order follows the lists above: day-5 games first, then day-6 games.
third_round = pd.concat(dfs)[['ATeam', 'ASeed', 'BTeam', 'BSeed', 'Pred']].reset_index(drop=True)
third_round['Res'] = np.nan
third_round

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res
0,Ohio State,X06,Texas,X02,0.277677,
1,North Carolina,Y05,South Carolina,Y01,0.131293,
2,Maryland,X04,Stanford,X01,0.339269,
3,Creighton,Y10,Iowa State,Y03,0.367453,
4,North Carolina State,W01,Notre Dame,W05,0.839447,
5,Connecticut,W02,Indiana,W03,0.675596,
6,Louisville,Z01,Tennessee,Z04,0.754316,
7,Michigan,Z03,South Dakota,Z10,0.595846,


In [20]:
# Day 5 = the first four rows of `third_round`. Take an explicit copy:
# .head() hands back a slice of the parent frame, and writing results into it
# raises SettingWithCopyWarning and may leak the labels into `third_round`.
day_5 = third_round.head(4).copy()
# Res is the observed label that Pred is scored against
# (presumably 1 = ATeam won -- confirm).
day_5.loc[[3], 'Res'] = 1
day_5.loc[[0, 1, 2], 'Res'] = 0
day_5 = day_5[day_5['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_5['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_5['Res'], day_5['Pred'])
]
day_5.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
3,Creighton,Y10,Iowa State,Y03,0.367453,1.0,1.00116
2,Maryland,X04,Stanford,X01,0.339269,0.0,0.414408
0,Ohio State,X06,Texas,X02,0.277677,0.0,0.325283
1,North Carolina,Y05,South Carolina,Y01,0.131293,0.0,0.14075


In [21]:
# Day 6 = the last four rows of `third_round`. Take an explicit copy:
# .tail() hands back a slice of the parent frame, and writing results into it
# raises SettingWithCopyWarning and may leak the labels into `third_round`.
day_6 = third_round.tail(4).copy()
# All four games are labelled 1; no game on this day is labelled 0.
# (Res is the observed label that Pred is scored against; presumably
# 1 = ATeam won -- confirm.)
day_6.loc[[4, 5, 6, 7], 'Res'] = 1
day_6 = day_6[day_6['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_6['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_6['Res'], day_6['Pred'])
]
day_6.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
7,Michigan,Z03,South Dakota,Z10,0.595846,1.0,0.517773
5,Connecticut,W02,Indiana,W03,0.675596,1.0,0.39216
6,Louisville,Z01,Tennessee,Z04,0.754316,1.0,0.281944
4,North Carolina State,W01,Notre Dame,W05,0.839447,1.0,0.175012


### Elite Eight


In [22]:
# Elite Eight pairings as [seed, seed] prefixes, grouped by playing day.
day_7_matchups = [['Y01', 'Y10'],
                  ['X01', 'X02']]

day_8_matchups = [['W01', 'W02'],
                  ['Z01', 'Z03']]

dfs = []
for up_seed, down_seed in day_7_matchups + day_8_matchups:
    # Prefix match (not a regex substring match) so that seeds with a trailing
    # letter suffix would still be found.
    matchup_cond1 = pdf.ASeed.str.startswith(up_seed) & pdf.BSeed.str.startswith(down_seed)
    matchup_cond2 = pdf.BSeed.str.startswith(up_seed) & pdf.ASeed.str.startswith(down_seed)
    game = pdf[matchup_cond1 | matchup_cond2]
    # Results are later assigned by row position, so a pairing that matched
    # zero or several rows would silently shift every label after it.
    assert len(game) == 1, f'{up_seed} vs {down_seed}: expected 1 row, found {len(game)}'
    dfs.append(game)
# Row order follows the lists above: day-7 games first, then day-8 games.
fourth_round = pd.concat(dfs)[['ATeam', 'ASeed', 'BTeam', 'BSeed', 'Pred']].reset_index(drop=True)
fourth_round['Res'] = np.nan
fourth_round

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res
0,Creighton,Y10,South Carolina,Y01,0.035925,
1,Stanford,X01,Texas,X02,0.611337,
2,Connecticut,W02,North Carolina State,W01,0.392467,
3,Louisville,Z01,Michigan,Z03,0.731998,


In [23]:
# Day 7 = the first two rows of `fourth_round`. Take an explicit copy:
# .head() hands back a slice of the parent frame, and writing results into it
# raises SettingWithCopyWarning and may leak the labels into `fourth_round`.
day_7 = fourth_round.head(2).copy()
# Res is the observed label that Pred is scored against
# (presumably 1 = ATeam won -- confirm).
day_7.loc[[1], 'Res'] = 1
day_7.loc[[0], 'Res'] = 0
day_7 = day_7[day_7['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_7['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_7['Res'], day_7['Pred'])
]
day_7.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
1,Stanford,X01,Texas,X02,0.611337,1.0,0.492107
0,Creighton,Y10,South Carolina,Y01,0.035925,0.0,0.036586


In [24]:
# Day 8 = the last two rows of `fourth_round`. Take an explicit copy:
# .tail() hands back a slice of the parent frame, and writing results into it
# raises SettingWithCopyWarning and may leak the labels into `fourth_round`.
day_8 = fourth_round.tail(2).copy()
# Both games are labelled 1; no game on this day is labelled 0.
# (Res is the observed label that Pred is scored against; presumably
# 1 = ATeam won -- confirm.)
day_8.loc[[2, 3], 'Res'] = 1
day_8 = day_8[day_8['Res'].notna()].copy()
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_8['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_8['Res'], day_8['Pred'])
]
day_8.sort_values('log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
2,Connecticut,W02,North Carolina State,W01,0.392467,1.0,0.935303
3,Louisville,Z01,Michigan,Z03,0.731998,1.0,0.311977


### Final Four

In [25]:
# Final Four pairings as [seed, seed] prefixes.
day_9_matchups = [['Y01', 'Z01'],
                  ['W02', 'X01']]

dfs = []
for up_seed, down_seed in day_9_matchups:
    # Prefix match (not a regex substring match) so that seeds with a trailing
    # letter suffix would still be found.
    matchup_cond1 = pdf.ASeed.str.startswith(up_seed) & pdf.BSeed.str.startswith(down_seed)
    matchup_cond2 = pdf.BSeed.str.startswith(up_seed) & pdf.ASeed.str.startswith(down_seed)
    game = pdf[matchup_cond1 | matchup_cond2]
    # Results are later assigned by row position, so a pairing that matched
    # zero or several rows would silently shift every label after it.
    assert len(game) == 1, f'{up_seed} vs {down_seed}: expected 1 row, found {len(game)}'
    dfs.append(game)
semi = pd.concat(dfs)[['ATeam', 'ASeed', 'BTeam', 'BSeed', 'Pred']].reset_index(drop=True)
semi['Res'] = np.nan
semi

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res
0,Louisville,Z01,South Carolina,Y01,0.375977,
1,Connecticut,W02,Stanford,X01,0.389444,


In [26]:
# Day 9 covers both semifinals; games are addressed by their row index in `semi`.
day_9 = semi.copy()
day_9.loc[[1], 'Res'] = 1
day_9.loc[[0], 'Res'] = 0
# Keep only the games that have been labelled.
day_9 = day_9[day_9['Res'].notna()]
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_9['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_9['Res'], day_9['Pred'])
]
day_9.sort_values(by='log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
1,Connecticut,W02,Stanford,X01,0.389444,1.0,0.943036
0,Louisville,Z01,South Carolina,Y01,0.375977,0.0,0.471569


### Final

In [27]:
# Championship pairing as a [seed, seed] prefix pair.
day_10_matchups = [['W02', 'Y01']]

dfs = []
for up_seed, down_seed in day_10_matchups:
    # Prefix match (not a regex substring match) so that seeds with a trailing
    # letter suffix would still be found.
    matchup_cond1 = pdf.ASeed.str.startswith(up_seed) & pdf.BSeed.str.startswith(down_seed)
    matchup_cond2 = pdf.BSeed.str.startswith(up_seed) & pdf.ASeed.str.startswith(down_seed)
    game = pdf[matchup_cond1 | matchup_cond2]
    # The result is later assigned by row position, so the pairing must
    # resolve to exactly one row.
    assert len(game) == 1, f'{up_seed} vs {down_seed}: expected 1 row, found {len(game)}'
    dfs.append(game)
final = pd.concat(dfs)[['ATeam', 'ASeed', 'BTeam', 'BSeed', 'Pred']].reset_index(drop=True)
final['Res'] = np.nan
final

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res
0,Connecticut,W02,South Carolina,Y01,0.364375,


In [28]:
# Day 10 is the championship game: the single row of `final`, labelled 0.
day_10 = final.copy()
day_10.loc[[0], 'Res'] = 0
# Keep only the games that have been labelled.
day_10 = day_10[day_10['Res'].notna()]
# Per-game log loss; labels=[0, 1] is required because each call sees one class only.
day_10['log_loss'] = [
    log_loss([res], [pred], labels=[0, 1])
    for res, pred in zip(day_10['Res'], day_10['Pred'])
]
day_10.sort_values(by='log_loss', ascending=False)

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
0,Connecticut,W02,South Carolina,Y01,0.364375,0.0,0.453146


## Games with the biggest penalty in log loss

In [30]:
# Stack every scored day; each frame keeps its own per-round row index,
# so index values repeat across rounds.
temp = pd.concat(
    [day_1, day_2, day_3, day_4, day_5,
     day_6, day_7, day_8, day_9, day_10],
    axis=0,
)
# The twelve games with the highest log loss.
temp.sort_values(by='log_loss', ascending=False).iloc[:12]

Unnamed: 0,ATeam,ASeed,BTeam,BSeed,Pred,Res,log_loss
29,Brigham Young,Z06,Villanova,Z11,0.773373,0.0,1.484452
28,Belmont,Z12,Oregon,Z05,0.271256,1.0,1.304692
6,Baylor,Z02,South Dakota,Z10,0.708143,0.0,1.231491
4,Creighton,Y10,Iowa,Y02,0.307062,1.0,1.180707
15,Georgia Tech,X09,Kansas,X08,0.673849,0.0,1.120394
3,Creighton,Y10,Iowa State,Y03,0.367453,1.0,1.00116
1,Connecticut,W02,Stanford,X01,0.389444,1.0,0.943036
2,Connecticut,W02,North Carolina State,W01,0.392467,1.0,0.935303
12,Florida Gulf Coast,X12,Virginia Tech,X05,0.400765,1.0,0.91438
23,Miami (FL),Y08,South Florida,Y09,0.424494,1.0,0.856857


## Overall mean log loss

In [31]:
# Mean log loss over every scored game in the tournament.
temp['log_loss'].mean()

0.4133400084989658