In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import brier_score_loss
from sklearn.model_selection import TimeSeriesSplit

In [2]:
class NCAAELO:
    """Sequential Elo ratings computed from a table of game results.

    The frame passed in must provide ``Season``, ``WTeamID``, ``LTeamID`` and
    ``WLoc`` columns and must already be in chronological order: games are
    replayed in row order, and a season boundary is detected whenever the
    ``Season`` value differs from the previous row's.

    Typical use::

        elo = NCAAELO(games, season_adj=0.85).process_elo()
        elo.winner_probs   # pre-game win probability of each row's winner
        elo.elo_dict       # ratings after the last game
    """

    # Rating assigned to teams that have no rating yet; also the mean that
    # ratings are pulled toward at a season boundary.
    INITIAL_ELO = 1500

    def __init__(self, df: pd.DataFrame = None, base_k: float = 49, home_adv: float = 105, elo_dict: dict = None,
                 season_adj: float = None):
        """
        Parameters
        ----------
        df : pd.DataFrame
            Game results, one row per game, in chronological order.
        base_k : float
            K-factor: the rating change per game is ``base_k * (1 - p_winner)``.
        home_adv : float
            Rating points added to the winner's rating when ``WLoc == 'H'``
            and subtracted when ``WLoc == 'A'``.
        elo_dict : dict, optional
            Starting ratings keyed by team id. The dictionary is copied, so
            the caller's object is never modified. Teams that appear in
            ``df`` but not in the dictionary start at ``INITIAL_ELO``.
        season_adj : float, optional
            Fraction of a rating carried over at a season boundary; the
            remainder is replaced by ``INITIAL_ELO``. ``None`` carries the
            ratings over unchanged.

        Raises
        ------
        TypeError
            If ``df`` is not provided.
        """
        if df is None:
            raise TypeError(
                "df must be a DataFrame with 'Season', 'WTeamID', 'LTeamID' and 'WLoc' columns"
            )

        self.df = df
        self.teams = set(pd.concat([self.df['WTeamID'], self.df['LTeamID']]))
        self.winners = self.df.WTeamID
        self.losers = self.df.LTeamID
        self.seasons = self.df.Season
        self.win_loc = self.df.WLoc
        self.base_k = base_k
        self.home_advantage = home_adv
        self.season_adj = season_adj
        self.processed = False
        # winner_elo / loser_elo are kept for compatibility; no method in this
        # class populates them.
        self.winner_elo = []
        self.loser_elo = []
        self.winner_probs = []

        # Work on a copy: ratings are updated in place while games are
        # replayed, and the caller's dictionary must not change under them.
        self.elo_dict = dict(elo_dict) if elo_dict is not None else {}

        # adding teams to elo_dict if they aren't included in the provided
        # dictionary
        for team in self.teams:
            self.elo_dict.setdefault(team, self.INITIAL_ELO)

    def update_elo(self, winner=None, loser=None, win_loc=None):
        """Apply the result of one game to ``elo_dict``.

        Parameters
        ----------
        winner, loser
            Team ids; both must already be keys of ``elo_dict``.
        win_loc
            ``'H'`` adds ``home_advantage`` to the winner's rating for the
            probability calculation, ``'A'`` subtracts it, and any other
            value (e.g. ``'N'``) applies no adjustment.

        Returns
        -------
        float
            The winner's win probability computed from the pre-game ratings.
        """
        if win_loc == 'H':
            adv = self.home_advantage
        elif win_loc == 'A':
            adv = -self.home_advantage
        else:
            adv = 0

        # The location bonus only affects the expectation; it is never
        # written back into the stored ratings.
        prematch_winner_elo = self.elo_dict[winner] + adv
        prematch_loser_elo = self.elo_dict[loser]

        exp_a = 1 / \
            (1 + 10 ** ((prematch_loser_elo - prematch_winner_elo)/400))
        exp_b = 1 - exp_a

        # Actual score is 1 for the winner and 0 for the loser, so the two
        # deltas are equal in magnitude and opposite in sign.
        winner_delta = self.base_k * (1 - exp_a)
        loser_delta = self.base_k * (0 - exp_b)

        self.elo_dict[winner] = self.elo_dict[winner] + winner_delta
        self.elo_dict[loser] = self.elo_dict[loser] + loser_delta

        return exp_a

    def _regress_to_mean(self):
        """Pull every rating toward ``INITIAL_ELO`` by ``1 - season_adj``."""
        if self.season_adj is None:
            # No adjustment configured: ratings carry over unchanged.
            return

        keep = self.season_adj
        self.elo_dict = {
            team: keep * rating + (1 - keep) * self.INITIAL_ELO
            for team, rating in self.elo_dict.items()
        }

    def process_elo(self):
        """Replay every game in ``df`` and record the winner's pre-game probability.

        Ratings are regressed toward the mean once each time the ``Season``
        value changes between consecutive rows. Calling the method again
        after it has run is a no-op.

        Returns
        -------
        NCAAELO
            ``self``, so the call can be chained.
        """
        if self.processed:
            return self

        current_season = None

        for season, winner, loser, loc in zip(self.seasons, self.winners, self.losers, self.win_loc):

            # Track the season actually observed rather than counting up by
            # one, so a gap in the season sequence cannot leave the tracker
            # out of sync and trigger regressions in the middle of a season.
            if current_season is not None and season != current_season:
                self._regress_to_mean()
            current_season = season

            self.winner_probs.append(
                self.update_elo(winner=winner, loser=loser, win_loc=loc)
            )

        self.processed = True

        return self

In [3]:
# Directory holding the competition CSV files, relative to the notebook.
data_folder = './kaggle_data/'

# Read the three compact-results files in one pass: regular season, NCAA
# tournament and secondary tournaments, in that order.
mens_reg, tourney_df, sec_tourn_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        f'{data_folder}MRegularSeasonCompactResults.csv',
        f'{data_folder}MNCAATourneyCompactResults.csv',
        f'{data_folder}MSecondaryTourneyCompactResults.csv',
    )
)

In [5]:
# Stack every men's game (regular season, NCAA tournament, secondary
# tournaments) into one chronologically ordered frame so that a single Elo
# pass can run over all of them.

elo_columns = ['Season', 'DayNum', 'WTeamID', 'LTeamID', 'WLoc']

elo_games = pd.concat(
    [results[elo_columns] for results in (mens_reg, tourney_df, sec_tourn_df)]
)
# Order by season, then by day within the season, and discard the per-file
# row labels left over from the concatenation.
elo_games = elo_games.sort_values(by=['Season', 'DayNum'], ascending=True)
elo_games = elo_games.reset_index(drop=True)

In [49]:
# Manual grid search over `season_adj`, printing each score to check intuition.
# For every candidate the full Elo pass is re-run, its predictions are stored
# as a new column of `elo_games`, and the per-season Brier score is averaged
# over seasons 2015 onwards.

for i in np.linspace(0.1, 0.9, 25):
    pred_col = f'{i}_elo_pred'

    elo_test_ = NCAAELO(elo_games, home_adv=88, base_k=44, season_adj=i).process_elo()
    elo_games[pred_col] = elo_test_.winner_probs

    def season_brier(season_games):
        # Each prediction is the probability given to the team that actually
        # won, so the true outcome is 1 for every row.
        return brier_score_loss(len(season_games) * [1], season_games[pred_col])

    brier_by_season = elo_games.groupby('Season').apply(season_brier)

    print(i, brier_by_season.loc[2015:].mean())

0.1 0.19707007122941222
0.13333333333333333 0.19629245276512874
0.16666666666666669 0.1955376581877026
0.2 0.19480578273986798
0.23333333333333334 0.19409690336557717
0.26666666666666666 0.1934110863463503
0.30000000000000004 0.19274839658856444
0.33333333333333337 0.19210890886628532
0.3666666666666667 0.19149272146057086
0.4 0.19089997282399826
0.43333333333333335 0.19033086215984843
0.4666666666666667 0.18978567517227282
0.5 0.18926481676941462
0.5333333333333333 0.18876885326975898
0.5666666666666667 0.18829856780930337
0.6 0.18785503440007295
0.6333333333333333 0.18743971883546462
0.6666666666666666 0.18705461905384063
0.7 0.1867024648935215
0.7333333333333333 0.18638700973740316
0.7666666666666666 0.186113468993489
0.7999999999999999 0.18588920243963758
0.8333333333333333 0.18572482117397027
0.8666666666666666 0.1856360795292152
0.9 0.18564733800433492


In [28]:
# Out-of-fold Elo predictions using expanding time-series splits: ratings are
# fitted on the training rows of a fold and then carried into a second pass
# over that fold's validation rows.
#
# NOTE(review): `elo_df` is not defined in the visible cells — presumably it is
# the same frame as `elo_games`; confirm before running on a fresh kernel.
# NOTE(review): NCAAELO is built with its default hyperparameters here
# (`season_adj` is left as None), and nothing varies with `i`, so each of the
# ten outer iterations appears to repeat the same computation — confirm intent.
out_of_fold_predictions = []
ts_split_ = TimeSeriesSplit(n_splits=8)

for i in range(10):
    preds = []

    for train_i, val_i in ts_split_.split(elo_df):
        train_games = elo_df.iloc[train_i]
        val_games = elo_df.iloc[val_i]

        # Fit ratings on the training rows only ...
        train_instance_ = NCAAELO(train_games).process_elo()

        # ... then start the validation pass from those fitted ratings.
        val_instance_ = NCAAELO(val_games, elo_dict=train_instance_.elo_dict).process_elo()

        # Keep the validation row positions next to their predictions.
        preds.append([val_i, val_instance_.winner_probs])

    out_of_fold_predictions.append(preds)