In [1]:
import pandas as pd
import numpy as np
df = pd.concat([pd.read_csv('DataFiles/RegularSeasonCompactResults.csv'),
               pd.read_csv('DataFiles/NCAATourneyCompactResults.csv')]).reset_index(drop=True)
df = df.sort_values(by=['Season','DayNum'])

In [13]:
def forecast(df,K,HCA,R,return_prob=False):
    start = time.time()
    prob_all = np.zeros(len(df))
    WElo = np.zeros(len(df))
    LElo = np.zeros(len(df))
    
    team_elo = {t:1300 for t in df['WTeamID'].unique()}
    season = 1985
    for i,game in df.iterrows():
        
        if game['Season'] > season:
            for team,elo in team_elo.items():
                if team_elo[team] != 1300:
                    team_elo[team] = 1505*R + elo*(1-R)
            season = game['Season']
        
        locW = game['WLoc']
        if locW == 'H':
            value = HCA
        elif locW == 'A':
            value = -HCA
        else:
            value = 0
        
        elo_diff = team_elo[game['WTeamID']]-team_elo[game['LTeamID']]+value
        probW = 1 / (10**(-elo_diff/400) + 1)  
        
        MOV = game['WScore']=game['LScore']
    
        mult = (MOV+3)**0.8 / (7.5 + 0.006*elo_diff)
    
        shift = K*mult*(1-probW)
        
        WElo[i] = team_elo[game['WTeamID']]
        LElo[i] = team_elo[game['LTeamID']]
        
        team_elo[game['WTeamID']] = team_elo[game['WTeamID']] + shift
        team_elo[game['LTeamID']] = team_elo[game['LTeamID']] - shift
        
        prob_all[i] = probW
        
        
    idx = df[df['Season'] >= 2003].iloc[0].name
    prob_set = prob_all[idx:]
    score = (prob_set > 0.5).sum()/len(prob_set)
    sumprob = prob_set.sum()/len(prob_set)
    logloss = -np.log(prob_set).mean()
    
    end = time.time()
    
    dt = end-start
        
    if return_prob:
        return prob_all, WElo, LElo
    else:
        result = {'K':K,'HCA':HCA,'R':R,'score':score,'sumprob':sumprob,'logloss':logloss}
        print('K=%d,HCA=%d,R=1/%d...score=%0.3f,sumprob=%0.3f...Elapsed Time: %0.1f sec' % \
              (K,HCA,1/R,score,sumprob,dt))
        return result

In [14]:
import time
import multiprocessing as mp

k = [5,10,20,30]
hca=[50,100,150]
r=[1/20,1/10,1/8,1/6]

start = time.time()
pool = mp.Pool(processes=4)
results = [pool.apply_async(forecast, args = (df,K,HCA,R)) \
           for K in k for HCA in hca for R in r]
output = [p.get() for p in results]

end = time.time()

t = (end-start)/60
print('Elapsed time: %0.1f min' % t)

K=5,HCA=50,R=1/8...score=0.722,sumprob=0.600...Elapsed Time: 50.5 sec
K=5,HCA=50,R=1/20...score=0.720,sumprob=0.609...Elapsed Time: 50.6 sec
K=5,HCA=50,R=1/10...score=0.722,sumprob=0.603...Elapsed Time: 50.7 sec
K=5,HCA=50,R=1/6...score=0.723,sumprob=0.595...Elapsed Time: 50.5 sec
K=5,HCA=100,R=1/20...score=0.722,sumprob=0.622...Elapsed Time: 33.6 sec
K=5,HCA=100,R=1/10...score=0.722,sumprob=0.615...Elapsed Time: 33.8 sec
K=5,HCA=100,R=1/6...score=0.720,sumprob=0.609...Elapsed Time: 33.7 sec
K=5,HCA=100,R=1/8...score=0.721,sumprob=0.613...Elapsed Time: 34.0 sec
K=5,HCA=150,R=1/10...score=0.710,sumprob=0.627...Elapsed Time: 27.5 sec
K=5,HCA=150,R=1/20...score=0.714,sumprob=0.632...Elapsed Time: 27.9 sec
K=5,HCA=150,R=1/8...score=0.709,sumprob=0.624...Elapsed Time: 27.7 sec
K=5,HCA=150,R=1/6...score=0.707,sumprob=0.621...Elapsed Time: 27.8 sec
K=10,HCA=50,R=1/20...score=0.722,sumprob=0.624...Elapsed Time: 27.8 sec
K=10,HCA=50,R=1/8...score=0.723,sumprob=0.617...Elapsed Time: 27.8 sec
K=1

Process ForkPoolWorker-8:
Process ForkPoolWorker-7:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/anaconda3/lib/python3.6/m

In [16]:
elo_df = pd.DataFrame(output)

R
0.050000    0.550130
0.100000    0.548794
0.125000    0.549596
0.166667    0.551052
Name: logloss, dtype: float64

In [17]:
best = elo_df.loc[elo_df['logloss'].idxmin()]

In [18]:
best.head()

HCA        100.000000
K           10.000000
R            0.050000
logloss      0.531333
score        0.728667
Name: 16, dtype: float64

In [19]:
df['Elo_Prob'], df['WElo'], df['LElo'] = forecast(df,best['K'],best['HCA'],best['R'],return_prob=True)

In [20]:
slots = pd.read_csv('DataFiles/NCAATourneySlots.csv')
seeds = pd.read_csv('DataFiles/NCAATourneySeeds.csv')

In [24]:
submit = []
for s in range(2014,2019):
    stats = []
    subset = df[(df['Season'] == s) & (df['DayNum'] <= 133)]
    for t in seeds[seeds['Season'] == s]['TeamID']:
        row = {'TeamID':t}
        team_stats = subset[(subset['WTeamID'] == t) | (subset['LTeamID'] == t)]
        team_stats = team_stats.iloc[0]
        if team_stats['WTeamID'] == t:
            row['EloDiff'] = team_stats['WElo'] - team_stats['LElo']
        else:
            row['EloDiff'] = team_stats['LElo'] - team_stats['WElo']
        stats.append(row)
    stats = pd.DataFrame(stats).sort_values('TeamID').set_index('TeamID')
    
    pairings = []
    for i in range(len(stats)):
        teamA = stats.iloc[i]
        for j in range(i+1,len(stats)):
            teamB = stats.iloc[j]
            x = teamA - teamB
            x['TeamA'] = teamA.name
            x['TeamB'] = teamB.name
            pairings.append(x)
    
    pairings = pd.DataFrame(pairings)
    
    pairings['Pred'] = pairings['EloDiff'].apply(lambda x: 1 / (10**(-x/400) + 1))
    
    pairings['ID'] = pairings.apply(lambda row: '%d_%d_%d' % (s,row['TeamA'],row['TeamB']),axis=1)
    
    submit.append(pairings[['ID','Pred']])
    print('Done with season %d' % s)

submit = pd.concat(submit).set_index('ID')

Done with season 2014
Done with season 2015
Done with season 2016
Done with season 2017
Done with season 2018


In [28]:
filename = 'submission_elo.csv'
submit.to_csv(filename)

message = 'Elo scores only, with K=%d R=1/%d HCA=%d' % (best['K'],1/best['R'],best['HCA'])

In [33]:
import os
os.system('kaggle competitions submit -c mens-machine-learning-competition-2019 -f %s -m "%s"' % (filename,message))
print('Submission Complete!')

Submission Complete!
