In [15]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [16]:
df = pd.read_csv(r'C:\Users\tg4923\Documents\GitHub\March-Madness\data\2010\Events_2010.csv')

In [17]:
#df = df.loc[0:10000,:]

In [18]:
# Make a game ID to Aggregate
df['GameID'] = df['DayNum'].map(str) +"-"+ df['WTeamID'].map(str) +"-"+ df['LTeamID'].map(str)

In [19]:
df.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WPoints,LPoints,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,GameID
0,1,2010,7,1143,1293,0,0,0,1143,600578,sub_in,7-1143-1293
1,2,2010,7,1143,1293,0,0,0,1143,600584,sub_in,7-1143-1293
2,3,2010,7,1143,1293,0,0,0,1143,600585,sub_in,7-1143-1293
3,4,2010,7,1143,1293,0,0,10,1143,600581,miss2_lay,7-1143-1293
4,5,2010,7,1143,1293,0,0,10,1143,600581,reb_off,7-1143-1293


In [20]:
df['Event_Ones'] = 1

In [21]:
team_game_stats = pd.pivot_table(df,columns=['EventType'], aggfunc = np.sum, values = 'Event_Ones',index = ['GameID','EventTeamID','WTeamID'])

In [22]:
team_game_stats.reset_index(inplace = True)

In [23]:
team_game_stats.fillna(inplace=True,value=0)

From a high level, we are looking to compare two teams at a time to determine whether or not they would win a majority of the time against each other. We want to find the most likely outcome from the tournament. 

Lets start with the assumption that we have 65 teams in the tournement. Each team has a 50% chance of winning against the other team, and there for each team has a 1 in 65 chance of winning the tournement. Obviously this is not true, the 65th team will have a much lower chance of winning the tournement over the top team. But in reality, when the top team and the 65th best team start the game, the probablility that either team wins is exaclty 50% (let's call this, oh I don't know, a "prior probability") 


For each statistical category, each team is a sample from the population that is all the teams in that category. We can keep it simple and have a normal distribution of stats accross all games in that season. (Factor in competition) 

In [24]:
winner = lambda x,y: 1 if x == y else 0
team_game_stats['Winner']=team_game_stats.apply(lambda row: winner(row['EventTeamID'],row['WTeamID']), axis=1)

In [25]:
team_game_stats.drop(['GameID','EventTeamID','WTeamID'],inplace=True,axis=1)

In [26]:
team_game_stats

EventType,assist,block,foul_pers,foul_tech,made1_free,made2_dunk,made2_jump,made2_lay,made2_tip,made3_jump,...,reb_dead,reb_def,reb_off,steal,sub_in,sub_out,timeout,timeout_tv,turnover,Winner
0,13.0,1.0,23.0,0.0,6.0,0.0,6.0,13.0,0.0,3.0,...,2.0,23.0,14.0,5.0,47.0,47.0,3.0,4.0,12.0,0
1,12.0,4.0,17.0,0.0,26.0,0.0,4.0,12.0,1.0,3.0,...,3.0,27.0,12.0,8.0,30.0,31.0,2.0,3.0,9.0,1
2,6.0,1.0,14.0,0.0,8.0,0.0,5.0,7.0,0.0,4.0,...,2.0,27.0,7.0,3.0,18.0,18.0,3.0,4.0,16.0,0
3,13.0,4.0,14.0,0.0,8.0,3.0,6.0,17.0,0.0,4.0,...,1.0,31.0,12.0,11.0,15.0,15.0,0.0,3.0,9.0,1
4,9.0,7.0,17.0,0.0,23.0,3.0,7.0,9.0,1.0,8.0,...,4.0,28.0,12.0,13.0,29.0,29.0,2.0,5.0,8.0,1
5,14.0,3.0,22.0,0.0,4.0,1.0,9.0,5.0,0.0,6.0,...,4.0,22.0,7.0,5.0,34.0,34.0,2.0,3.0,17.0,0
6,12.0,2.0,22.0,1.0,18.0,0.0,10.0,8.0,2.0,8.0,...,6.0,23.0,16.0,9.0,26.0,26.0,1.0,5.0,11.0,1
7,9.0,4.0,21.0,0.0,20.0,1.0,7.0,5.0,1.0,5.0,...,1.0,24.0,12.0,4.0,40.0,40.0,4.0,3.0,19.0,0
8,6.0,9.0,18.0,1.0,20.0,2.0,8.0,3.0,0.0,3.0,...,2.0,32.0,7.0,1.0,21.0,21.0,4.0,1.0,12.0,1
9,11.0,2.0,21.0,0.0,13.0,2.0,9.0,3.0,0.0,4.0,...,2.0,19.0,9.0,3.0,41.0,41.0,4.0,7.0,3.0,0


In [28]:
y = team_game_stats.pop('Winner')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(team_game_stats,y,test_size=0.33,random_state=42)

In [30]:
std = StandardScaler()

In [56]:
def model_wrap(X,y,X_test,y_test,standardize_obj=std,a=0.0001):
    X = standardize_obj.fit_transform(X)
    print(X)
    ridge = Ridge(alpha=a)
    ridge.fit(X,y)
    y_pred = ridge.predict(standardize_obj.fit_transform(X_test))
    print(roc_auc_score(y_test,y_pred))

In [57]:
model_wrap(X_train,y_train,X_test,y_test)

[[ 0.24602413 -0.19376048  0.30231803 ...,  0.3523663   0.19741392
   2.19931223]
 [ 1.84176624 -0.19376048 -0.35525637 ..., -0.27448792  0.19741392
  -1.82744638]
 [-1.34971797  0.22280089  0.5215095  ...,  0.3523663   0.79032916
  -0.64310561]
 ..., 
 [-0.89379166  0.63936225 -0.1360649  ...,  0.3523663  -1.58133179
  -0.87997377]
 [ 0.70195045 -1.44344457  1.8366583  ...,  0.3523663   0.79032916
  -0.64310561]
 [-1.12175481 -0.19376048  0.95989243 ..., -1.52819634  2.56907487
  -0.16936931]]
0.958472472129
