In [1]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import KFold

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [31]:
a_df = pd.read_csv('data/merged_games.csv')

In [32]:
a_df

Unnamed: 0,game_id,season,week,home_team,away_team,home_win_game,home_passAttempts,home_passCompletions,home_passPct,home_passGrossYards,...,away_kr20Plus,away_fgMade,away_field_goal_pct,away_punt_inside_20_pct,away_third_down_pct,away_fourth_down_pct,away_penalties,away_team_score,away_opponent_score,away_wins_past_games
0,30373,2014,8,ARI,PHI,1,34.833333,19.833333,56.500000,237.833333,...,1.000000,2.000000,79.166667,40.283333,39.516667,16.666667,7.166667,28.500000,22.000000,5
1,30400,2014,10,ARI,LA,1,36.500000,20.500000,56.350000,255.666667,...,2.166667,0.666667,50.000000,41.383333,42.033333,25.000000,7.500000,19.666667,28.166667,2
2,30415,2014,11,ARI,DET,1,37.500000,22.000000,58.700000,267.666667,...,1.000000,1.333333,58.333333,46.500000,30.300000,19.450000,7.500000,20.166667,16.166667,5
3,51762,2014,14,ARI,KC,1,35.500000,21.500000,60.533333,275.500000,...,2.666667,1.333333,100.000000,40.483333,32.216667,25.000000,4.666667,21.500000,17.166667,4
4,30488,2014,16,ARI,SEA,0,32.833333,19.666667,59.033333,242.166667,...,1.166667,2.166667,96.666667,25.000000,45.850000,16.666667,8.333333,22.833333,11.333333,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,51560,2019,7,WAS,SF,0,34.000000,22.000000,63.550000,227.000000,...,0.500000,1.666667,72.233333,30.550000,44.950000,50.000000,6.333333,29.833333,18.666667,5
1402,51616,2019,11,WAS,NYJ,0,22.333333,14.166667,64.850000,135.500000,...,1.833333,1.000000,40.283333,19.583333,24.716667,41.666667,8.666667,15.833333,27.666667,2
1403,51630,2019,12,WAS,DET,1,23.500000,14.833333,64.966667,147.166667,...,1.500000,1.666667,75.000000,44.450000,44.216667,25.000000,7.833333,24.500000,29.500000,1
1404,51671,2019,15,WAS,PHI,0,26.666667,15.166667,58.083333,165.666667,...,0.833333,1.666667,87.500000,31.116667,41.166667,25.000000,5.833333,21.000000,19.166667,3


In [4]:
from src.remove_correlated_stats import remove_corr_stats
from src.my_predicted_stats import my_pred_stats

In [5]:
df_train = pd.read_csv('data/merged_games.csv')
df_test = pd.read_csv('data/merged_games.csv')
df_train = df_train[df_train['season']<2019].copy()
df_test = df_test[df_test['season']>=2019].copy()
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
df_train = remove_corr_stats(df_train)
df_test = remove_corr_stats(df_test)

In [6]:
X_train = np.array(df_train.drop(columns=['game_id', 'season', 'week', 'home_team','away_team','home_win_game']))
y_train = np.array(df_train.loc[:,'home_win_game'])
X_test = np.array(df_test.drop(columns=['game_id', 'season', 'week', 'home_team','away_team','home_win_game']))
y_test = np.array(df_test.loc[:,'home_win_game'])

In [7]:
model = GradientBoostingClassifier(learning_rate=0.01,
                                   n_estimators=500,
                                   min_samples_leaf=5,
                                   max_depth=2,
                                   subsample=0.5)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
y_true = y_test
print("Accuracy:", accuracy_score(y_true, y_predict))
print("Precision:", precision_score(y_true, y_predict))
print("Recall:", recall_score(y_true, y_predict))

Accuracy: 0.61328125
Precision: 0.5882352941176471
Recall: 0.8333333333333334


In [8]:
proba_away_team_wins = model.predict_proba(X_test)[:,0]
proba_home_team_wins = model.predict_proba(X_test)[:,1]

In [9]:
df_gambling = df_test.loc[:,['game_id', 'season', 'week', 'home_team','away_team','home_win_game']]
df_gambling['proba_away_team_wins'] = proba_away_team_wins
df_gambling['proba_home_team_wins'] = proba_home_team_wins

In [50]:
threshold = 0.5

In [51]:
df_threshold = df_gambling[(df_gambling['proba_away_team_wins']>threshold) | (df_gambling['proba_home_team_wins']>threshold)].copy()

In [52]:
df_threshold.reset_index(inplace=True, drop=True)

In [53]:
df_threshold['prediction'] = (df_threshold['proba_home_team_wins'] > df_threshold['proba_away_team_wins']).astype(int)

In [54]:
accuracy_score(np.array(df_threshold['home_win_game']), 
               np.array(df_threshold['prediction']))

0.61328125

In [55]:
df_threshold['result'] = (df_threshold['home_win_game'] == df_threshold['prediction']).astype(int)

In [56]:
df_threshold['result'] = df_threshold['result'].replace(0, -1)

In [57]:
betting_lines = pd.read_csv('data/betting_lines.csv')
betting_lines.drop(columns='Unnamed: 0', inplace=True)

In [58]:
summaried_betting_lines = betting_lines.loc[:,['game_id','season','awayLine','homeLine']].copy()
summaried_betting_lines['season']= summaried_betting_lines['season'].map(lambda x: x[0:4]).astype(int)
df_threshold_2019 = df_threshold[df_threshold['season']>2018].copy()


In [59]:
gambling_df = df_threshold_2019.merge(summaried_betting_lines,on='game_id',how='left')

In [60]:
gambling_df['awayLine'] = gambling_df['awayLine'].map(lambda x: 100/-x if x < 0 else x)
gambling_df['homeLine'] = gambling_df['homeLine'].map(lambda x: 100/-x if x < 0 else x)
gambling_df['awayLine'] = gambling_df['awayLine'].map(lambda x: x/100 if x > 1 else x)
gambling_df['homeLine'] = gambling_df['homeLine'].map(lambda x: x/100 if x > 1 else x)

In [61]:
def calc_poss_winnings(bet, line):
    return bet * line

In [62]:
gambling_df['possible_winnings'] = gambling_df.apply(lambda x: (calc_poss_winnings(100,x['homeLine']) if x['prediction']==1 
                                    else calc_poss_winnings(100,x['awayLine'])),axis=1)

In [63]:
gambling_df['wins_or_loss'] = gambling_df.apply(lambda x: x['possible_winnings'] if x['result']==1 
                                else -100,axis=1)

In [64]:
gambling_df['wins_or_loss'].sum()

-813.0695675184552

In [65]:
gambling_df

Unnamed: 0,game_id,season_x,week,home_team,away_team,home_win_game,proba_away_team_wins,proba_home_team_wins,prediction,result,season_y,awayLine,homeLine,possible_winnings,wins_or_loss
0,51473,2019,1,ARI,DET,0,0.638366,0.361634,0,1,2019,0.666667,1.300000,66.666667,66.666667
1,51503,2019,3,ARI,CAR,0,0.627336,0.372664,0,1,2019,0.714286,1.200000,71.428571,71.428571
2,51519,2019,4,ARI,SEA,0,0.599608,0.400392,0,1,2019,0.625000,1.300000,62.500000,62.500000
3,51548,2019,6,ARI,ATL,1,0.608356,0.391644,0,-1,2019,0.769231,1.000000,76.923077,-100.000000
4,51582,2019,9,ARI,SF,0,0.545588,0.454412,0,1,2019,0.222222,3.250000,22.222222,22.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,51560,2019,7,WAS,SF,0,0.762143,0.237857,0,1,2019,0.200000,3.500000,20.000000,20.000000
268,51616,2019,11,WAS,NYJ,0,0.431062,0.568938,1,-1,2019,1.050000,0.800000,80.000000,-100.000000
269,51630,2019,12,WAS,DET,1,0.599902,0.400098,0,-1,2019,0.526316,1.650000,52.631579,-100.000000
270,51671,2019,15,WAS,PHI,0,0.502556,0.497444,0,1,2019,0.444444,1.850000,44.444444,44.444444
