In [29]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score,cross_validate,train_test_split

In [2]:
def ou_accuracy(actu,pred):
    global dfall
    ou = dfall.loc[actu.index,'Over/Under']
    if not((len(actu) == len(pred)) & (len(pred) == len(ou))):
        if (len(actu) != len(pred)):
            raise ValueError(f'Length mismatch among arguments: actual != predicted ({len(actu)} != {len(pred)})')
        else:
            raise ValueError(f'Length mismatch among arguments: predicted != Over/Under ({len(pred)} != {len(ou)})')
    else:
        corr = (((actu<ou) & (pred<ou)).sum() + ((actu>ou) & (pred>ou)).sum())
        tot = (actu<ou).sum() + (actu>ou).sum()
        return corr/tot

In [3]:
def regress(reg,Xtrain,Xtest,ytrain,ytest,cv=5,print_metrics=True,print_coef=True,return_metrics=False):
    robust = RobustScaler()
    sca_train = robust.fit_transform(Xtrain)
    sca_test = robust.transform(Xtest)
    val = cross_validate(reg, sca_train, ytrain, cv=cv, return_estimator=True,
                                scoring={'mae':make_scorer(mean_absolute_error,greater_is_better=False),
                                        'acc':make_scorer(ou_accuracy,greater_is_better=True)})
    acc = (np.mean(val['test_acc']),np.std(val['test_acc']))
    mae = (np.mean(-val['test_mae']),np.std(-val['test_mae']))
    best_estimator = val['estimator'][val['test_mae'].argmax()]

    ypred=best_estimator.predict(sca_test)
    mae_test=mean_absolute_error(ytest,ypred)
    acc_test = ou_accuracy(ytest,ypred)
    sorted_coef = get_coef(best_estimator.coef_,Xtrain.columns)
    if print_metrics:
        print(f'MAE_train = {mae[0]:.3f}±{mae[1]:.3f}, Accuracy = {acc[0]:.3f}±{acc[1]:.3f}')
        print(-val['test_mae'])
#        print(f'MAE_test = {mae_test:.3f}, Accuracy_test = {acc_test:.3f}')
    if print_coef:
        print(sorted_coef)
    if return_metrics:
        return mae_test,acc_test
    else:
        return

In [4]:
def get_coef(coef_,cols):
    coef = pd.DataFrame(coef_, cols, columns=['LR Coefficient'])
    ind=np.flip(np.argsort(abs(coef_)))
    return coef.iloc[ind]

In [23]:
path = 'data/df_complete.pkl'
with open(path,'rb') as f:
    dfall = pickle.load(f)
turf=['a_turf','astroplay','fieldturf','sportturf','matrixturf']
LR = linear_model.LinearRegression()
robust = RobustScaler()

In [25]:
vifdrop=['H_Pass_Metric','V_Pass_Metric','H_Rush_Metric','V_Rush_Metric',
         'H_Pts','V_Pts','H_Pts_Opp','V_Pts_Opp','H_Poss','V_Poss','dome','H_Plays','V_Plays']

In [26]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Use training ratio of 70% for VIF analysis
train_ratio = 0.70

path = 'data/df_working5.pkl'
with open(path,'rb') as f:
    dfgame = pickle.load(f)

X = dfgame.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
X = X.drop(turf,axis=1)
X = X.drop(vifdrop,axis=1)
y = dfgame['Tot_Pts']

In [30]:
# W/out Pass/Rush Metrics, Scores, Possession, Plays, dome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

vif = pd.DataFrame()
vif['Features'] = X_train.columns
xarray = sm.add_constant(X_train.values)
vif['VIF Factor'] = [variance_inflation_factor(xarray,i) for i in range(1,len(X_train.columns)+1)]
vif.round(2).sort_values(by='VIF Factor',ascending=False)

Unnamed: 0,Features,VIF Factor
31,Wind,1.42
0,H_Off_Pass,1.41
1,V_Off_Pass,1.41
30,Temperature,1.4
3,V_Off_Rush,1.38
2,H_Off_Rush,1.35
32,Humidity,1.34
13,V_RZ_Pct,1.3
12,H_RZ_Pct,1.29
5,V_Def_Pass,1.28


In [31]:
LR = linear_model.LinearRegression()
LR.fit(X_train,y_train)
ypred=LR.predict(X_train)
print(f'MAE = {mean_absolute_error(y_train,ypred)}')
print(f'Intercept = {LR.intercept_}')
get_coef(LR.coef_,X_train.columns)

MAE = 10.498044582324255
Intercept = -14.09805569896863


Unnamed: 0,LR Coefficient
V_FG_Pct,8.333853
V_Def_RZ_Pct,6.351779
H_TD_on_Def,-3.447254
V_TD_on_Def,-2.430034
H_Def_RZ_Pct,-1.987236
grass,-1.760693
H_TO_Gain,1.346844
H_RZ_Pct,1.230331
V_TO_Gain,1.120137
V_Sacks_Def,1.00234


In [32]:
ou_accuracy(y_train,ypred)

0.5396825396825397

In [34]:
vifdrop=['H_Off_Pass','V_Off_Pass','H_Off_Rush','V_Off_Rush','H_Def_Pass','V_Def_Pass','H_Def_Rush',
         'V_Def_Rush','H_Pts','V_Pts','H_Pts_Opp','V_Pts_Opp','dome','V_Poss','H_Poss']

In [35]:
train_ratio = 0.70
path = 'data/df_working5.pkl'
with open(path,'rb') as f:
    dfgame = pickle.load(f)
X = dfgame.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
X = X.drop(turf,axis=1)
X = X.drop(vifdrop,axis=1)
y = dfgame['Tot_Pts']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)
vif = pd.DataFrame()
vif['Features'] = X_train.columns
xarray = sm.add_constant(X_train.values)
vif['VIF Factor'] = [variance_inflation_factor(xarray,i) for i in range(1,len(X_train.columns)+1)]
vif.round(2).sort_values(by='VIF Factor',ascending=False)

Unnamed: 0,Features,VIF Factor
0,H_Pass_Metric,1.47
1,V_Pass_Metric,1.47
29,Wind,1.41
28,Temperature,1.4
12,H_Plays,1.35
30,Humidity,1.33
3,V_Rush_Metric,1.32
13,V_Plays,1.28
2,H_Rush_Metric,1.26
21,V_Sacks_Def,1.23


In [14]:
train_ratio = 0.85
mae = np.empty(7)
acc = np.empty(7)
ou = np.empty(7)
acc_ext = np.empty(7)
for n in range(7):
    path = 'data/df_working'+str(n+1)+'.pkl'
    with open(path,'rb') as f:
        dfgame = pickle.load(f)
    yy = dfgame['Tot_Pts']
    XX = dfgame.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
    XX = XX.drop(turf,axis=1)
    XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size = 1-train_ratio, random_state=77)
    m, a = regress(LR,XX_train,XX_test,yy_train,yy_test,
                   print_metrics=False,print_coef=False,return_metrics=True)
    ou[n] = mean_absolute_error(yy_test,dfgame.loc[XX_test.index,'Over/Under'])
    mae[n] = m
    acc[n] = a
print(mae)
print(acc)
print(ou-mae)

[11.34933801 11.16744056 10.48705836 10.59635947 11.09379895 10.69871393
 10.22553897]
[0.45892351 0.53012048 0.50645161 0.51590106 0.54509804 0.55339806
 0.51442308]
[-0.83544912 -0.44422627 -0.28513529 -0.24095878 -0.50567634 -0.25767619
 -0.40714275]


In [None]:
fig = plt.figure()
_ = fig.add_subplot(3,1,1)
_ = plt.xlabel('# Games Averaged')
_ = plt.ylabel('RMSE Diff')
_ = plt.ylim(-0.8,0)
#_ = plt.plot(np.arange(1,len(rmse)+1),rmse,color='blue')
#_ = plt.plot(np.arange(1,len(rmse)+1),bench_rmse,color='red')
_ = plt.plot(np.arange(1,len(rmse)+1),bench_rmse-rmse,color='green')
_ = fig8.add_subplot(3,1,2)
#_ = plt.xlabel('# Games Averaged')
_ = plt.ylabel('Accuracy')
_ = plt.ylim(0.49,0.60)
_ = plt.plot(np.arange(1,len(acc)+1),acc,color='blue')
_ = plt.plot(np.arange(1,len(acc)+1),acc_ext,color='red')
_ = fig8.add_subplot(3,1,3)
_ = plt.ylabel('RMSE Diff')
_ = plt.plot(np.arange(1,len(acc)+1),bench_rmse_ext-rmse_ext,color='lightgreen')
_ = plt.ylim(-1.0,-0.0)
fig.show()
#fig8.savefig('plots/fig8.png')

In [22]:
path = 'data/df_working5.pkl'
with open(path, 'rb') as f:
    dfgame = pickle.load(f)
X = dfgame.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
y = dfgame['Tot_Pts']
X = X.drop(turf,axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)
regress(LR,X_train,X_test,y_train,y_test,cv=5)

MAE_train = 10.751±0.407, Accuracy = 0.512±0.027
MAE_test = 11.094, Accuracy_test = 0.545
                LR Coefficient
V_Off_Pass            3.940549
V_Def_Rush            3.666127
H_Rush_Metric         3.410002
H_Pass_Metric        -2.530071
dome                 -2.388048
Wind                 -2.090254
grass                -2.018223
H_Pts                 1.901829
H_Off_Rush           -1.827957
H_Off_Pass            1.612428
V_Off_Rush            1.497181
V_FG_Pct              1.223666
H_Plays               1.138949
V_Def_RZ_Pct          1.115872
Humidity             -1.087653
V_Pass_Metric        -1.076402
V_Sacks_Def           1.071513
V_TO_Gain             0.936713
H_TD_on_Def          -0.851397
Temperature           0.809375
V_Def_Pass           -0.736765
V_Poss               -0.637031
V_Pts_Opp            -0.597592
H_Def_RZ_Pct         -0.591748
V_Kickret             0.589632
H_Kickret            -0.532342
V_TD_on_Def          -0.506268
H_Yds_Pen             0.498000
Week       

In [None]:
factor=np.arange(0.0,0.75,0.05)
mae=np.empty(len(factor))
acc=np.empty(len(factor))
for ind,n in enumerate(factor):
    path = 'data/df_working6_{:.2f}.pkl'.format(n)
    with open(path, 'rb') as f:
        df = pickle.load(f)
    df.dropna(inplace=True)

    X = df.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
    X = X.drop(turf,axis=1)
    X = X.drop(vifdrop,axis=1)
    y = df['Tot_Pts']
    wtx_train,wtx_test,wty_train,wty_test = train_test_split(X,y,test_size = 1-train_ratio,random_state = 77)
    m, a = regress(LR,wtx_train,wtx_test,wty_train,wty_test,
                   print_metrics=True,print_coef=False,return_metrics=True)
    mae[ind]=m
    acc[ind]=a
print(mae)
print(acc)

In [None]:
from sklearn.linear_model import Lasso
# path = 'data/df_working6_sh.pkl'.format(n)
# with open(path, 'rb') as f:
#     df = pickle.load(f)
# X = df.drop(turf,axis=1)
path = 'data/df_working6.pkl'.format(n)
with open(path, 'rb') as f:
    df = pickle.load(f)
X = df.drop(turf,axis=1)
X = X.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
y = df['Tot_Pts']
lasso = Lasso(alpha=0.01,normalize=True)
lasso.fit(X,y)
lasso_coef = lasso.fit(X,y).coef_
coef=get_coef(lasso_coef,X.columns)
print(coef)

In [None]:
from sklearn.linear_model import Ridge
cols=['V_Def_RZ_Pct','H_Pts','Wind','V_Pts','V_Tackles_Loss','V_Off_Pass','H_Def_Rush','H_Off_Pass',
      'V_Def_Pass','H_Yds_Pen','V_Def_Rush','H_Def_Pass']
path = 'data/df_working6.pkl'.format(n)
with open(path, 'rb') as f:
    df = pickle.load(f)
X = df.drop(turf,axis=1)
X = X[cols]
y = df['Tot_Pts']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)
alphas=np.logspace(-3, 0, 40)
mae = np.empty(40)
acc = np.empty(40)
for ind,alpha in enumerate(alphas):
    ridge=Ridge(alpha=alpha,normalize=True)
    m,a=regress(ridge,X_train,X_test,y_train,y_test,print_coef=False,print_metrics=False,return_metrics=True)
    mae[ind] = m
    acc[ind] = a
print(mae)
print(acc)

In [None]:
ridge=Ridge(alpha=0.15,normalize=True)
regress(ridge,X_train,X_test,y_train,y_test,print_coef=False,print_metrics=True,return_metrics=False)

In [None]:
from sklearn.linear_model import Ridge
cols=['H_Pts', 'V_Pts', 'H_Off_Pass', 'V_Off_Pass', 'H_Def_Rush','V_Def_Rush', 'H_RZ_Pct', 'V_RZ_Pct', 
      'H_Sacks_Def', 'Wind', 'grass','dome']
path = 'data/df_working6.pkl'
with open(path, 'rb') as f:
    df = pickle.load(f)
X = df.drop(turf,axis=1)
X = X[cols]
y = df['Tot_Pts']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)
alphas=np.logspace(-3, 0, 40)
mae = np.empty(40)
acc = np.empty(40)
for ind,alpha in enumerate(alphas):
    ridge=Ridge(alpha=alpha,normalize=True)
    m,a=regress(ridge,X_train,X_test,y_train,y_test,print_coef=False,print_metrics=False,return_metrics=True)
    mae[ind] = m
    acc[ind] = a
print(mae)
print(acc)

In [None]:
# Random Forest
from sklearn import ensemble
import graphviz
X = dfgame.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
y = dfgame['Tot_Pts'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, 
                                                test_size = test_ratio/(test_ratio + validation_ratio),
                                                random_state=77)
frst_model=ensemble.RandomForestRegressor(random_state=77,n_estimators=500,max_depth=3)
frst_model.fit(X_train,y_train)
y_pred = frst_model.predict(X_test)
estimator = frst_model.estimators_[20]
dot_data=tree.export_graphviz(estimator, filled=True, rounded=True,
                                     special_characters=True,feature_names=X_train.columns)
#graphviz.Source(dot_data)

# get importance
importance = frst_model.feature_importances_
# summarize feature importance
coef = pd.DataFrame(importance, X_train.columns, columns=['Importance'])
ind=np.flip(np.argsort(importance))
coef_sort = coef.iloc[ind]
print(coef_sort)

In [None]:
ridge=Ridge(alpha=0.08,normalize=True)
regress(ridge,X_train,X_test,y_train,y_test,print_coef=False,print_metrics=True,return_metrics=False)