In [2]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score,cross_validate,train_test_split

In [42]:
def ou_accuracy(actu,pred,ou=None):
    """Share of decided games where the prediction falls on the same side of the line as the result.

    A game is counted as correct when the actual and the predicted totals are
    both strictly below, or both strictly above, the over/under line. Games
    whose actual total equals the line are excluded from the denominator.

    Parameters
    ----------
    actu : pandas.Series
        Actual totals. When ``ou`` is None, its index labels are used to look
        the lines up in the notebook-level ``dfall`` frame.
    pred : array-like
        Predicted totals, in the same row order as ``actu``.
    ou : array-like, optional
        Over/under lines in the same row order as ``actu``. Defaults to
        ``dfall.loc[actu.index, 'Over_Under']``.

    Returns
    -------
    float
        correct / decided, or NaN when no game is decided (empty input or
        every actual total equal to its line).

    Raises
    ------
    ValueError
        If ``actu``, ``pred`` and the over/under lines differ in length.
    """
    if ou is None:
        # Reading a module-level name needs no `global` declaration.
        ou = dfall.loc[actu.index,'Over_Under']
    if len(actu) != len(pred):
        raise ValueError(f'Length mismatch among arguments: actual != predicted ({len(actu)} != {len(pred)})')
    if len(pred) != len(ou):
        raise ValueError(f'Length mismatch among arguments: predicted != Over/Under ({len(pred)} != {len(ou)})')

    # Compare by position: predictions usually arrive as a bare ndarray, and a
    # Series carrying a different index must not be label-aligned against the line.
    actual = np.asarray(actu)
    predicted = np.asarray(pred)
    line = np.asarray(ou)

    under_hits = ((actual < line) & (predicted < line)).sum()
    over_hits = ((actual > line) & (predicted > line)).sum()
    decided = (actual < line).sum() + (actual > line).sum()
    if decided == 0:
        # Nothing to score; avoid the 0/0 division and its RuntimeWarning.
        return np.nan
    return (under_hits + over_hits) / decided

In [4]:
def regress(reg,Xtrain,Xtest,ytrain,ytest,cv=5,print_metrics=True,print_coef=True,return_metrics=False):
    """Cross-validate ``reg`` on robust-scaled features and score the best fold's model on the test set.

    Steps:
      1. Fit a RobustScaler on ``Xtrain`` and apply it to ``Xtrain`` and ``Xtest``.
      2. Run ``cross_validate`` on the scaled training data with two scorers:
         negated mean absolute error and ``ou_accuracy``.
      3. Take the fold estimator with the lowest validation MAE and evaluate it
         on the scaled test data.

    Parameters
    ----------
    reg : estimator
        Unfitted scikit-learn regressor. It must expose ``coef_`` after fitting
        only when ``print_coef`` is True.
    Xtrain, Xtest : pandas.DataFrame
        Feature matrices; ``Xtrain.columns`` labels the coefficient table.
    ytrain, ytest : pandas.Series
        Targets. ``ou_accuracy`` looks the betting line up in ``dfall`` by
        their index labels.
    cv : int or CV splitter, default 5
        Passed straight to ``cross_validate``.
    print_metrics : bool, default True
        Print mean ± std of the fold MAE and accuracy, then the per-fold MAE.
    print_coef : bool, default True
        Print the coefficients of the selected estimator, sorted by magnitude.
    return_metrics : bool, default False
        When True return ``(mae_test, acc_test)``; otherwise return None.
    """
    # NOTE(review): the scaler is fitted on all of Xtrain before the folds are
    # split, so every validation fold has contributed to the scaling statistics.
    scaler = RobustScaler()
    sca_train = scaler.fit_transform(Xtrain)
    sca_test = scaler.transform(Xtest)

    scoring = {'mae':make_scorer(mean_absolute_error,greater_is_better=False),
               'acc':make_scorer(ou_accuracy,greater_is_better=True)}
    val = cross_validate(reg, sca_train, ytrain, cv=cv, return_estimator=True, scoring=scoring)

    # The 'mae' scorer returns negated errors; flip the sign back for reporting.
    fold_mae = -val['test_mae']
    acc = (np.mean(val['test_acc']),np.std(val['test_acc']))
    mae = (np.mean(fold_mae),np.std(fold_mae))
    # argmax over the negated MAE selects the fold with the smallest error.
    best_estimator = val['estimator'][val['test_mae'].argmax()]

    ypred = best_estimator.predict(sca_test)
    mae_test = mean_absolute_error(ytest,ypred)
    acc_test = ou_accuracy(ytest,ypred)

    if print_metrics:
        print(f'MAE_train = {mae[0]:.3f}±{mae[1]:.3f}, Accuracy = {acc[0]:.3f}±{acc[1]:.3f}')
        print(fold_mae)
    if print_coef:
        # Built only on request so estimators without coef_ work with print_coef=False.
        print(get_coef(best_estimator.coef_,Xtrain.columns))
    if return_metrics:
        return mae_test,acc_test
    return None

In [5]:
def get_coef(coef_,cols):
    """Return the coefficients as a one-column table, largest absolute value first.

    ``coef_`` supplies the values and ``cols`` the row labels; the single
    column is named 'LR Coefficient'.
    """
    # Ascending argsort of the magnitudes, reversed to get descending order.
    order = np.argsort(abs(coef_))[::-1]
    table = pd.DataFrame(coef_, index=cols, columns=['LR Coefficient'])
    return table.iloc[order]

In [6]:
# Shared notebook state used by later cells and by ou_accuracy.

# Columns dropped from the feature matrix by most cells
# (presumably playing-surface indicators — TODO confirm).
turf = ['a_turf', 'astroplay', 'fieldturf', 'sportturf', 'matrixturf']

# Estimator and scaler instances reused further down.
robust = RobustScaler()
LR = linear_model.LinearRegression()

# Complete game table; ou_accuracy reads its 'Over_Under' column by index label.
# NOTE(review): pickle.load can execute code from the file — only load trusted pickles.
path = 'data/df_complete.pkl'
with open(path, mode='rb') as pkl_file:
    dfall = pickle.load(pkl_file)

In [27]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Working game table used by the cells that follow.
path = 'data/df_working5.pkl'
with open(path, mode='rb') as pkl_file:
    dfgame = pickle.load(pkl_file)

# Use training ratio of 70% for VIF analysis
train_ratio = 0.70

In [50]:
# Variance inflation factors for the reduced feature set.
y = dfgame['Tot_Pts']

# Removed before the VIF check: touchdown counts, pass/rush metrics,
# points scored/allowed and Year. Possession, plays and dome are kept here.
vifdrop = [
    'H_TD', 'V_TD',
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'Year',
]
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=vifdrop)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

# add_constant puts the intercept column first, so feature k sits at position k+1;
# the range below starts at 1 to skip the constant.
xarray = sm.add_constant(X_train.values)
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF Factor': [variance_inflation_factor(xarray, i) for i in range(1, len(X_train.columns) + 1)],
})
vif.round(2).sort_values(by='VIF Factor', ascending=False)

Unnamed: 0,Features,VIF Factor
16,H_Poss,2.96
17,V_Poss,2.74
18,H_Plays,2.51
19,V_Plays,2.43
0,H_Off_Pass,2.42
1,V_Off_Pass,2.4
37,dome,2.38
2,H_Off_Rush,1.82
35,Wind,1.69
3,V_Off_Rush,1.68


In [26]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Use training ratio of 70% for VIF analysis
train_ratio = 0.70

# Drop list for this variant: pass/rush metrics, points scored/allowed,
# possession, plays and dome.
vifdrop = [
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'H_Poss', 'V_Poss', 'dome', 'H_Plays', 'V_Plays',
]

path = 'data/df_working5.pkl'
with open(path, mode='rb') as pkl_file:
    dfgame = pickle.load(pkl_file)

# Target, then features with the target, betting line, identifiers,
# turf columns and the drop list removed.
y = dfgame['Tot_Pts']
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=vifdrop)
)

In [49]:
# Ordinary least squares on the unscaled reduced feature set, scored on the training rows.
y = dfgame['Tot_Pts']
vifdrop = [
    'H_TD', 'V_TD',
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'Year',
]
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=vifdrop)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

LR = linear_model.LinearRegression()
LR.fit(X_train, y_train)
ypred = LR.predict(X_train)

print(f'MAE = {mean_absolute_error(y_train,ypred)}')
print(f'Intercept = {LR.intercept_}')
print(f'Training Accuracy = {ou_accuracy(y_train,ypred)}')
# Last expression: coefficient table sorted by magnitude.
get_coef(LR.coef_, X_train.columns)


MAE = 10.469111233948642
Intercept = -9.266819491063806
Training Accuracy = 0.5575959933222037


Unnamed: 0,LR Coefficient
V_Def_RZ_Pct,7.141292
V_FG_Pct,4.610819
H_Def_RZ_Pct,-3.461626
dome,-2.649674
H_TD_on_Def,-2.489433
grass,-2.431341
V_TD_on_Def,-1.913476
V_RZ_Pct,1.708066
H_TO_Gain,1.069174
V_TO_Gain,0.998927


In [48]:
# Same regression as the unscaled fit, but on RobustScaler-transformed features
# so that coefficient magnitudes are comparable across columns.
y = dfgame['Tot_Pts']
vifdrop = [
    'H_TD', 'V_TD',
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'Year',
]
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=vifdrop)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

# Refits the notebook-level `robust` scaler on this training split.
sca_train = robust.fit_transform(X_train)
LR = linear_model.LinearRegression()
LR.fit(sca_train, y_train)
ypred = LR.predict(sca_train)

print(f'MAE = {mean_absolute_error(y_train,ypred)}')
print(f'Intercept = {LR.intercept_}')
print(f'Training Accuracy = {ou_accuracy(y_train,ypred)}')
# Last expression: coefficient table sorted by magnitude.
get_coef(LR.coef_, X_train.columns)

MAE = 10.469111233948643
Intercept = 44.60797817136932
Training Accuracy = 0.5575959933222037


Unnamed: 0,LR Coefficient
V_Off_Pass,3.288813
dome,-2.649674
grass,-2.431341
Wind,-2.424344
H_Off_Pass,1.854922
V_Off_Rush,1.65545
H_Off_Rush,1.306575
H_Def_Rush,1.264562
V_Def_Rush,1.260313
V_Def_RZ_Pct,1.127572


In [52]:
# Cross-validated linear regression on every feature column (turf columns included).
train_ratio = 0.85

path = 'data/df_working5.pkl'
with open(path, mode='rb') as pkl_file:
    dfgame = pickle.load(pkl_file)

y = dfgame['Tot_Pts']
X = dfgame.drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

regress(LR, X_train, X_test, y_train, y_test, cv=5)

MAE_train = 10.733±0.391, Accuracy = 0.519±0.016
[10.2028101  11.35855609 10.94531507 10.57066376 10.58620051]
                LR Coefficient
a_turf                8.101133
matrixturf            7.360245
sportturf             5.745638
V_Off_Pass            4.417193
fieldturf             3.890611
H_Pts                 3.288340
V_Def_Rush            2.834947
Wind                 -2.583794
dome                 -2.523312
grass                 2.345038
H_Rush_Metric         2.312574
H_Pass_Metric        -2.183779
H_Def_Rush            1.734319
H_Off_Pass            1.628639
V_Def_RZ_Pct          1.547547
Year                 -1.466160
H_TD                 -1.400001
H_Off_Rush           -1.332554
V_Off_Rush            1.325435
V_Pass_Metric        -1.266014
V_Puntret            -1.142696
V_TD                 -1.009501
V_Pts_Opp            -0.907220
V_Sacks_Def           0.877824
V_Poss               -0.828311
V_RZ_Pct              0.820892
H_Plays               0.793014
Week                 

In [51]:
# Cross-validated linear regression after removing the turf columns and the `dropped` list.
train_ratio = 0.85

path = 'data/df_working5.pkl'
with open(path, mode='rb') as pkl_file:
    dfgame = pickle.load(pkl_file)

y = dfgame['Tot_Pts']
dropped = [
    'H_TD', 'V_TD',
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'Year',
]
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=dropped)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

regress(LR, X_train, X_test, y_train, y_test, cv=5)

MAE_train = 10.723±0.414, Accuracy = 0.526±0.021
[10.07736203 11.31416506 10.98211995 10.58394396 10.65632885]
                LR Coefficient
V_Off_Pass            3.372361
dome                 -2.510796
Wind                 -2.441556
grass                -2.205197
V_Off_Rush            1.616449
H_Off_Pass            1.610824
H_Off_Rush            1.268890
H_Def_Rush            1.185441
V_Def_RZ_Pct          1.141077
V_Def_Rush            1.114805
V_Puntret            -1.064862
H_Def_Pass            1.035004
V_TO_Gain             0.957151
V_Sacks_Def           0.904377
H_Plays               0.836927
V_Poss               -0.789141
H_TO_Gain             0.787617
V_FG_Pct              0.745540
H_TO_Lost            -0.706670
V_Kickret             0.655105
Week                 -0.602512
H_Sacks_Def           0.594372
H_Def_RZ_Pct         -0.566110
H_TD_on_Def          -0.554619
Temperature           0.515679
V_TD_on_Def          -0.483744
V_Tackles_Loss        0.429127
V_Yds_Pen            

In [61]:
# Random Forest on every feature column; metrics below are computed on the training rows.
from sklearn import ensemble
import graphviz

train_ratio = 0.85
y = dfgame['Tot_Pts']
X = dfgame.drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

rf_model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=3, random_state=77)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_train)

# summarize feature importance, largest first
importance = rf_model.feature_importances_
ind = np.argsort(importance)[::-1]
coef = pd.DataFrame(importance, index=X_train.columns, columns=['Importance'])
coef_sort = coef.iloc[ind]

print(f'MAE = {mean_absolute_error(y_train,y_pred)}')
print(f'Training Accuracy = {ou_accuracy(y_train,y_pred)}')
print(coef_sort)

MAE = 10.172695628976479
Training Accuracy = 0.5553264604810997
                Importance
V_Off_Pass        0.190548
H_Off_Pass        0.152090
H_Pts             0.097897
V_Def_RZ_Pct      0.045486
V_Def_Pass        0.033397
H_Off_Rush        0.033317
V_Pts             0.024376
Temperature       0.022585
V_TD              0.022398
H_RZ_Pct          0.021918
H_Pass_Metric     0.021226
H_Def_Pass        0.019303
V_TO_Gain         0.016182
V_Def_Rush        0.015826
V_Puntret         0.015745
V_Plays           0.015640
V_Pts_Opp         0.015142
H_Puntret         0.014516
V_RZ_Pct          0.014457
H_Plays           0.012820
V_Off_Rush        0.012598
H_Def_Rush        0.011548
H_TO_Gain         0.010824
V_Poss            0.010802
Wind              0.010430
H_Yds_Pen         0.010013
H_Rush_Metric     0.009929
H_Pts_Opp         0.009908
V_Kickret         0.009857
H_TD              0.009730
H_Def_RZ_Pct      0.009602
H_Kickret         0.007731
H_FG_Pct          0.007167
H_Poss            

In [64]:
# Random Forest without the turf columns and the `dropped` list;
# metrics below are computed on the training rows.
from sklearn import ensemble
import graphviz

train_ratio = 0.85
y = dfgame['Tot_Pts']
dropped = [
    'H_TD', 'V_TD',
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'Year',
]
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=dropped)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

rf_model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=3, random_state=77)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_train)

# summarize feature importance, largest first
importance = rf_model.feature_importances_
ind = np.argsort(importance)[::-1]
coef = pd.DataFrame(importance, index=X_train.columns, columns=['Importance'])
coef_sort = coef.iloc[ind]

print(f'MAE = {mean_absolute_error(y_train,y_pred)}')
print(f'Training Accuracy = {ou_accuracy(y_train,y_pred)}')
print(coef_sort)

MAE = 10.19705804603034
Training Accuracy = 0.5601374570446735
                Importance
V_Off_Pass        0.213473
H_Off_Pass        0.204559
V_Def_RZ_Pct      0.066639
H_Off_Rush        0.059726
V_Def_Pass        0.039469
Temperature       0.031087
H_Def_Pass        0.029454
H_RZ_Pct          0.025417
V_Plays           0.023499
H_Puntret         0.021908
V_TO_Gain         0.021621
V_Def_Rush        0.019023
V_Off_Rush        0.018300
V_RZ_Pct          0.017930
V_Puntret         0.017518
H_TO_Gain         0.017014
V_Poss            0.016224
H_Plays           0.015935
Wind              0.013754
H_Def_Rush        0.013093
H_Def_RZ_Pct      0.010970
H_Kickret         0.010303
H_Yds_Pen         0.010081
V_Kickret         0.009260
H_Poss            0.008204
V_Yds_Pen         0.007953
V_FG_Pct          0.007693
H_Tackles_Loss    0.007391
V_Tackles_Loss    0.006843
V_TO_Lost         0.006810
H_FG_Pct          0.006592
H_TO_Lost         0.006336
grass             0.003412
V_Sacks_Def       0

In [65]:
# Random Forest on a further pruned feature set; metrics below are computed on the training rows.
from sklearn import ensemble
import graphviz

train_ratio = 0.85
y = dfgame['Tot_Pts']
dropped = [
    'H_TD', 'V_TD',
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'Year',
    'H_Kickret', 'H_Yds_Pen', 'V_Kickret', 'H_Poss', 'V_Yds_Pen', 'V_FG_Pct',
    'H_Tackles_Loss', 'V_Tackles_Loss', 'V_TO_Lost', 'H_FG_Pct', 'H_TO_Lost',
    'grass', 'V_Sacks_Def',
    'H_TD_on_Def', 'Week', 'H_Sacks_Def', 'V_TD_on_Def', 'dome',
]
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=dropped)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

rf_model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=3, random_state=77)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_train)

# summarize feature importance, largest first
importance = rf_model.feature_importances_
ind = np.argsort(importance)[::-1]
coef = pd.DataFrame(importance, index=X_train.columns, columns=['Importance'])
coef_sort = coef.iloc[ind]

print(f'MAE = {mean_absolute_error(y_train,y_pred)}')
print(f'Training Accuracy = {ou_accuracy(y_train,y_pred)}')
print(coef_sort)

MAE = 10.202294757364566
Training Accuracy = 0.5615120274914089
              Importance
V_Off_Pass      0.224700
H_Off_Pass      0.211933
V_Def_RZ_Pct    0.073866
H_Off_Rush      0.067512
V_Def_Pass      0.049835
Temperature     0.037732
H_Def_Pass      0.036031
H_Puntret       0.027394
H_RZ_Pct        0.026886
V_TO_Gain       0.025892
V_Plays         0.022997
H_TO_Gain       0.022915
V_Puntret       0.022851
V_Poss          0.021124
V_Def_Rush      0.020762
V_RZ_Pct        0.020469
H_Plays         0.019177
V_Off_Rush      0.018637
H_Def_RZ_Pct    0.017463
Wind            0.016443
H_Def_Rush      0.015382


In [90]:
# Random Forest on the pruned feature set; metrics below are computed on the training rows.
from sklearn import ensemble
import graphviz

train_ratio = 0.85
y = dfgame['Tot_Pts']
dropped = [
    'H_TD', 'V_TD',
    'H_Pass_Metric', 'V_Pass_Metric', 'H_Rush_Metric', 'V_Rush_Metric',
    'H_Pts', 'V_Pts', 'H_Pts_Opp', 'V_Pts_Opp',
    'Year',
    'H_Kickret', 'H_Yds_Pen', 'V_Kickret', 'H_Poss', 'V_Yds_Pen', 'V_FG_Pct',
    'H_Tackles_Loss', 'V_Tackles_Loss', 'V_TO_Lost', 'H_FG_Pct', 'H_TO_Lost',
    'grass', 'V_Sacks_Def',
    'H_TD_on_Def', 'Week', 'H_Sacks_Def', 'V_TD_on_Def', 'dome',
]
X = (
    dfgame
    .drop(columns=['Tot_Pts', 'Over_Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    .drop(columns=turf)
    .drop(columns=dropped)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

rf_model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=3, random_state=77)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_train)

# summarize feature importance, largest first
importance = rf_model.feature_importances_
ind = np.argsort(importance)[::-1]
coef = pd.DataFrame(importance, index=X_train.columns, columns=['Importance'])
coef_sort = coef.iloc[ind]

print(f'MAE = {mean_absolute_error(y_train,y_pred)}')
print(f'Training Accuracy = {ou_accuracy(y_train,y_pred)}')
print(coef_sort)

MAE = 10.202294757364566
Training Accuracy = 0.5615120274914089
              Importance
V_Off_Pass      0.224700
H_Off_Pass      0.211933
V_Def_RZ_Pct    0.073866
H_Off_Rush      0.067512
V_Def_Pass      0.049835
Temperature     0.037732
H_Def_Pass      0.036031
H_Puntret       0.027394
H_RZ_Pct        0.026886
V_TO_Gain       0.025892
V_Plays         0.022997
H_TO_Gain       0.022915
V_Puntret       0.022851
V_Poss          0.021124
V_Def_Rush      0.020762
V_RZ_Pct        0.020469
H_Plays         0.019177
V_Off_Rush      0.018637
H_Def_RZ_Pct    0.017463
Wind            0.016443
H_Def_Rush      0.015382


In [None]:
# Random Forest with one-hot encoded home and visiting teams added to the features.
from sklearn import ensemble
import graphviz
train_ratio = 0.85
# Prefix the indicator columns: without a prefix the home and visiting dummies
# for the same team share one column name, which makes the feature matrix
# ambiguous and the importance table impossible to read.
home_dummies = pd.get_dummies(dfgame.Home_Team, prefix='Home')
vis_dummies = pd.get_dummies(dfgame.Vis_Team, prefix='Vis')
X = pd.concat([dfgame.drop(['Tot_Pts','Over_Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1),
              home_dummies,vis_dummies],axis=1)
y = dfgame['Tot_Pts']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)
rf_model=ensemble.RandomForestRegressor(random_state=77,n_estimators=200,max_depth=3)
rf_model.fit(X_train,y_train)
# Metrics below are computed on the training rows.
y_pred = rf_model.predict(X_train)
importance = rf_model.feature_importances_
# summarize feature importance, largest first
coef = pd.DataFrame(importance, X_train.columns, columns=['Importance'])
ind=np.flip(np.argsort(importance))
coef_sort = coef.iloc[ind]
print(f'MAE = {mean_absolute_error(y_train,y_pred)}')
print(f'Training Accuracy = {ou_accuracy(y_train,y_pred)}')
print(coef_sort)

In [14]:
# Compare the linear model against the betting line for each df_working<k>.pkl file
# (k presumably being the number of games averaged — TODO confirm).
train_ratio = 0.85
n_files = 7
mae = np.empty(n_files)
acc = np.empty(n_files)
ou = np.empty(n_files)
# Never filled by this cell; NaN instead of uninitialised memory so that a
# later plot of acc_ext draws nothing rather than arbitrary values.
acc_ext = np.full(n_files, np.nan)
for n in range(n_files):
    path = 'data/df_working'+str(n+1)+'.pkl'
    with open(path,'rb') as f:
        dfgame = pickle.load(f)
    # Both spellings of the betting-line column are used against these files
    # elsewhere in this notebook; accept whichever one the frame carries.
    ou_col = 'Over_Under' if 'Over_Under' in dfgame.columns else 'Over/Under'
    yy = dfgame['Tot_Pts']
    XX = dfgame.drop(['Tot_Pts',ou_col,'Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
    XX = XX.drop(turf,axis=1)
    XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size = 1-train_ratio, random_state=77)
    m, a = regress(LR,XX_train,XX_test,yy_train,yy_test,
                   print_metrics=False,print_coef=False,return_metrics=True)
    # Benchmark: MAE of using the betting line itself as the prediction on the test rows.
    ou[n] = mean_absolute_error(yy_test,dfgame.loc[XX_test.index,ou_col])
    mae[n] = m
    acc[n] = a
print(mae)
print(acc)
# Negative values mean the betting line has the lower test MAE.
print(ou-mae)

[11.34933801 11.16744056 10.48705836 10.59635947 11.09379895 10.69871393
 10.22553897]
[0.45892351 0.53012048 0.50645161 0.51590106 0.54509804 0.55339806
 0.51442308]
[-0.83544912 -0.44422627 -0.28513529 -0.24095878 -0.50567634 -0.25767619
 -0.40714275]


In [None]:
# Three stacked panels: error difference, accuracy, and extended error difference.
# NOTE(review): rmse, bench_rmse, rmse_ext and bench_rmse_ext are not defined in
# any cell visible in this notebook; they must already exist in the kernel.
fig, (ax_diff, ax_acc, ax_ext) = plt.subplots(3, 1)

ax_diff.set_xlabel('# Games Averaged')
ax_diff.set_ylabel('RMSE Diff')
ax_diff.set_ylim(-0.8, 0)
ax_diff.plot(np.arange(1, len(rmse)+1), bench_rmse-rmse, color='green')

# All panels belong to `fig`; the original referenced an undefined `fig8` here.
ax_acc.set_ylabel('Accuracy')
ax_acc.set_ylim(0.49, 0.60)
ax_acc.plot(np.arange(1, len(acc)+1), acc, color='blue')
ax_acc.plot(np.arange(1, len(acc)+1), acc_ext, color='red')

ax_ext.set_ylabel('RMSE Diff')
ax_ext.plot(np.arange(1, len(acc)+1), bench_rmse_ext-rmse_ext, color='lightgreen')
ax_ext.set_ylim(-1.0, -0.0)

# plt.show() rather than fig.show(): the inline backend has no GUI loop for
# Figure.show and only emits a warning.
plt.show()
#fig.savefig('plots/fig8.png')

In [None]:
# Run regress on each df_working6_<factor>.pkl file, factor = 0.00, 0.05, ..., 0.70.
# Relies on `vifdrop` and `train_ratio` as left in the kernel by earlier cells.
factor = np.arange(0.0, 0.75, 0.05)
mae = np.empty(len(factor))
acc = np.empty(len(factor))
for ind, n in enumerate(factor):
    path = f'data/df_working6_{n:.2f}.pkl'
    with open(path, mode='rb') as pkl_file:
        df = pickle.load(pkl_file)
    # Rows with any missing value are discarded before modelling.
    df = df.dropna()

    y = df['Tot_Pts']
    X = df.drop(columns=['Tot_Pts', 'Over/Under', 'Home_Team', 'Vis_Team', 'H_Game', 'V_Game'])
    X = X.drop(columns=turf)
    X = X.drop(columns=vifdrop)
    wtx_train, wtx_test, wty_train, wty_test = train_test_split(X, y, test_size = 1-train_ratio, random_state = 77)
    m, a = regress(LR, wtx_train, wtx_test, wty_train, wty_test,
                   print_metrics=True, print_coef=False, return_metrics=True)
    mae[ind] = m
    acc[ind] = a
print(mae)
print(acc)

In [None]:
# Lasso fit on the full table, used to rank features by coefficient magnitude.
from sklearn.linear_model import Lasso
# The path is a plain literal; the former `.format(n)` had no placeholder to
# fill and only made the cell depend on a loop variable leaked from another cell.
path = 'data/df_working6.pkl'
with open(path, 'rb') as f:
    df = pickle.load(f)
X = df.drop(turf,axis=1)
X = X.drop(['Tot_Pts','Over/Under','Home_Team','Vis_Team','H_Game','V_Game'],axis=1)
y = df['Tot_Pts']
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions this needs an explicit scaling step — confirm the pinned version.
lasso = Lasso(alpha=0.01,normalize=True)
# Fit once; fit() returns the estimator, so coef_ can be read from the same call.
lasso_coef = lasso.fit(X,y).coef_
coef=get_coef(lasso_coef,X.columns)
print(coef)

In [None]:
# Ridge regularisation sweep on a hand-picked 12-column feature subset.
# Relies on `train_ratio` as left in the kernel by earlier cells.
from sklearn.linear_model import Ridge
cols=['V_Def_RZ_Pct','H_Pts','Wind','V_Pts','V_Tackles_Loss','V_Off_Pass','H_Def_Rush','H_Off_Pass',
      'V_Def_Pass','H_Yds_Pen','V_Def_Rush','H_Def_Pass']
# The path is a plain literal; the former `.format(n)` had no placeholder to
# fill and only made the cell depend on a loop variable leaked from another cell.
path = 'data/df_working6.pkl'
with open(path, 'rb') as f:
    df = pickle.load(f)
X = df.drop(turf,axis=1)
X = X[cols]
y = df['Tot_Pts']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)
alphas=np.logspace(-3, 0, 40)
# Size the result arrays from the grid so the two cannot drift apart.
mae = np.empty(len(alphas))
acc = np.empty(len(alphas))
for ind,alpha in enumerate(alphas):
    # NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2 —
    # confirm the pinned version.
    ridge=Ridge(alpha=alpha,normalize=True)
    m,a=regress(ridge,X_train,X_test,y_train,y_test,print_coef=False,print_metrics=False,return_metrics=True)
    mae[ind] = m
    acc[ind] = a
print(mae)
print(acc)

In [None]:
# Single regress run at alpha=0.15, printing the cross-validation metrics only.
ridge = Ridge(normalize=True, alpha=0.15)
regress(
    ridge, X_train, X_test, y_train, y_test,
    print_metrics=True,
    print_coef=False,
    return_metrics=False,
)

In [None]:
# Ridge regularisation sweep on a second hand-picked 12-column feature subset.
# Relies on `train_ratio` as left in the kernel by earlier cells.
from sklearn.linear_model import Ridge

cols = [
    'H_Pts', 'V_Pts', 'H_Off_Pass', 'V_Off_Pass', 'H_Def_Rush', 'V_Def_Rush',
    'H_RZ_Pct', 'V_RZ_Pct', 'H_Sacks_Def', 'Wind', 'grass', 'dome',
]
path = 'data/df_working6.pkl'
with open(path, mode='rb') as pkl_file:
    df = pickle.load(pkl_file)

y = df['Tot_Pts']
X = df.drop(columns=turf)
X = X[cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-train_ratio, random_state=77)

# 40 log-spaced strengths from 1e-3 to 1; one test MAE and accuracy per strength.
alphas = np.logspace(-3, 0, 40)
mae = np.empty(40)
acc = np.empty(40)
for ind, alpha in enumerate(alphas):
    ridge = Ridge(normalize=True, alpha=alpha)
    m, a = regress(ridge, X_train, X_test, y_train, y_test,
                   print_metrics=False, print_coef=False, return_metrics=True)
    mae[ind] = m
    acc[ind] = a
print(mae)
print(acc)

In [None]:
# Single regress run at alpha=0.08, printing the cross-validation metrics only.
ridge = Ridge(normalize=True, alpha=0.08)
regress(
    ridge, X_train, X_test, y_train, y_test,
    print_metrics=True,
    print_coef=False,
    return_metrics=False,
)