# NFL Sports Betting Algorithm - LASSO Algorithm

#### By Tai Chowdhury

In [1]:
#Download Important Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from scipy import stats
import warnings
from shapely.geometry import Point
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
from sklearn.model_selection import ShuffleSplit
from sklearn.utils import resample

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

In [2]:
#Ignore Warnings on final

warnings.filterwarnings('ignore')

#Original Data
df = pd.read_csv("NFL_Data_Correct_3.csv")

In [3]:
df.head(5)
#df.tail(5)

Unnamed: 0,Seas,Week,Team,Opponent,Unique_ID,Opponent_ID,Home/Away,Score,Opp_Score,W/L,...,opp_Rec_Drop_Perc,opp_Def_Air,opp_Def_YAC,opp_Def_Blitz,opp_Def_Hrry,opp_Def_QBKD,opp_Def_Sk.1,opp_Def_Pressure,opp_Def_MTkl,opp_Def_MTkl_Perc
0,2018,2,ARI,LAR,2018ARI2,2018LAR2,Away,0,34,L,...,11.111111,132,212,15,9,0,1,10,11,13.75
1,2018,3,ARI,CHI,2018ARI3,2018CHI3,Home,14,16,L,...,0.0,158,68,10,11,2,6,19,7,11.864407
2,2018,4,ARI,SEA,2018ARI4,2018SEA4,Home,17,20,L,...,12.5,81,87,22,7,4,5,16,5,8.064516
3,2018,5,ARI,SFO,2018ARI5,2018SFO5,Away,28,18,W,...,8.695652,139,111,11,0,1,1,2,10,13.513514
4,2018,6,ARI,MIN,2018ARI6,2018MIN6,Away,17,27,L,...,0.0,188,123,12,2,0,3,5,6,11.320755


###  Scanning for missing data or null cells

In [4]:
# summarize the dataset with statistical summary of numeric "float" variables

df.shape

(1376, 214)

In [5]:
df.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376 entries, 0 to 1375
Data columns (total 214 columns):
 #    Column                   Non-Null Count  Dtype  
---   ------                   --------------  -----  
 0    Seas                     1376 non-null   int64  
 1    Week                     1376 non-null   int64  
 2    Team                     1376 non-null   object 
 3    Opponent                 1376 non-null   object 
 4    Unique_ID                1376 non-null   object 
 5    Opponent_ID              1376 non-null   object 
 6    Home/Away                1376 non-null   object 
 7    Score                    1376 non-null   int64  
 8    Opp_Score                1376 non-null   int64  
 9    W/L                      1376 non-null   object 
 10   Lag_Score                1376 non-null   int64  
 11   Prev_W/L                 1376 non-null   int64  
 12   Penalties                1376 non-null   int64  
 13   Penalty_Yds              1376 non-null   int64  
 14   Pass_C

In [6]:
# Check for duplicates
df.duplicated().sum()

0

In [7]:
# Check for null values - We do not have any null values

df.isnull().sum()



Seas                 0
Week                 0
Team                 0
Opponent             0
Unique_ID            0
                    ..
opp_Def_QBKD         0
opp_Def_Sk.1         0
opp_Def_Pressure     0
opp_Def_MTkl         0
opp_Def_MTkl_Perc    0
Length: 214, dtype: int64

In [8]:
df_train = df[df['Seas'] < 2020]
df_train = df_train.sort_values('Seas')
df_train

df_test =df[df['Seas'] == 2020]

In [9]:
df_train = df_train.drop(columns = ['Seas','Team','Opponent','Home/Away','Unique_ID','Opponent_ID','Opp_Score','W/L'])
df_test = df_test.drop(columns = ['Seas','Team','Opponent','Home/Away','Unique_ID','Opponent_ID','Opp_Score','W/L'])



In [10]:
X = df_train.loc[:,df_train.columns != 'Score']
y = df_train['Score']

In [11]:
from sklearn.model_selection import cross_validate

def EvaluateRegressionEstimator(regEstimator, X, y, cv):
    
    scores = cross_validate(regEstimator, X, y, scoring=errorScoring, cv=cv, return_train_score=True)

    #cross val score sign-flips the outputs of MAE
    # https://github.com/scikit-learn/scikit-learn/issues/2439
    scores['test_MAE'] = scores['test_MAE'] * -1
    scores['test_MAPE'] = scores['test_MAPE'] * -1
    scores['test_RMSE'] = scores['test_RMSE'] * -1

    #print mean MAE for all folds 
    maeAvg = scores['test_MAE'].mean()
    print_str = "The average MAE for all cv folds is: \t\t\t {maeAvg:.5}"
    print(print_str.format(maeAvg=maeAvg))

    #print mean test_MAPE for all folds
    scores['test_MAPE'] = scores['test_MAPE']
    mape_avg = scores['test_MAPE'].mean()
    print_str = "The average MAE percentage (MAPE) for all cv folds is: \t {mape_avg:.5}"
    print(print_str.format(mape_avg=mape_avg))

    #print mean MAE for all folds 
    RMSEavg = scores['test_RMSE'].mean()
    print_str = "The average RMSE for all cv folds is: \t\t\t {RMSEavg:.5}"
    print(print_str.format(RMSEavg=RMSEavg))
    print('*********************************************************')

    print('Cross Validation Fold Mean Error Scores')
    scoresResults = pd.DataFrame()
    scoresResults['MAE'] = scores['test_MAE']
    scoresResults['MAPE'] = scores['test_MAPE']
    scoresResults['RMSE'] = scores['test_RMSE']
    return scoresResults

In [12]:
#Use mean absolute error (MAE) to score the regression models created 
#(the scale of MAE is identical to the response variable)
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error

#Function for Root mean squared error
#https://stackoverflow.com/questions/17197492/root-mean-square-error-in-python
def rmse(y_actual, y_predicted):
    return np.sqrt(mean_squared_error(y_actual, y_predicted))

#Function for Mean Absolute Percentage Error (MAPE) - Untested
#Adapted from - https://stackoverflow.com/questions/42250958/how-to-optimize-mape-code-in-python
def mape(y_actual, y_predicted): 
    mask = y_actual != 0
    return (np.fabs(y_actual - y_predicted)/y_actual)[mask].mean() * 100

#Create scorers for rmse and mape functions
mae_scorer = make_scorer(score_func=mean_absolute_error, greater_is_better=False)
rmse_scorer = make_scorer(score_func=rmse, greater_is_better=False)
mape_scorer = make_scorer(score_func=mape, greater_is_better=False)

#Make scorer array to pass into cross_validate() function for producing mutiple scores for each cv fold.
errorScoring = {'MAE':  mae_scorer, 
                'RMSE': rmse_scorer,
                'MAPE': mape_scorer
               }

In [13]:
cv = ShuffleSplit(n_splits = 10, test_size = .2, random_state = 27)

In [14]:
#Create a regression object and perform a grid search to find the best parameters
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV

lasso_reg = Lasso(fit_intercept=True, normalize=True,copy_X=True
          , max_iter=10000, precompute=True, tol=0.0001, random_state=27)

alpha = [0.001, 0.1, 1, 10, 20]
selection = ['cyclic','random']
warm_start = [True, False]
parameters = {'alpha': alpha, 'selection': selection, 'warm_start': warm_start}

regGridSearch = GridSearchCV(estimator=lasso_reg
                   , n_jobs=8 
                   , verbose=1
                   , param_grid=parameters
                   , cv=cv 
                   , scoring=mae_scorer)

regGridSearch.fit(X, y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=27, test_size=0.2, train_size=None),
             estimator=Lasso(max_iter=10000, normalize=True, precompute=True,
                             random_state=27),
             n_jobs=8,
             param_grid={'alpha': [0.001, 0.1, 1, 10, 20],
                         'selection': ['cyclic', 'random'],
                         'warm_start': [True, False]},
             scoring=make_scorer(mean_absolute_error, greater_is_better=False),
             verbose=1)

In [15]:
#Display the best estimator parameters
regGridSearch.best_estimator_

Lasso(alpha=0.1, max_iter=10000, normalize=True, precompute=True,
      random_state=27, warm_start=True)

In [16]:
from sklearn.linear_model import Lasso

#Create a regression estimator with best parameters for cross validation
regEstimator = Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=10000,
                     normalize=True, positive=False, precompute=True, random_state=0,
                     selection='cyclic', tol=0.0001, warm_start=True)

#Evaluate the regression estimator above using our pre-defined cross validation and scoring metrics.
EvaluateRegressionEstimator(regEstimator, X, y, cv)

The average MAE for all cv folds is: 			 8.1159
The average MAE percentage (MAPE) for all cv folds is: 	 55.018
The average RMSE for all cv folds is: 			 10.188
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,MAE,MAPE,RMSE
0,8.106642,51.686242,10.123861
1,8.66586,53.985575,10.860272
2,7.908318,62.926295,10.273921
3,8.085459,56.390257,10.25802
4,7.272222,51.431534,9.185548
5,8.135475,65.816441,10.170719
6,8.186034,50.719013,10.140205
7,8.810025,60.147092,10.934973
8,7.743327,44.691372,9.645102
9,8.245779,52.38599,10.283823


In [17]:
X = df_test.loc[:,df_test.columns != 'Score']
y = df_test['Score']

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2,random_state = 27)

In [19]:
from sklearn.metrics import mean_squared_error
import math

regEstimator.fit(X_train,y_train)
preds = regEstimator.predict(X_test)
MSE = mean_squared_error(y_test,preds)
RMSE = math.sqrt(MSE)
RMSE

9.567308879453542

In [20]:
preds.shape

(96,)

In [21]:
pd.options.display.max_rows = 480
preds = pd.DataFrame(regEstimator.predict(X))
preds

Unnamed: 0,0
0,24.840748
1,24.840748
2,24.508068
3,24.840748
4,24.840748
5,25.506109
6,25.173428
7,25.173428
8,24.508068
9,24.840748


In [22]:
preds.to_csv('data.csv')

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

In [None]:
pipeline = Pipeline([
                     ('scaler',StandardScaler()),
                     ('model',Lasso())
])

In [None]:
search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error",verbose=3
                      )

In [None]:
search.fit(X_train,y_train)

In [None]:
search.best_params_

In [None]:
coefficients = search.best_estimator_.named_steps['model'].coef_

In [None]:
importance = np.abs(coefficients)
features = X.columns.values

In [None]:
np.array(features)[importance > 0]

In [None]:
np.array(features)[importance == 0]