## Imports
---

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_val_predict

#### Load the season long data set for QB statistics and DK points from weeks 1 through 9
---

In [3]:
# read in data set
qb_game = pd.read_csv('../data/QB_season_9.csv', index_col=[0])
qb_game

Unnamed: 0,player,pos,team,opp_home,opp,game,week,day,pass_cmp,pass_att,...,rush_att,rush_yds,rush_td,rec,rec_yds,rec_td,fmb,team_win,team_score,opp_score
0,Lamar Jackson,QB,BAL,0.0,IND,5,5,Mon,37,43,...,14,62,0,0,0,0,1,1,31,25
1,Justin Herbert,QB,LAC,0.0,CLE,5,5,Sun,26,43,...,4,29,1,0,0,0,0,1,47,42
2,Tom Brady,QB,TAM,0.0,MIA,5,5,Sun,30,41,...,1,13,0,0,0,0,0,1,45,17
3,Josh Allen,QB,BUF,0.0,WAS,3,3,Sun,32,43,...,4,9,1,0,0,0,0,1,43,21
4,Josh Allen,QB,BUF,1.0,KAN,5,5,Sun,15,26,...,11,59,1,0,0,0,0,1,38,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,Taysom Hill,QB,NOR,0.0,GNB,1,1,Sun,1,1,...,2,1,0,0,0,0,0,1,38,3
302,Case Keenum,QB,CLE,0.0,ARI,6,6,Sun,1,3,...,0,0,0,0,0,0,0,0,14,37
303,John Wolford,QB,LAR,1.0,HOU,8,8,Sun,1,2,...,0,0,0,0,0,0,0,1,38,22
304,Jacob Eason,QB,IND,0.0,LAR,2,2,Sun,2,5,...,0,0,0,0,0,0,0,0,24,27


In [4]:
# take note of the columns before dropping most of them...
qb_game.columns

Index(['player', 'pos', 'team', 'opp_home', 'opp', 'game', 'week', 'day',
       'pass_cmp', 'pass_att', 'pass_pct', 'pass_yds', 'pass_td', 'pass_int',
       'pass_rtg', 'sk', 'sk_yds', 'pass_yda', 'pass_ayda', 'dk_pt',
       'rush_att', 'rush_yds', 'rush_td', 'rec', 'rec_yds', 'rec_td', 'fmb',
       'team_win', 'team_score', 'opp_score'],
      dtype='object')

In [5]:
# we will only be using team, opponent, location, and which week the game was played
X = qb_game[['team','opp_home','opp','week']]
y = qb_game['dk_pt']

In [6]:
# dummify the offensive team
X = pd.get_dummies(X, columns=['team'], drop_first=True)
X

Unnamed: 0,opp_home,opp,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CHI,team_CIN,team_CLE,...,team_NWE,team_NYG,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,0.0,IND,5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,CLE,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,MIA,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.0,WAS,3,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,KAN,5,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,0.0,GNB,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
302,0.0,ARI,6,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
303,1.0,HOU,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
304,0.0,LAR,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# dummify the opponent
X = pd.get_dummies(X, columns=['opp'], drop_first=True)
X

Unnamed: 0,opp_home,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CHI,team_CIN,team_CLE,team_DAL,...,opp_NWE,opp_NYG,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,0.0,5,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.0,5,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
302,0.0,6,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
303,1.0,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
304,0.0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# take note of the shape before splitting the data
X.shape, y.shape

((306, 64), (306,))

In [9]:
# split our data into training and testing with a 70/30 split and random state of 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((214, 64), (214,), (92, 64), (92,))

#### Look at linear regression and regularization models first
---

In [10]:
# set up and run Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.3466900184573025, -0.3623819800801562)

In [11]:
# let's run ridge with baseline parameters
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge.score(X_train, y_train), ridge.score(X_test, y_test)

(0.33267327359550614, -0.2019380491112155)

In [12]:
# now cross validate a Ridge model while searching for the best alpha and score over R2
r_alphas = np.logspace(0, 5, 100)
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)
ridge_cv.fit(X_train, y_train)
ridge_cv.score(X_train, y_train), ridge_cv.score(X_test, y_test)

(0.12369026534338001, 0.03661938298171974)

In [13]:
# let's run LASSO with baseline parameters
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso.score(X_train, y_train), lasso.score(X_test, y_test)

(0.005707909027624747, 0.00807337722649093)

In [14]:
# now cross validate a LASSO model while searching for the best alpha
l_alphas = np.logspace(-3, 3, 100)
lasso_cv = LassoCV(alphas=l_alphas, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_cv.score(X_train, y_train), lasso_cv.score(X_test, y_test)

(0.007034501568992102, 0.014914697418979217)

---
#### The linear and regularization models are largely overfit and perform terribly on testing data. The cross validated Ridge model looks the best based on its R2 scores.
---

#### Take a look at KNN regression
---

In [15]:
# run baseline KNN regression model
knr = KNeighborsRegressor()
knr.fit(X_train, y_train)
knr.score(X_train, y_train), lr.score(X_test, y_test)

(0.205078172626585, -0.3623819800801562)

In [16]:
# setup a range of k neighbors and distance metric to test against
knr_params = {
    'n_neighbors': range(1, 51, 1),
    'metric': ['euclidean', 'manhattan']
}

# grid search on these metrics
knr_gridsearch = GridSearchCV(estimator=KNeighborsRegressor(),
                              param_grid=knr_params,
                              cv=5)

# fit the model based on the parameters above
knr_gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(1, 51)})

In [17]:
# display the best parameters
knr_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 48}

In [18]:
# let's look at the R2 scores for this model
knr_gridsearch.score(X_train, y_train), knr_gridsearch.score(X_test, y_test)

(0.006301937103527888, 0.019324494870144426)

---
#### The baseline model is extremely overfit based on the negative R2 testing score. After searching for and using the best parameters, we still see an overfit model.
---

#### How does a Decision Tree look? Consider Random Forest and AdaBoost
---

In [19]:
# let's look at the baseline Random Forest before grid searching
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.7706310547476004, -0.10980695267417806)

In [20]:
# set up a range of parameter to check against
grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 3, 5, 7]
}

# run the grid search
gs = GridSearchCV(rfr, param_grid=grid, cv=5)

# fit the model based on these parameters
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [None, 3, 5, 7],
                         'min_samples_split': [2, 3, 5, 7],
                         'n_estimators': [100, 150, 200, 250, 300]})

In [21]:
# what are the best parameters?
gs.best_params_

{'max_depth': 3, 'min_samples_split': 3, 'n_estimators': 200}

In [22]:
# what are the R2 scores for the data?
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.170594773658876, -0.010444377095165214)

In [23]:
# set up and run the AdaBoost model using default values
abr = AdaBoostRegressor(random_state=42, n_estimators=100)
abr.fit(X_train, y_train)
abr.score(X_train, y_train), abr.score(X_test, y_test)

(0.09906426857066242, -0.036380750023343955)

---
#### The baseline model for Random Forest has the best training score of all models checked. However, the testing score is negative and indicates overfitting. When grid searching and cross validating, we see the training and testing score come much closer together, but we still see a negative testing R2 score. AdaBoost did not perform better than Random Forest, so we will not pursue this any further considering its inherent time consumption.
---

#### Load in Week 10 player list to run predictions against
---

In [24]:
# read in the week 10 data
qb_pred = pd.read_csv('../data/week_10_QB_preds.csv')
qb_pred.head()

Unnamed: 0,player,team,opp_home,opp,week
0,Tom Brady,TAM,1,WAS,10
1,Lamar Jackson,BAL,1,MIA,10
2,Josh Allen,BUF,1,NYJ,10
3,Matthew Stafford,LAR,1,SFO,10
4,Dak Prescott,DAL,0,ATL,10


In [25]:
# drop the player name and use this dataframe to run predictions on
qb_pred_test = qb_pred.drop(columns='player')
qb_pred_test.head()

Unnamed: 0,team,opp_home,opp,week
0,TAM,1,WAS,10
1,BAL,1,MIA,10
2,BUF,1,NYJ,10
3,LAR,1,SFO,10
4,DAL,0,ATL,10


In [26]:
# take note of the shape
qb_pred_test.shape

(46, 4)

In [27]:
# dummify the offensive team
qb_pred_test = pd.get_dummies(qb_pred_test, columns=['team'], drop_first=True)
qb_pred_test.head()

Unnamed: 0,opp_home,opp,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CLE,team_DAL,team_DEN,...,team_NOR,team_NWE,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,1,WAS,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,MIA,10,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,NYJ,10,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,SFO,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,ATL,10,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# dummify the opponent
qb_pred_test = pd.get_dummies(qb_pred_test, columns=['opp'], drop_first=True)
qb_pred_test.head()

Unnamed: 0,opp_home,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CLE,team_DAL,team_DEN,team_DET,...,opp_NOR,opp_NWE,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,1,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,10,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,10,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,10,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# now observe the shape and realize we need to get to 64 columns
qb_pred_test.shape

(46, 56)

In [30]:
# which columns are we missing
qb_pred_test.columns

Index(['opp_home', 'week', 'team_ATL', 'team_BAL', 'team_BUF', 'team_CAR',
       'team_CLE', 'team_DAL', 'team_DEN', 'team_DET', 'team_GNB', 'team_IND',
       'team_JAX', 'team_KAN', 'team_LAC', 'team_LAR', 'team_LVR', 'team_MIA',
       'team_MIN', 'team_NOR', 'team_NWE', 'team_NYJ', 'team_PHI', 'team_PIT',
       'team_SEA', 'team_SFO', 'team_TAM', 'team_TEN', 'team_WAS', 'opp_ATL',
       'opp_BAL', 'opp_BUF', 'opp_CAR', 'opp_CLE', 'opp_DAL', 'opp_DEN',
       'opp_DET', 'opp_GNB', 'opp_IND', 'opp_JAX', 'opp_KAN', 'opp_LAC',
       'opp_LAR', 'opp_LVR', 'opp_MIA', 'opp_MIN', 'opp_NOR', 'opp_NWE',
       'opp_NYJ', 'opp_PHI', 'opp_PIT', 'opp_SEA', 'opp_SFO', 'opp_TAM',
       'opp_TEN', 'opp_WAS'],
      dtype='object')

In [31]:
# of course, we are missing the teams that are on bye week
qb_pred_test['team_CHI'] = 0
qb_pred_test['opp_CHI'] = 0
qb_pred_test['team_CIN'] = 0
qb_pred_test['opp_CIN'] = 0
qb_pred_test['team_HOU'] = 0
qb_pred_test['opp_HOU'] = 0
qb_pred_test['team_NYG'] = 0
qb_pred_test['opp_NYG'] = 0

#### Apply predictions to the Player Matchups for Week 10
---

In [32]:
# add columns for all of the predictions of interest (best performing models) if we can even say that...
qb_pred['pass_preds_ridgeCV'] = ridge_cv.predict(qb_pred_test)
qb_pred['pass_preds_knr_gs'] = knr_gridsearch.predict(qb_pred_test)
qb_pred['pass_preds_rfr'] = rfr.predict(qb_pred_test)
qb_pred['pass_preds_rfr_gs'] = gs.predict(qb_pred_test)

In [33]:
qb_pred.sort_values(by='pass_preds_ridgeCV', ascending=False).head(20)

Unnamed: 0,player,team,opp_home,opp,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
28,Jacoby Brissett,MIA,0,BAL,10,18.547699,16.58125,22.61475,22.227673
16,Tua Tagovailoa,MIA,0,BAL,10,18.547699,16.58125,22.61475,22.227673
15,Kirk Cousins,MIN,1,LAC,10,17.427018,16.335417,22.728,18.615212
35,Tyler Huntley,BAL,1,MIA,10,17.008973,15.71875,17.201583,17.215674
1,Lamar Jackson,BAL,1,MIA,10,17.008973,15.71875,17.201583,17.215674
5,Justin Herbert,LAC,0,MIN,10,16.716927,16.375,19.0019,18.95801
33,Jordan Love,GNB,0,SEA,10,16.597752,16.775,20.469833,16.735848
11,Aaron Rodgers,GNB,0,SEA,10,16.597752,16.775,20.469833,16.735848
40,Zach Wilson,NYJ,0,BUF,10,16.255803,16.239583,16.447333,16.400536
31,Mike White,NYJ,0,BUF,10,16.255803,16.239583,16.447333,16.400536


---
#### Remember that the top 5 worst defenses by DK points allowed are WAS > KAN > IND > DAL > MIA 

#### RidgeCV has 4 of those top 5 in its top 10 for passing points
#### KNNR GS has 1 of those top 5 in its top 10 for passing points
#### RndFor has 2 of those top 5 in its top 10 for passing points
#### RF GS has 3 of those top 5 in its top 10 for passing points

#### If we are using the null model as a baseline like above, then we should consider RidgeCV as our best model to compare with the actual results from week 10.
---

#### Load in Week 10 player results and null model to compare predictions against
---

In [34]:
# read in week 10 QB results and simplify columns
qb_results = pd.read_csv('../data/week_10_QB_results.csv', index_col=[0])
qb_results = qb_results[['player','team','opp_home','opp','DK_pt']]
qb_results.head()

Unnamed: 0,player,team,opp_home,opp,DK_pt
0,Patrick Mahomes,KAN,1.0,LVR,39.2
1,Dak Prescott,DAL,0.0,ATL,26.3
2,Josh Allen,BUF,1.0,NYJ,24.9
3,Mac Jones,NWE,0.0,CLE,19.9
4,Trevor Siemian,NOR,1.0,TEN,19.9


In [35]:
# read in null model (season average) versus QB data set and simplify columns
qb_null = pd.read_csv('../data/DEF_QB.csv', index_col=[0])
qb_null = qb_null[['opp','DK_ptg']]
qb_null.head()

Unnamed: 0,opp,DK_ptg
0,WAS,26.8
1,KAN,24.0
2,IND,22.9
13,DAL,22.6
3,MIA,22.1


In [36]:
# merge the data frames on opponent and only view the following columns
qb_final = qb_results.merge(qb_null, on='opp', how='outer')
qb_final = pd.DataFrame(data=qb_final, columns=['player','team','DK_pt','opp_home','opp','DK_ptg'])
qb_final.head()

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg
0,Patrick Mahomes,KAN,39.2,1.0,LVR,18.1
1,Dak Prescott,DAL,26.3,0.0,ATL,21.4
2,Cooper Rush,DAL,0.1,0.0,ATL,21.4
3,Josh Allen,BUF,24.9,1.0,NYJ,19.5
4,Mac Jones,NWE,19.9,0.0,CLE,20.6


In [37]:
# now merge with the predictions on the common columns between them
qb_final = qb_final.merge(qb_pred, on=['player','team','opp_home','opp'], how='outer')
qb_final.head()

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
0,Patrick Mahomes,KAN,39.2,1.0,LVR,18.1,10.0,14.279233,16.166667,10.95675,15.895607
1,Dak Prescott,DAL,26.3,0.0,ATL,21.4,10.0,14.884384,16.4125,16.3035,16.257656
2,Cooper Rush,DAL,0.1,0.0,ATL,21.4,10.0,14.884384,16.4125,16.3035,16.257656
3,Josh Allen,BUF,24.9,1.0,NYJ,19.5,10.0,15.619325,16.85625,18.45775,17.297084
4,Mac Jones,NWE,19.9,0.0,CLE,20.6,10.0,15.370614,16.004167,17.5949,16.576417


In [40]:
# view actual week 10 results with the model predictions
qb_final.sort_values(by='DK_pt', ascending=False).head(35)

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
0,Patrick Mahomes,KAN,39.2,1.0,LVR,18.1,10.0,14.279233,16.166667,10.95675,15.895607
1,Dak Prescott,DAL,26.3,0.0,ATL,21.4,10.0,14.884384,16.4125,16.3035,16.257656
3,Josh Allen,BUF,24.9,1.0,NYJ,19.5,10.0,15.619325,16.85625,18.45775,17.297084
4,Mac Jones,NWE,19.9,0.0,CLE,20.6,10.0,15.370614,16.004167,17.5949,16.576417
6,Trevor Siemian,NOR,19.9,1.0,TEN,21.3,10.0,15.359597,15.697917,13.936,16.580364
8,Jalen Hurts,PHI,19.4,1.0,DEN,16.2,10.0,13.311139,15.872917,13.5713,14.779236
9,Derek Carr,LVR,19.2,0.0,KAN,24.0,10.0,16.018417,16.052083,14.573533,16.408825
10,Ryan Tannehill,TEN,18.6,0.0,NOR,20.0,10.0,14.79785,16.004167,15.830817,16.340925
11,Kirk Cousins,MIN,18.5,1.0,LAC,16.2,10.0,17.427018,16.335417,22.728,18.615212
12,Lamar Jackson,BAL,16.4,1.0,MIA,22.1,10.0,17.008973,15.71875,17.201583,17.215674


---
#### Looking at the top 10 QB points for week 10, we see that only 2 of the top 5 null model defensive points allowed made the top 10 actual scores. The top actual score was against a defense that allowed the 25th most points to QBs. Let's list out our top 5 by our top models and see where it ranked with actual QB point totals.

|RidgeCV||RndFor GS||
|---|---|---|---|
|Pred Opp|Actual Opp|Pred Opp|Actual Opp|
|BAL - 18.5|8th - 18.9|BAL - 22.5|8th - 18.9| 
|LAC - 17.4|10th - 18.5|MIN - 19.0|15th - 13.0|
|MIA - 17.0|11th - 16.4|LAC - 18.8|10th - 18.5|
|MIN - 16.7|15th - 13.0|MIA - 17.7|11th - 16.4|
|SEA - 16.6|18th - 11.5|NYJ - 17.6|3rd - 24.9|

#### To be fair Miami played two QBs due to injury for a total of 12.3 + 6.6 = 18.9 total points, so our total for Miami against BAL would have been 18.9 and bumped this ranking to 8th best for the week. I made that change to the table and bumped all other rankings down by one under 8th place. The RidgeCV model actually predicts the actual best outcomes in order, while the Random Forest GS predicts 4 out of 5 the same from RidgeCV but out of order. However, its 5th best prediction was actually the 3rd best actual performance.
---