## Imports
---

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_val_predict

#### Load the season long data set for WR statistics and DK points from weeks 1 through 9
---

In [2]:
# read in data set
wr_game = pd.read_csv('../data/WR_season_9.csv', index_col=[0])
wr_game

Unnamed: 0,player,pos,team,opp_home,opp,game,week,day,rec_tgt,rec,...,pass_yds,pass_td,pass_int,rush_att,rush_yds,rush_td,fmb,team_win,team_score,opp_score
0,Tyreek Hill,WR,KAN,1.0,PHI,4,4,Sun,12,11,...,0,0,0,0,0,0,0,1,42,30
1,Amari Cooper,WR,DAL,1.0,TAM,1,1,Thu,17,13,...,0,0,0,0,0,0,0,0,29,31
2,Davante Adams,WR,GNB,1.0,CIN,5,5,Sun,16,11,...,0,0,0,0,0,0,0,1,25,22
3,Cooper Kupp,WR,LAR,0.0,DET,7,7,Sun,13,10,...,0,0,0,0,0,0,0,1,28,19
4,Tyreek Hill,WR,KAN,0.0,CLE,1,1,Sun,15,11,...,0,0,0,1,4,0,0,1,33,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,James Washington,WR,PIT,0.0,LVR,2,2,Sun,1,0,...,0,0,0,0,0,0,0,0,17,26
1082,Dede Westbrook,WR,MIN,1.0,BAL,8,9,Sun,1,0,...,0,0,0,0,0,0,0,0,31,34
1083,Cody White,WR,PIT,0.0,DEN,5,5,Sun,1,0,...,0,0,0,0,0,0,0,1,27,19
1084,Kevin White,WR,NOR,1.0,SEA,6,7,Mon,2,0,...,0,0,0,0,0,0,0,1,13,10


In [3]:
# take note of the columns before dropping most of them...
wr_game.columns

Index(['player', 'pos', 'team', 'opp_home', 'opp', 'game', 'week', 'day',
       'rec_tgt', 'rec', 'rec_yds', 'rec_ydr', 'rec_td', 'rec_pct',
       'rec_ydtgt', 'dk_pt', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td',
       'pass_int', 'rush_att', 'rush_yds', 'rush_td', 'fmb', 'team_win',
       'team_score', 'opp_score'],
      dtype='object')

In [4]:
# we will only be using team, opponent, location, and which week the game was played
X = wr_game[['team','opp_home','opp','week']]
y = wr_game['dk_pt']

In [5]:
# dummify the offensive team
X = pd.get_dummies(X, columns=['team'], drop_first=True)
X

Unnamed: 0,opp_home,opp,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CHI,team_CIN,team_CLE,...,team_NWE,team_NYG,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,1.0,PHI,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,TAM,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,CIN,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,DET,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,CLE,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,0.0,LVR,2,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1082,1.0,BAL,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1083,0.0,DEN,5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1084,1.0,SEA,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# dummify the opponent
X = pd.get_dummies(X, columns=['opp'], drop_first=True)
X

Unnamed: 0,opp_home,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CHI,team_CIN,team_CLE,team_DAL,...,opp_NWE,opp_NYG,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,1.0,4,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1.0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,1.0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,0.0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1082,1.0,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1083,0.0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1084,1.0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [7]:
# take note of the shape before splitting the data
X.shape, y.shape

((1086, 64), (1086,))

In [8]:
# split our data into training and testing with a 70/30 split and random state of 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((760, 64), (760,), (326, 64), (326,))

#### Look at linear regression and regularization models first
---

In [9]:
# set up and run Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.08872438410501826, -0.03191674261917132)

In [10]:
# let's run ridge with baseline parameters
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge.score(X_train, y_train), ridge.score(X_test, y_test)

(0.08813901265707658, -0.02445650098092167)

In [11]:
# now cross validate a Ridge model while searching for the best alpha and score over R2
r_alphas = np.logspace(0, 5, 100)
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)
ridge_cv.fit(X_train, y_train)
ridge_cv.score(X_train, y_train), ridge_cv.score(X_test, y_test)

(0.0009828195642106463, 0.0009274212502097257)

In [12]:
# let's run LASSO with baseline parameters
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso.score(X_train, y_train), lasso.score(X_test, y_test)

(0.003084239117657872, 0.0037792572506032407)

In [13]:
# now cross validate a LASSO model while searching for the best alpha
l_alphas = np.logspace(-3, 3, 100)
lasso_cv = LassoCV(alphas=l_alphas, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_cv.score(X_train, y_train), lasso_cv.score(X_test, y_test)

(0.0, -0.0002474426122167639)

---
#### The linear and regularization models are largely overfit and perform terribly on testing data. All models hug zero very closely as these models perform the worst out of the other position data.
---

#### Take a look at KNN regression
---

In [14]:
# run baseline KNN regression model
knr = KNeighborsRegressor()
knr.fit(X_train, y_train)
knr.score(X_train, y_train), lr.score(X_test, y_test)

(0.19887024496839778, -0.03191674261917132)

In [15]:
# setup a range of k neighbors and distance metric to test against
knr_params = {
    'n_neighbors': range(1, 51, 1),
    'metric': ['euclidean', 'manhattan']
}

# grid search on these metrics
knr_gridsearch = GridSearchCV(estimator=KNeighborsRegressor(),
                              param_grid=knr_params,
                              cv=5)

# fit the model based on the parameters above
knr_gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(1, 51)})

In [16]:
# display the best parameters
knr_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 39}

In [17]:
# let's look at the R2 scores for this model
knr_gridsearch.score(X_train, y_train), knr_gridsearch.score(X_test, y_test)

(0.026863749114852786, 0.006020851193347498)

---
#### The baseline model is overfit based on the negative R2 testing score. After searching for and using the best parameters, we still see the training and testing become closer but still remain close to zero.
---

#### How does a Decision Tree look? Consider Random Forest and AdaBoost
---

In [18]:
# let's look at the baseline Random Forest before grid searching
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.3378534106430937, -0.6019150509192601)

In [19]:
# set up a range of parameter to check against
grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 3, 5, 7]
}

# run the grid search
gs = GridSearchCV(rfr, param_grid=grid, cv=5)

# fit the model based on these parameters
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [None, 3, 5, 7],
                         'min_samples_split': [2, 3, 5, 7],
                         'n_estimators': [100, 150, 200, 250, 300]})

In [20]:
# what are the best parameters?
gs.best_params_

{'max_depth': 3, 'min_samples_split': 7, 'n_estimators': 250}

In [21]:
# what are the R2 scores for the data?
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.0671661427721062, 0.006112251469108099)

In [22]:
# set up and run the AdaBoost model using default values
abr = AdaBoostRegressor(random_state=42, n_estimators=100)
abr.fit(X_train, y_train)
abr.score(X_train, y_train), abr.score(X_test, y_test)

(-0.08355075657320543, -0.28038574419845186)

---
#### The baseline model for Random Forest has the best training score of all models checked. However, the testing score is massively negative and indicates overfitting. When grid searching and cross validating, we see the training and testing score come much closer together, but they remain close to zero. AdaBoost did not perform better than Random Forest, so we will not pursue this any further considering its inherent time consumption.
---

#### Load in Week 10 player list to run predictions against
---

In [23]:
# read in the week 10 data
wr_pred = pd.read_csv('../data/week_10_WR_preds.csv')
wr_pred.head()

Unnamed: 0,player,team,opp_home,opp,week
0,Cooper Kupp,LAR,1,SFO,10
1,Deebo Samuel,SFO,0,LAR,10
2,Marquise Brown,BAL,1,MIA,10
3,Davante Adams,GNB,0,SEA,10
4,D.K. Metcalf,SEA,1,GNB,10


In [24]:
# drop the player name and use this dataframe to run predictions on
wr_pred_test = wr_pred.drop(columns='player')
wr_pred_test.head()

Unnamed: 0,team,opp_home,opp,week
0,LAR,1,SFO,10
1,SFO,0,LAR,10
2,BAL,1,MIA,10
3,GNB,0,SEA,10
4,SEA,1,GNB,10


In [25]:
# take note of the shape
wr_pred_test.shape

(149, 4)

In [26]:
# dummify the offensive team
wr_pred_test = pd.get_dummies(wr_pred_test, columns=['team'], drop_first=True)
wr_pred_test.head()

Unnamed: 0,opp_home,opp,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CLE,team_DAL,team_DEN,...,team_NOR,team_NWE,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,1,SFO,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,LAR,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,MIA,10,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,SEA,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,GNB,10,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [27]:
# dummify the opponent
wr_pred_test = pd.get_dummies(wr_pred_test, columns=['opp'], drop_first=True)
wr_pred_test.head()

Unnamed: 0,opp_home,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CLE,team_DAL,team_DEN,team_DET,...,opp_NOR,opp_NWE,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,1,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,10,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# now observe the shape and realize we need to get to 64 columns
wr_pred_test.shape

(149, 56)

In [29]:
# which columns are we missing
wr_pred_test.columns

Index(['opp_home', 'week', 'team_ATL', 'team_BAL', 'team_BUF', 'team_CAR',
       'team_CLE', 'team_DAL', 'team_DEN', 'team_DET', 'team_GNB', 'team_IND',
       'team_JAX', 'team_KAN', 'team_LAC', 'team_LAR', 'team_LVR', 'team_MIA',
       'team_MIN', 'team_NOR', 'team_NWE', 'team_NYJ', 'team_PHI', 'team_PIT',
       'team_SEA', 'team_SFO', 'team_TAM', 'team_TEN', 'team_WAS', 'opp_ATL',
       'opp_BAL', 'opp_BUF', 'opp_CAR', 'opp_CLE', 'opp_DAL', 'opp_DEN',
       'opp_DET', 'opp_GNB', 'opp_IND', 'opp_JAX', 'opp_KAN', 'opp_LAC',
       'opp_LAR', 'opp_LVR', 'opp_MIA', 'opp_MIN', 'opp_NOR', 'opp_NWE',
       'opp_NYJ', 'opp_PHI', 'opp_PIT', 'opp_SEA', 'opp_SFO', 'opp_TAM',
       'opp_TEN', 'opp_WAS'],
      dtype='object')

In [30]:
# of course, we are missing the teams that are on bye week
wr_pred_test['team_CHI'] = 0
wr_pred_test['opp_CHI'] = 0
wr_pred_test['team_CIN'] = 0
wr_pred_test['opp_CIN'] = 0
wr_pred_test['team_HOU'] = 0
wr_pred_test['opp_HOU'] = 0
wr_pred_test['team_NYG'] = 0
wr_pred_test['opp_NYG'] = 0

#### Apply predictions to the Player Matchups for Week 10
---

In [31]:
# add columns for all of the predictions of interest (best performing models) if we can even say that...
wr_pred['pass_preds_ridgeCV'] = ridge_cv.predict(wr_pred_test)
wr_pred['pass_preds_knr_gs'] = knr_gridsearch.predict(wr_pred_test)
wr_pred['pass_preds_rfr'] = rfr.predict(wr_pred_test)
wr_pred['pass_preds_rfr_gs'] = gs.predict(wr_pred_test)

In [32]:
wr_pred.sort_values(by='pass_preds_knr_gs', ascending=False).head(10)

Unnamed: 0,player,team,opp_home,opp,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
63,Russell Gage,ATL,1,DAL,10,9.039333,9.266667,8.127578,8.678043
136,Christian Blake,ATL,1,DAL,10,9.039333,9.266667,8.127578,8.678043
67,Olamide Zaccheaus,ATL,1,DAL,10,9.039333,9.266667,8.127578,8.678043
83,Tajae Sharpe,ATL,1,DAL,10,9.039333,9.266667,8.127578,8.678043
34,Deonte Harris,NOR,1,TEN,10,9.040371,9.089744,10.47816,8.715453
121,Kevin White,NOR,1,TEN,10,9.040371,9.089744,10.47816,8.715453
96,Kenny Stills,NOR,1,TEN,10,9.040371,9.089744,10.47816,8.715453
30,Marquez Callaway,NOR,1,TEN,10,9.040371,9.089744,10.47816,8.715453
71,Tre'Quan Smith,NOR,1,TEN,10,9.040371,9.089744,10.47816,8.715453
106,Ty Montgomery,NOR,1,TEN,10,9.040371,9.089744,10.47816,8.715453


---
#### Remember that the top 5 worst defenses by DK points allowed are TEN > WAS > MIA > IND > MIN  

#### RidgeCV has 2 of those top 5 in its top 10 for wide receiver points
#### KNNR GS has 2 of those top 5 in its top 10 for wide receiver points 
#### RndFor has 3 of those top 5 in its top 10 for wide receiver points 
#### RF GS has 2 of those top 5 in its top 10 for wide receiver points 

#### If we are using the null model as a baseline like above, then we should consider Random Forest as our best model to compare with the actual results from week 10 as well as RidgeCV or KNNR GS.
---

#### Load in Week 10 player results and null model to compare predictions against
---

In [33]:
# read in week 10 TE results and simplify columns
wr_results = pd.read_csv('../data/week_10_WR_results.csv', index_col=[0])
wr_results = wr_results[['player','team','opp_home','opp','DK_pt']]
wr_results.head()

Unnamed: 0,player,team,opp_home,opp,DK_pt
0,Stefon Diggs,BUF,1.0,NYJ,33.2
1,CeeDee Lamb,DAL,0.0,ATL,28.6
2,Tyreek Hill,KAN,1.0,LVR,27.5
3,Justin Jefferson,MIN,1.0,LAC,25.9
4,Kendrick Bourne,NWE,0.0,CLE,24.1


In [34]:
# read in null model (season average) versus TE data set and simplify columns
wr_null = pd.read_csv('../data/DEF_WR.csv', index_col=[0])
wr_null = wr_null[['opp','DK_ptg']]
wr_null.head()

Unnamed: 0,opp,DK_ptg
0,TEN,48.0
4,WAS,43.8
2,MIA,43.7
1,IND,43.1
7,MIN,42.6


In [35]:
# merge the data frames on opponent and only view the following columns
wr_final = wr_results.merge(wr_null, on='opp', how='outer')
wr_final = pd.DataFrame(data=wr_final, columns=['player','team','DK_pt','opp_home','opp','DK_ptg'])
wr_final.head()

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg
0,Stefon Diggs,BUF,33.2,1.0,NYJ,32.1
1,Gabriel Davis,BUF,16.5,1.0,NYJ,32.1
2,Isaiah McKenzie,BUF,9.9,1.0,NYJ,32.1
3,Emmanuel Sanders,BUF,7.1,1.0,NYJ,32.1
4,Cole Beasley,BUF,3.5,1.0,NYJ,32.1


In [36]:
# now merge with the predictions on the common columns between them
wr_final = wr_final.merge(wr_pred, on=['player','team','opp_home','opp'], how='outer')
wr_final.head()

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
0,Stefon Diggs,BUF,33.2,1.0,NYJ,32.1,10.0,9.039381,8.210256,7.623656,8.386541
1,Gabriel Davis,BUF,16.5,1.0,NYJ,32.1,10.0,9.039381,8.210256,7.623656,8.386541
2,Isaiah McKenzie,BUF,9.9,1.0,NYJ,32.1,10.0,9.039381,8.210256,7.623656,8.386541
3,Emmanuel Sanders,BUF,7.1,1.0,NYJ,32.1,10.0,9.039381,8.210256,7.623656,8.386541
4,Cole Beasley,BUF,3.5,1.0,NYJ,32.1,10.0,9.039381,8.210256,7.623656,8.386541


In [39]:
# view actual week 10 results with the model predictions
wr_final.sort_values(by='DK_pt', ascending=False).head(30)

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
0,Stefon Diggs,BUF,33.2,1.0,NYJ,32.1,10.0,9.039381,8.210256,7.623656,8.386541
5,CeeDee Lamb,DAL,28.6,0.0,ATL,36.4,10.0,9.038566,7.392308,11.655864,8.960273
10,Tyreek Hill,KAN,27.5,1.0,LVR,36.5,10.0,9.039286,8.289744,8.562474,8.613559
15,Justin Jefferson,MIN,25.9,1.0,LAC,28.4,10.0,9.041015,8.671795,11.297943,10.894378
18,Kendrick Bourne,NWE,24.1,0.0,CLE,36.4,10.0,9.036019,6.789744,7.666658,8.435542
22,DeVonta Smith,PHI,22.6,1.0,DEN,34.9,10.0,9.036853,6.925641,3.740668,7.422189
25,Marcus Johnson,TEN,18.0,0.0,NOR,42.6,10.0,9.037924,7.730769,10.80122,8.879406
30,Keenan Allen,LAC,17.8,0.0,MIN,42.6,10.0,9.038625,7.851282,9.812981,9.054879
33,Bryan Edwards,LVR,17.8,0.0,KAN,34.8,10.0,9.039494,7.261538,6.593808,8.87539
34,Hunter Renfrow,LVR,17.6,0.0,KAN,34.8,10.0,9.039494,7.261538,6.593808,8.87539


---
#### Looking at the top 10 WR points for week 10, we see that only 1 of the top 5 null model defensive points allowed made the top 10 actual scores. The top actual score was against a defense that allowed the 27th most points to WRs. Let's list out our top 5 by our top models and see where it ranked with actual WR point totals.

|KNNR GS||Random Forest||
|---|---|---|---|
|Pred Opp|Actual Opp|Pred Opp|Actual Opp|
|DAL - 9.3|63rd - 4.2|MIN - 12.1|8th - 17.8| 
|TEN - 9.1|18th - 14.4|LAC - 11.9|4th - 25.9|
|MIA - 9.0|21st - 14.0|BAL - 11.7|17th - 14.6|
|GNB - 9.0|49th - 5.6|PIT - 11.5|36th - 10.1|
|LAC - 8.7|4th - 25.9|GNB - 10.9|49th - 5.6|

#### Both models perform poorly overall with both models predicting the 8th best in their top 5 and RFR additionally picking 8th best performance as it highest predicted performer. The Wide Receiver data set is the largest dataset and as a result yielded the worst results. The higher the data count the worse the model is. 
---