## Imports
---

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_val_predict

#### Load the season long data set for TE statistics and DK points from weeks 1 through 9
---

In [2]:
# read in data set
te_game = pd.read_csv('../data/TE_season_9.csv', index_col=[0])
te_game

Unnamed: 0,player,pos,team,opp_home,opp,game,week,day,rec_tgt,rec,...,pass_yds,pass_td,pass_int,rush_att,rush_yds,rush_td,fmb,team_win,team_score,opp_score
0,Mark Andrews,TE,BAL,0.0,IND,5,5,Mon,13,11,...,0,0,0,0,0,0,0,1,31,25
1,David Njoku,TE,CLE,1.0,LAC,5,5,Sun,7,7,...,0,0,0,0,0,0,0,0,42,47
2,Kyle Pitts,TE,ATL,0.0,NYJ,5,5,Sun,10,9,...,0,0,0,0,0,0,0,1,27,20
3,Darren Waller,TE,LVR,0.0,BAL,1,1,Mon,19,10,...,0,0,0,0,0,0,0,1,33,27
4,Rob Gronkowski,TE,TAM,0.0,DAL,1,1,Thu,8,8,...,0,0,0,0,0,0,0,1,31,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,Tommy Tremble,TE,CAR,1.0,DAL,4,4,Sun,1,0,...,0,0,0,0,0,0,0,0,28,36
537,C.J. Uzomah,TE,CIN,1.0,PIT,3,3,Sun,1,0,...,0,0,0,0,0,0,0,1,24,10
538,Trevon Wesco,TE,NYJ,0.0,TEN,4,4,Sun,1,0,...,0,0,0,0,0,0,0,1,27,24
539,Trevon Wesco,TE,NYJ,1.0,NWE,6,7,Sun,1,0,...,0,0,0,0,0,0,0,0,13,54


In [3]:
# take note of the columns before dropping most of them...
te_game.columns

Index(['player', 'pos', 'team', 'opp_home', 'opp', 'game', 'week', 'day',
       'rec_tgt', 'rec', 'rec_yds', 'rec_ydr', 'rec_td', 'rec_pct',
       'rec_ydtgt', 'dk_pt', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td',
       'pass_int', 'rush_att', 'rush_yds', 'rush_td', 'fmb', 'team_win',
       'team_score', 'opp_score'],
      dtype='object')

In [4]:
# we will only be using team, opponent, location, and which week the game was played
X = te_game[['team','opp_home','opp','week']]
y = te_game['dk_pt']

In [5]:
# dummify the offensive team
X = pd.get_dummies(X, columns=['team'], drop_first=True)
X

Unnamed: 0,opp_home,opp,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CHI,team_CIN,team_CLE,...,team_NWE,team_NYG,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,0.0,IND,5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,LAC,5,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0.0,NYJ,5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,BAL,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,DAL,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,1.0,DAL,4,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537,1.0,PIT,3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
538,0.0,TEN,4,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
539,1.0,NWE,7,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [6]:
# dummify the opponent
X = pd.get_dummies(X, columns=['opp'], drop_first=True)
X

Unnamed: 0,opp_home,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CHI,team_CIN,team_CLE,team_DAL,...,opp_NWE,opp_NYG,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,0.0,5,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,5,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,1.0,4,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537,1.0,3,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
538,0.0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
539,1.0,7,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [7]:
# take note of the shape before splitting the data
X.shape, y.shape

((541, 64), (541,))

In [8]:
# split our data into training and testing with a 70/30 split and random state of 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((378, 64), (378,), (163, 64), (163,))

#### Look at linear regression and regularization models first
---

In [9]:
# set up and run Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.23327596899104097, -0.08455489669493055)

In [10]:
# let's run ridge with baseline parameters
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge.score(X_train, y_train), ridge.score(X_test, y_test)

(0.23126380592979856, -0.06542024090602161)

In [11]:
# now cross validate a Ridge model while searching for the best alpha and score over R2
r_alphas = np.logspace(0, 5, 100)
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)
ridge_cv.fit(X_train, y_train)
ridge_cv.score(X_train, y_train), ridge_cv.score(X_test, y_test)

(0.13841984273756391, 0.007659139859432562)

In [12]:
# let's run LASSO with baseline parameters
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso.score(X_train, y_train), lasso.score(X_test, y_test)

(0.0, -0.005332199876449284)

In [13]:
# now cross validate a LASSO model while searching for the best alpha
l_alphas = np.logspace(-3, 3, 100)
lasso_cv = LassoCV(alphas=l_alphas, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_cv.score(X_train, y_train), lasso_cv.score(X_test, y_test)

(0.1111688995842276, 0.0030518133097791056)

---
#### The linear and regularization models are largely overfit and perform terribly on testing data. The cross validated Ridge and LASSO models do yield a positive testing R2 score with the Ridge training and testing scores slightly higher than the LASSO model.
---

#### Take a look at KNN regression
---

In [14]:
# run baseline KNN regression model
knr = KNeighborsRegressor()
knr.fit(X_train, y_train)
knr.score(X_train, y_train), lr.score(X_test, y_test)

(0.20381723870185553, -0.08455489669493055)

In [15]:
# setup a range of k neighbors and distance metric to test against
knr_params = {
    'n_neighbors': range(1, 51, 1),
    'metric': ['euclidean', 'manhattan']
}

# grid search on these metrics
knr_gridsearch = GridSearchCV(estimator=KNeighborsRegressor(),
                              param_grid=knr_params,
                              cv=5)

# fit the model based on the parameters above
knr_gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(1, 51)})

In [16]:
# display the best parameters
knr_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 41}

In [17]:
# let's look at the R2 scores for this model
knr_gridsearch.score(X_train, y_train), knr_gridsearch.score(X_test, y_test)

(0.027007309195751672, -0.016446776689680753)

---
#### The baseline model is overfit based on the negative R2 testing score. After searching for and using the best parameters, we still see an overfit model with negative R2 score for testing data.
---

#### How does a Decision Tree look? Consider Random Forest and AdaBoost
---

In [18]:
# let's look at the baseline Random Forest before grid searching
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.5943472865282087, -0.23552816955208256)

In [19]:
# set up a range of parameter to check against
grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 3, 5, 7]
}

# run the grid search
gs = GridSearchCV(rfr, param_grid=grid, cv=5)

# fit the model based on these parameters
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [None, 3, 5, 7],
                         'min_samples_split': [2, 3, 5, 7],
                         'n_estimators': [100, 150, 200, 250, 300]})

In [20]:
# what are the best parameters?
gs.best_params_

{'max_depth': 3, 'min_samples_split': 7, 'n_estimators': 250}

In [21]:
# what are the R2 scores for the data?
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.13595435189537708, -0.02640315196702603)

In [22]:
# set up and run the AdaBoost model using default values
abr = AdaBoostRegressor(random_state=42, n_estimators=100)
abr.fit(X_train, y_train)
abr.score(X_train, y_train), abr.score(X_test, y_test)

(-0.14090243682937564, -0.08668882482897833)

---
#### The baseline model for Random Forest has the best training score of all models checked. However, the testing score is considerably negative and indicates overfitting. When grid searching and cross validating, we see the training and testing score come much closer together, but we still see a negative testing R2 score. AdaBoost did not perform better than Random Forest, so we will not pursue this any further considering its inherent time consumption.
---

#### Load in Week 10 player list to run predictions against
---

In [23]:
# read in the week 10 data
te_pred = pd.read_csv('../data/week_10_TE_preds.csv')
te_pred.head()

Unnamed: 0,player,team,opp_home,opp,week
0,Travis Kelce,KAN,1,LVR,10
1,Mark Andrews,BAL,1,MIA,10
2,Rob Gronkowski,TAM,1,WAS,10
3,Dawson Knox,BUF,1,NYJ,10
4,Darren Waller,LVR,0,KAN,10


In [24]:
# drop the player name and use this dataframe to run predictions on
te_pred_test = te_pred.drop(columns='player')
te_pred_test.head()

Unnamed: 0,team,opp_home,opp,week
0,KAN,1,LVR,10
1,BAL,1,MIA,10
2,TAM,1,WAS,10
3,BUF,1,NYJ,10
4,LVR,0,KAN,10


In [25]:
# take note of the shape
te_pred_test.shape

(72, 4)

In [26]:
# dummify the offensive team
te_pred_test = pd.get_dummies(te_pred_test, columns=['team'], drop_first=True)
te_pred_test.head()

Unnamed: 0,opp_home,opp,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CLE,team_DAL,team_DEN,...,team_NOR,team_NWE,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,1,LVR,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,MIA,10,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,WAS,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,NYJ,10,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,KAN,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# dummify the opponent
te_pred_test = pd.get_dummies(te_pred_test, columns=['opp'], drop_first=True)
te_pred_test.head()

Unnamed: 0,opp_home,week,team_ATL,team_BAL,team_BUF,team_CAR,team_CLE,team_DAL,team_DEN,team_DET,...,opp_NOR,opp_NWE,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,1,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,10,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,10,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# now observe the shape and realize we need to get to 64 columns
te_pred_test.shape

(72, 56)

In [29]:
# which columns are we missing
te_pred_test.columns

Index(['opp_home', 'week', 'team_ATL', 'team_BAL', 'team_BUF', 'team_CAR',
       'team_CLE', 'team_DAL', 'team_DEN', 'team_DET', 'team_GNB', 'team_IND',
       'team_JAX', 'team_KAN', 'team_LAC', 'team_LAR', 'team_LVR', 'team_MIA',
       'team_MIN', 'team_NOR', 'team_NWE', 'team_NYJ', 'team_PHI', 'team_PIT',
       'team_SEA', 'team_SFO', 'team_TAM', 'team_TEN', 'team_WAS', 'opp_ATL',
       'opp_BAL', 'opp_BUF', 'opp_CAR', 'opp_CLE', 'opp_DAL', 'opp_DEN',
       'opp_DET', 'opp_GNB', 'opp_IND', 'opp_JAX', 'opp_KAN', 'opp_LAC',
       'opp_LAR', 'opp_LVR', 'opp_MIA', 'opp_MIN', 'opp_NOR', 'opp_NWE',
       'opp_NYJ', 'opp_PHI', 'opp_PIT', 'opp_SEA', 'opp_SFO', 'opp_TAM',
       'opp_TEN', 'opp_WAS'],
      dtype='object')

In [30]:
# of course, we are missing the teams that are on bye week
te_pred_test['team_CHI'] = 0
te_pred_test['opp_CHI'] = 0
te_pred_test['team_CIN'] = 0
te_pred_test['opp_CIN'] = 0
te_pred_test['team_HOU'] = 0
te_pred_test['opp_HOU'] = 0
te_pred_test['team_NYG'] = 0
te_pred_test['opp_NYG'] = 0

#### Apply predictions to the Player Matchups for Week 10
---

In [31]:
# add columns for all of the predictions of interest (best performing models) if we can even say that...
te_pred['pass_preds_ridgeCV'] = ridge_cv.predict(te_pred_test)
te_pred['pass_preds_knr_gs'] = knr_gridsearch.predict(te_pred_test)
te_pred['pass_preds_rfr'] = rfr.predict(te_pred_test)
te_pred['pass_preds_rfr_gs'] = gs.predict(te_pred_test)

In [32]:
te_pred.sort_values(by='pass_preds_rfr', ascending=False).head(10)

Unnamed: 0,player,team,opp_home,opp,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
7,Dalton Schultz,DAL,0,ATL,10,8.235235,6.565854,18.963433,9.563833
29,Hayden Hurst,ATL,1,DAL,10,8.609348,7.482927,11.452226,8.006554
46,Lee Smith,ATL,1,DAL,10,8.609348,7.482927,11.452226,8.006554
6,Kyle Pitts,ATL,1,DAL,10,8.609348,7.482927,11.452226,8.006554
26,Jonnu Smith,NWE,0,CLE,10,7.097031,6.534146,10.802133,5.901299
11,Hunter Henry,NWE,0,CLE,10,7.097031,6.534146,10.802133,5.901299
54,Josh Oliver,BAL,1,MIA,10,7.169572,6.914634,10.424217,6.545831
1,Mark Andrews,BAL,1,MIA,10,7.169572,6.914634,10.424217,6.545831
68,Eric Tomlinson,BAL,1,MIA,10,7.169572,6.914634,10.424217,6.545831
24,Juwan Johnson,NOR,1,TEN,10,6.878244,6.129268,10.23145,9.931758


---
#### Remember that the top 5 worst defenses by DK points allowed are PHI > BAL > LAC > LAR > IND > HOU > KAN 
note: LAR played Monday night of which we do not have data and HOU is on bye in week 10. 

#### RidgeCV has 2 of those top 5 in its top 10 for tight end points
#### KNNR GS has 3 of those top 5 in its top 10 for tight end points 
#### RndFor has 2 of those top 5 in its top 10 for tight end points (did predict IND as 2nd although they rank 3rd best against rushing)
#### RF GS has 1 of those top 5 in its top 10 for tight end points (did predict BUF as 1st although they rank the best against rushing and IND as 2nd although they rank 3rd best against rushing)

#### If we are using the null model as a baseline like above, then we should consider KNNR GS as our best model to compare with the actual results from week 10 as well as RidgeCV or Random Forest.
---

#### Load in Week 10 player results and null model to compare predictions against
---

In [33]:
# read in week 10 TE results and simplify columns
te_results = pd.read_csv('../data/week_10_TE_results.csv', index_col=[0])
te_results = te_results[['player','team','opp_home','opp','DK_pt']]
te_results.head()

Unnamed: 0,player,team,opp_home,opp,DK_pt
0,Travis Kelce,KAN,1.0,LVR,22.9
1,Hunter Henry,NWE,0.0,CLE,19.7
2,Mark Andrews,BAL,1.0,MIA,18.3
3,Tyler Conklin,MIN,1.0,LAC,16.1
4,Gerald Everett,SEA,1.0,GNB,14.3


In [34]:
# read in null model (season average) versus TE data set and simplify columns
te_null = pd.read_csv('../data/DEF_TE.csv', index_col=[0])
te_null = te_null[['opp','DK_ptg']]
te_null.head()

Unnamed: 0,opp,DK_ptg
0,PHI,19.6
3,BAL,19.2
6,LAC,17.1
5,LAR,17.1
4,IND,16.9


In [35]:
# merge the data frames on opponent and only view the following columns
te_final = te_results.merge(te_null, on='opp', how='outer')
te_final = pd.DataFrame(data=te_final, columns=['player','team','DK_pt','opp_home','opp','DK_ptg'])
te_final.head()

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg
0,Travis Kelce,KAN,22.9,1.0,LVR,14.4
1,Noah Gray,KAN,7.1,1.0,LVR,14.4
2,Blake Bell,KAN,0.3,1.0,LVR,14.4
3,Hunter Henry,NWE,19.7,0.0,CLE,11.0
4,Mark Andrews,BAL,18.3,1.0,MIA,14.1


In [36]:
# now merge with the predictions on the common columns between them
te_final = te_final.merge(te_pred, on=['player','team','opp_home','opp'], how='outer')
te_final.head()

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
0,Travis Kelce,KAN,22.9,1.0,LVR,14.4,10.0,6.798875,7.217073,7.80829,6.350799
1,Noah Gray,KAN,7.1,1.0,LVR,14.4,10.0,6.798875,7.217073,7.80829,6.350799
2,Blake Bell,KAN,0.3,1.0,LVR,14.4,10.0,6.798875,7.217073,7.80829,6.350799
3,Hunter Henry,NWE,19.7,0.0,CLE,11.0,10.0,7.097031,6.534146,10.802133,5.901299
4,Mark Andrews,BAL,18.3,1.0,MIA,14.1,10.0,7.169572,6.914634,10.424217,6.545831


In [38]:
# view actual week 10 results with the model predictions
te_final.sort_values(by='DK_pt', ascending=False).head(20)

Unnamed: 0,player,team,DK_pt,opp_home,opp,DK_ptg,week,pass_preds_ridgeCV,pass_preds_knr_gs,pass_preds_rfr,pass_preds_rfr_gs
0,Travis Kelce,KAN,22.9,1.0,LVR,14.4,10.0,6.798875,7.217073,7.80829,6.350799
3,Hunter Henry,NWE,19.7,0.0,CLE,11.0,10.0,7.097031,6.534146,10.802133,5.901299
4,Mark Andrews,BAL,18.3,1.0,MIA,14.1,10.0,7.169572,6.914634,10.424217,6.545831
6,Tyler Conklin,MIN,16.1,1.0,LAC,17.1,10.0,6.101604,6.695122,7.070788,5.898455
8,Gerald Everett,SEA,14.3,1.0,GNB,13.2,10.0,5.685758,6.565854,5.386317,5.796982
10,Dan Arnold,JAX,13.7,1.0,IND,16.9,10.0,5.500504,6.734146,4.984205,5.773026
11,Austin Hooper,CLE,12.5,1.0,NWE,7.5,10.0,6.587542,6.531707,4.633262,5.86623
14,Noah Fant,DEN,10.9,0.0,PHI,19.6,10.0,8.061688,7.068293,8.702833,6.042574
15,Albert Okwuegbunam,DEN,10.7,0.0,PHI,19.6,10.0,8.061688,7.068293,8.702833,6.042574
16,Kyle Pitts,ATL,10.0,1.0,DAL,14.1,10.0,8.609348,7.482927,11.452226,8.006554


---
#### Looking at the top 10 TE points for week 10, we see that 3 of the top 5 null model defensive points allowed made the top 10 actual scores with 2 players against PHI (most allowed TE points). The top actual score was against a defense that allowed the 11th most points to TEs. Let's list out our top 5 by our top models and see where it ranked with actual TE point totals.

|KNNR GS||Random Forest||
|---|---|---|---|
|Pred Opp|Actual Opp|Pred Opp|Actual Opp|
|JAX - 7.7|20th - 6.1|ATL - 18.8|27th - 3.5| 
|DAL - 7.5|10th - 10.0|CLE - 11.0|2nd - 19.7|
|LVR - 7.2|1st - 22.9|TEN - 10.6|12th - 8.2|
|PHI - 7.1|8th - 10.9|MIA - 10.5|3rd - 18.3|
|WAS - 6.9|13th - 7.6|DAL - 10.3|10th - 10.0|

#### To be fair DEN utilized 2 TEs that totaled 21.6 points which would have been good for 2nd best and was 1 point greater than PHI's league best 19.6 points allowed to Tight Ends on average. Both models picked top scores with the KNNR GS picking the best performer 3rd and the Random Forest predicting the 2nd and 3rd best performers in its top 5 (2nd for 2nd and 3rd for 4th). The reality is the 15th best actual RB did play against BUF (1st against rush) and the 14th best actual RB did play against IND (3rd against rush).
---