## Imports
---

In [253]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_val_predict

#### Load in the game data for every matchup in weeks 1 through 9 in order to make predictions about team scoring (passing and rushing) for week 10.
---

In [219]:
# read in data set
game = pd.read_csv('../data/team_game.csv', index_col=[0])

In [185]:
game

Unnamed: 0,team,opp_home,opp,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,...,third_cmp,third_pct,fourth_att,fourth_cmp,fourth_pct,off_result,off_score,opp_score,pass_pts,rush_pts
0,KAN,1.0,PHI,4,3,0.0,24,30,80.0,271,...,9,0.900,0,0,0.000,1,42,30,29.84,26.0
1,NOR,0.0,GNB,1,4,0.0,15,21,71.4,151,...,5,0.500,2,2,1.000,1,38,3,26.04,17.1
2,TAM,0.0,ATL,2,4,0.0,24,36,66.7,259,...,4,0.333,2,1,0.500,1,48,25,30.36,8.2
3,TAM,0.0,MIA,5,3,0.0,33,44,75.0,437,...,8,0.727,0,0,0.000,1,45,17,37.48,18.1
4,ATL,0.0,WAS,4,3,0.0,25,42,59.5,275,...,10,0.625,0,0,0.000,0,30,34,27.00,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,LVR,0.0,CHI,5,4,0.0,22,35,62.9,188,...,5,0.357,3,1,0.333,0,9,20,6.52,13.1
268,SFO,1.0,ARI,5,4,0.0,15,29,51.7,186,...,3,0.273,5,1,0.200,0,10,17,6.44,21.2
269,SFO,1.0,CHI,8,3,0.0,17,28,60.7,322,...,4,0.400,1,1,1.000,1,33,22,12.88,32.5
270,TAM,1.0,NWE,4,5,0.0,22,43,51.2,261,...,9,0.474,0,0,0.000,1,19,17,10.44,18.0


#### Let's try modeling the data without considering the team or opponent
---

In [186]:
# First, let's predict the amount of passing points
X = game.drop(columns=['team','opp_home','opp','pass_pts'])
y = game['pass_pts']

In [187]:
X.shape, y.shape

((272, 35), (272,))

In [188]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [189]:
# set up linear regression, fit, and score the data
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(1.0, 1.0)

#### These scores are perfect, so the linear model can easily predict the correct amount of passing points

In [190]:
# set up Random Forest Regression model, fit and score the data
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.9948699983728918, 0.9711038496292641)

#### These scores are still extremely good and perhaps more realistic. Let's look at both against the true values.

In [191]:
# add new columns to the data frame with our LinReg and Random Forest predictions
game['pass_preds_lr'] = lr.predict(X)
game['pass_preds_rfr'] = rfr.predict(X)
game.sort_values(by='pass_pts', ascending=False)

Unnamed: 0,team,opp_home,opp,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,...,fourth_att,fourth_cmp,fourth_pct,off_result,off_score,opp_score,pass_pts,rush_pts,pass_preds_lr,pass_preds_rfr
3,TAM,0.0,MIA,5,3,0.0,33,44,75.0,437,...,0,0,0.000,1,45,17,37.48,18.1,37.48,35.1276
16,BAL,0.0,IND,5,6,1.0,37,43,86.0,437,...,0,0,0.000,1,31,25,33.48,8.6,33.48,29.4784
18,LAC,0.0,CLE,5,4,0.0,26,43,60.5,381,...,3,3,1.000,1,47,42,31.24,29.2,31.24,29.2012
13,NYJ,1.0,IND,9,1,0.0,34,52,65.4,398,...,1,1,1.000,0,30,45,30.92,8.8,30.92,30.3644
2,TAM,0.0,ATL,2,4,0.0,24,36,66.7,259,...,2,1,0.500,1,48,25,30.36,8.2,30.36,27.9148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,NYJ,0.0,NWE,2,3,0.0,19,33,57.6,184,...,3,1,0.333,0,6,25,3.36,15.2,3.36,4.2060
260,NYJ,1.0,DEN,3,4,0.0,19,35,54.3,119,...,2,0,0.000,0,0,26,2.76,4.3,2.76,2.9656
254,NOR,1.0,CAR,2,3,0.0,11,22,50.0,80,...,2,0,0.000,0,7,26,1.20,10.8,1.20,2.1184
227,CHI,1.0,CLE,3,3,0.0,6,20,30.0,1,...,1,0,0.000,0,6,26,0.04,4.6,0.04,2.6112


In [192]:
# looking at just passing points and predictions
print(game[['pass_pts','pass_preds_lr','pass_preds_rfr']])

     pass_pts  pass_preds_lr  pass_preds_rfr
0       29.84          29.84         28.7976
1       26.04          26.04         25.6988
2       30.36          30.36         27.9148
3       37.48          37.48         35.1276
4       27.00          27.00         26.7068
..        ...            ...             ...
267      6.52           6.52          6.4804
268      6.44           6.44          6.5176
269     12.88          12.88         11.3472
270     10.44          10.44         10.5528
271      7.68           7.68          8.3868

[272 rows x 3 columns]


#### Now let's predict the rushing points

In [193]:
# Next, let's predict the amount of rushing points
X = game.drop(columns=['team','opp_home','opp','rush_pts','pass_preds_lr','pass_preds_rfr'])
y = game['rush_pts']

In [194]:
X.shape, y.shape

((272, 35), (272,))

In [195]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [196]:
# set up linear regression, fit, and score the data
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(1.0, 1.0)

#### These scores are perfect, so the linear model can easily predict the correct amount of rushing points

In [197]:
# set up Random Forest Regression model, fit and score the data
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.9963956518850525, 0.9766160432533433)

#### Again, these scores are still extremely good and offer a slight improvement above Random Forest predictions for the models that did not consider teams. Let's look at both against the true values.

In [198]:
# add new columns to the data frame with our LinReg and Random Forest predictions
game['rush_preds_lr'] = lr.predict(X)
game['rush_preds_rfr'] = rfr.predict(X)
game.sort_values(by='rush_pts', ascending=False)

Unnamed: 0,team,opp_home,opp,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,...,fourth_pct,off_result,off_score,opp_score,pass_pts,rush_pts,pass_preds_lr,pass_preds_rfr,rush_preds_lr,rush_preds_rfr
266,PHI,1.0,DET,8,3,0.0,11,16,68.8,114,...,0.000,1,44,6,4.56,47.6,4.56,5.5768,47.6,43.617
29,IND,0.0,NYJ,9,1,0.0,22,30,73.3,272,...,0.500,1,45,30,22.88,44.0,22.88,23.6520,44.0,42.328
202,BAL,0.0,KAN,2,5,0.0,18,26,69.2,230,...,1.000,1,36,35,11.20,43.1,11.20,11.6564,43.1,41.883
73,CLE,1.0,LAC,5,4,0.0,23,32,71.9,301,...,0.333,0,42,47,20.04,41.0,20.04,20.0380,41.0,40.454
231,CLE,1.0,KAN,1,4,0.0,21,28,75.0,304,...,0.750,0,29,33,11.16,39.3,11.16,11.1604,39.3,37.337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,SFO,0.0,ARI,9,4,0.0,28,40,70.0,298,...,0.000,0,17,31,18.92,3.9,18.92,19.0088,3.9,3.909
101,MIA,1.0,TAM,5,3,0.0,27,39,69.2,262,...,0.000,0,17,45,17.48,3.9,17.48,16.3504,3.9,3.521
100,MIA,0.0,IND,4,3,0.0,20,30,66.7,168,...,1.000,0,17,27,14.72,3.5,14.72,14.4952,3.5,3.708
209,LAC,1.0,BAL,6,3,0.0,22,39,56.4,182,...,0.250,0,6,34,10.28,2.6,10.28,10.0112,2.6,4.023


In [199]:
# looking at just the passing and rushing points with their predictions
print(game[['pass_pts','pass_preds_lr','pass_preds_rfr']])
print(game[['rush_pts','rush_preds_lr','rush_preds_rfr']])

     pass_pts  pass_preds_lr  pass_preds_rfr
0       29.84          29.84         28.7976
1       26.04          26.04         25.6988
2       30.36          30.36         27.9148
3       37.48          37.48         35.1276
4       27.00          27.00         26.7068
..        ...            ...             ...
267      6.52           6.52          6.4804
268      6.44           6.44          6.5176
269     12.88          12.88         11.3472
270     10.44          10.44         10.5528
271      7.68           7.68          8.3868

[272 rows x 3 columns]
     rush_pts  rush_preds_lr  rush_preds_rfr
0        26.0           26.0          25.789
1        17.1           17.1          17.603
2         8.2            8.2           8.150
3        18.1           18.1          18.081
4         9.9            9.9           9.908
..        ...            ...             ...
267      13.1           13.1          13.172
268      21.2           21.2          21.242
269      32.5           32.5   

---
#### What did we actually accomplish? Without the use of knowing what teams are playing, we cannot make any predictions against future games. We need to add the teams in each matchup as features and rerun the model.
---

In [200]:
# reload the data set
game = pd.read_csv('../data/team_game.csv', index_col=[0])

In [201]:
game

Unnamed: 0,team,opp_home,opp,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,...,third_cmp,third_pct,fourth_att,fourth_cmp,fourth_pct,off_result,off_score,opp_score,pass_pts,rush_pts
0,KAN,1.0,PHI,4,3,0.0,24,30,80.0,271,...,9,0.900,0,0,0.000,1,42,30,29.84,26.0
1,NOR,0.0,GNB,1,4,0.0,15,21,71.4,151,...,5,0.500,2,2,1.000,1,38,3,26.04,17.1
2,TAM,0.0,ATL,2,4,0.0,24,36,66.7,259,...,4,0.333,2,1,0.500,1,48,25,30.36,8.2
3,TAM,0.0,MIA,5,3,0.0,33,44,75.0,437,...,8,0.727,0,0,0.000,1,45,17,37.48,18.1
4,ATL,0.0,WAS,4,3,0.0,25,42,59.5,275,...,10,0.625,0,0,0.000,0,30,34,27.00,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,LVR,0.0,CHI,5,4,0.0,22,35,62.9,188,...,5,0.357,3,1,0.333,0,9,20,6.52,13.1
268,SFO,1.0,ARI,5,4,0.0,15,29,51.7,186,...,3,0.273,5,1,0.200,0,10,17,6.44,21.2
269,SFO,1.0,CHI,8,3,0.0,17,28,60.7,322,...,4,0.400,1,1,1.000,1,33,22,12.88,32.5
270,TAM,1.0,NWE,4,5,0.0,22,43,51.2,261,...,9,0.474,0,0,0.000,1,19,17,10.44,18.0


In [202]:
# dummify the opponent list of teams
game = pd.get_dummies(game, columns=['opp'], drop_first=True)
game

Unnamed: 0,team,opp_home,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,pass_td,...,opp_NWE,opp_NYG,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,KAN,1.0,4,3,0.0,24,30,80.0,271,5,...,0,0,0,1,0,0,0,0,0,0
1,NOR,0.0,1,4,0.0,15,21,71.4,151,5,...,0,0,0,0,0,0,0,0,0,0
2,TAM,0.0,2,4,0.0,24,36,66.7,259,5,...,0,0,0,0,0,0,0,0,0,0
3,TAM,0.0,5,3,0.0,33,44,75.0,437,5,...,0,0,0,0,0,0,0,0,0,0
4,ATL,0.0,4,3,0.0,25,42,59.5,275,4,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,LVR,0.0,5,4,0.0,22,35,62.9,188,0,...,0,0,0,0,0,0,0,0,0,0
268,SFO,1.0,5,4,0.0,15,29,51.7,186,0,...,0,0,0,0,0,0,0,0,0,0
269,SFO,1.0,8,3,0.0,17,28,60.7,322,0,...,0,0,0,0,0,0,0,0,0,0
270,TAM,1.0,4,5,0.0,22,43,51.2,261,0,...,1,0,0,0,0,0,0,0,0,0


In [203]:
# dummify the offensive list of teams
game = pd.get_dummies(game, columns=['team'], drop_first=True)
game

Unnamed: 0,opp_home,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,pass_td,pass_int,...,team_NWE,team_NYG,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,1.0,4,3,0.0,24,30,80.0,271,5,1,...,0,0,0,0,0,0,0,0,0,0
1,0.0,1,4,0.0,15,21,71.4,151,5,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,2,4,0.0,24,36,66.7,259,5,0,...,0,0,0,0,0,0,0,1,0,0
3,0.0,5,3,0.0,33,44,75.0,437,5,0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,4,3,0.0,25,42,59.5,275,4,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,0.0,5,4,0.0,22,35,62.9,188,0,1,...,0,0,0,0,0,0,0,0,0,0
268,1.0,5,4,0.0,15,29,51.7,186,0,1,...,0,0,0,0,0,0,1,0,0,0
269,1.0,8,3,0.0,17,28,60.7,322,0,0,...,0,0,0,0,0,0,1,0,0,0
270,1.0,4,5,0.0,22,43,51.2,261,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Ok, so now all team information for each game is considered. Let's model!
---

In [204]:
# First, let's predict the amount of passing points
X = game.drop(columns=['pass_pts'])
y = game['pass_pts']

In [205]:
X.shape, y.shape

((272, 98), (272,))

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [207]:
# set up linear regression, fit, and score the data
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(1.0, 1.0)

#### Well... these scores are still perfect, so the linear model can easily predict the correct amount of passing points when considering all features in the data set

In [208]:
# set up Random Forest Regression model, fit and score the data
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.9945701337291145, 0.9692831695322658)

#### Compared to the non-dummy data (0.9949, 0.9711) these are nearly the same. Let's look at both against the true values.

In [209]:
# add new columns to the data frame with our LinReg and Random Forest predictions
game['pass_preds_lr'] = lr.predict(X)
game['pass_preds_rfr'] = rfr.predict(X)
game.sort_values(by='pass_pts', ascending=False)

Unnamed: 0,opp_home,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,pass_td,pass_int,...,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS,pass_preds_lr,pass_preds_rfr
3,0.0,5,3,0.0,33,44,75.0,437,5,0,...,0,0,0,0,0,1,0,0,37.48,34.5408
16,0.0,5,6,1.0,37,43,86.0,437,4,0,...,0,0,0,0,0,0,0,0,33.48,29.3380
18,0.0,5,4,0.0,26,43,60.5,381,4,0,...,0,0,0,0,0,0,0,0,31.24,28.6124
13,1.0,9,1,0.0,34,52,65.4,398,4,1,...,1,0,0,0,0,0,0,0,30.92,29.7564
2,0.0,2,4,0.0,24,36,66.7,259,5,0,...,0,0,0,0,0,1,0,0,30.36,28.1280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,0.0,2,3,0.0,19,33,57.6,184,0,4,...,1,0,0,0,0,0,0,0,3.36,3.9880
260,1.0,3,4,0.0,19,35,54.3,119,0,2,...,1,0,0,0,0,0,0,0,2.76,3.1716
254,1.0,2,3,0.0,11,22,50.0,80,0,2,...,0,0,0,0,0,0,0,0,1.20,2.5924
227,1.0,3,3,0.0,6,20,30.0,1,0,0,...,0,0,0,0,0,0,0,0,0.04,2.8284


In [210]:
# looking at just passing points and predictions
print(game[['pass_pts','pass_preds_lr','pass_preds_rfr']])

     pass_pts  pass_preds_lr  pass_preds_rfr
0       29.84          29.84         28.9032
1       26.04          26.04         25.6980
2       30.36          30.36         28.1280
3       37.48          37.48         34.5408
4       27.00          27.00         26.5320
..        ...            ...             ...
267      6.52           6.52          6.5952
268      6.44           6.44          6.5600
269     12.88          12.88         11.7088
270     10.44          10.44         10.5376
271      7.68           7.68          8.6004

[272 rows x 3 columns]


#### Now let's predict the rushing points

In [212]:
# Next, let's predict the amount of rushing points
X = game.drop(columns=['rush_pts','pass_preds_lr','pass_preds_rfr'])
y = game['rush_pts']

In [213]:
X.shape, y.shape

((272, 98), (272,))

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [215]:
# set up linear regression, fit, and score the data
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(1.0, 1.0)

#### These scores are perfect, so the linear model can easily predict the correct amount of rushing points

In [216]:
# set up Random Forest Regression model, fit and score the data
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.9951964456837212, 0.9704184626521566)

#### Compared to the non-dummy data (0.9964, 0.9766) these are nearly the same. Let's look at both against the true values.

In [217]:
# add new columns to the data frame with our LinReg and Random Forest predictions
game['rush_preds_lr'] = lr.predict(X)
game['rush_preds_rfr'] = rfr.predict(X)
game.sort_values(by='rush_pts', ascending=False)

Unnamed: 0,opp_home,week,slot,ot,pass_cmp,pass_att,pass_cmp_pct,pass_yds,pass_td,pass_int,...,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS,pass_preds_lr,pass_preds_rfr,rush_preds_lr,rush_preds_rfr
266,1.0,8,3,0.0,11,16,68.8,114,0,0,...,0,0,0,0,0,0,4.56,5.4608,47.6,43.190
29,0.0,9,1,0.0,22,30,73.3,272,3,0,...,0,0,0,0,0,0,22.88,23.1252,44.0,40.928
202,0.0,2,5,0.0,18,26,69.2,230,1,2,...,0,0,0,0,0,0,11.20,11.5524,43.1,41.515
73,1.0,5,4,0.0,23,32,71.9,301,2,0,...,0,0,0,0,0,0,20.04,20.0516,41.0,39.622
231,1.0,1,4,0.0,21,28,75.0,304,0,1,...,0,0,0,0,0,0,11.16,11.5016,39.3,37.706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,0.0,9,4,0.0,28,40,70.0,298,2,1,...,0,0,1,0,0,0,18.92,18.9824,3.9,4.010
101,1.0,5,3,0.0,27,39,69.2,262,2,1,...,0,0,0,0,0,0,17.48,16.5552,3.9,3.297
100,0.0,4,3,0.0,20,30,66.7,168,2,0,...,0,0,0,0,0,0,14.72,14.4484,3.5,3.740
209,1.0,6,3,0.0,22,39,56.4,182,1,1,...,0,0,0,0,0,0,10.28,9.8896,2.6,4.092


In [218]:
# looking at just the passing and rushing points with their predictions
print(game[['pass_pts','pass_preds_lr','pass_preds_rfr']])
print(game[['rush_pts','rush_preds_lr','rush_preds_rfr']])

     pass_pts  pass_preds_lr  pass_preds_rfr
0       29.84          29.84         28.9032
1       26.04          26.04         25.6980
2       30.36          30.36         28.1280
3       37.48          37.48         34.5408
4       27.00          27.00         26.5320
..        ...            ...             ...
267      6.52           6.52          6.5952
268      6.44           6.44          6.5600
269     12.88          12.88         11.7088
270     10.44          10.44         10.5376
271      7.68           7.68          8.6004

[272 rows x 3 columns]
     rush_pts  rush_preds_lr  rush_preds_rfr
0        26.0           26.0          25.645
1        17.1           17.1          17.450
2         8.2            8.2           8.161
3        18.1           18.1          18.101
4         9.9            9.9           9.949
..        ...            ...             ...
267      13.1           13.1          13.097
268      21.2           21.2          21.280
269      32.5           32.5   

---
#### At this point the models do well against against familiar data, but we can't use any of these models on future data because we won't know any of the values of the features used in this data set to predict the amount of points scored by passing or rushing. In order to predict points scored by passing and rushing we can only use features that are available to us prior to the games being played in week 10.
---

In [320]:
# reload the data set and only use features which we can predict against for week 10
game = pd.read_csv('../data/team_game.csv', index_col=[0])

In [321]:
# shrink down the data set before creating dummy columns
game = game[['team','opp_home','opp','week','slot','pass_pts','rush_pts']]
game

Unnamed: 0,team,opp_home,opp,week,slot,pass_pts,rush_pts
0,KAN,1.0,PHI,4,3,29.84,26.0
1,NOR,0.0,GNB,1,4,26.04,17.1
2,TAM,0.0,ATL,2,4,30.36,8.2
3,TAM,0.0,MIA,5,3,37.48,18.1
4,ATL,0.0,WAS,4,3,27.00,9.9
...,...,...,...,...,...,...,...
267,LVR,0.0,CHI,5,4,6.52,13.1
268,SFO,1.0,ARI,5,4,6.44,21.2
269,SFO,1.0,CHI,8,3,12.88,32.5
270,TAM,1.0,NWE,4,5,10.44,18.0


In [322]:
# dummify the opponent list of teams
game = pd.get_dummies(game, columns=['opp'], drop_first=True)
game

Unnamed: 0,team,opp_home,week,slot,pass_pts,rush_pts,opp_ATL,opp_BAL,opp_BUF,opp_CAR,...,opp_NWE,opp_NYG,opp_NYJ,opp_PHI,opp_PIT,opp_SEA,opp_SFO,opp_TAM,opp_TEN,opp_WAS
0,KAN,1.0,4,3,29.84,26.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,NOR,0.0,1,4,26.04,17.1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TAM,0.0,2,4,30.36,8.2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TAM,0.0,5,3,37.48,18.1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ATL,0.0,4,3,27.00,9.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,LVR,0.0,5,4,6.52,13.1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
268,SFO,1.0,5,4,6.44,21.2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
269,SFO,1.0,8,3,12.88,32.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
270,TAM,1.0,4,5,10.44,18.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [323]:
# dummify the offensive list of teams
game = pd.get_dummies(game, columns=['team'], drop_first=True)
game

Unnamed: 0,opp_home,week,slot,pass_pts,rush_pts,opp_ATL,opp_BAL,opp_BUF,opp_CAR,opp_CHI,...,team_NWE,team_NYG,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS
0,1.0,4,3,29.84,26.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,1,4,26.04,17.1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,2,4,30.36,8.2,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.0,5,3,37.48,18.1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,4,3,27.00,9.9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,0.0,5,4,6.52,13.1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
268,1.0,5,4,6.44,21.2,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
269,1.0,8,3,12.88,32.5,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
270,1.0,4,5,10.44,18.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Ok, so now we are only consider the matchup information for each game. Let's model!
---

In [324]:
# First, let's predict the amount of passing points
X = game.drop(columns=['pass_pts'])
y = game['pass_pts']

In [325]:
X.shape, y.shape

((272, 66), (272,))

In [326]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [327]:
# set up linear regression, fit, and score the data
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.4626442720632503, -0.47446773230695727)

#### Well... these scores are kinda the opposite of perfect. The data is extremely overfit based on the terrible testing score.

In [328]:
# set up Random Forest Regression model, fit and score the data
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.8659663351689698, 0.05440138505767533)

#### The Random Forest handles the model much better on the training data but is still supremely overfit based on the poor testing score.

In [329]:
# add new columns to the data frame with our LinReg and Random Forest predictions
game['pass_preds_lr'] = lr.predict(X)
game['pass_preds_rfr'] = rfr.predict(X)
game.sort_values(by='pass_pts', ascending=False)

Unnamed: 0,opp_home,week,slot,pass_pts,rush_pts,opp_ATL,opp_BAL,opp_BUF,opp_CAR,opp_CHI,...,team_NYJ,team_PHI,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS,pass_preds_lr,pass_preds_rfr
3,0.0,5,3,37.48,18.1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,28.462786,33.0308
16,0.0,5,6,33.48,8.6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,13.576774,13.3996
18,0.0,5,4,31.24,29.2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,17.051340,15.6716
13,1.0,9,1,30.92,8.8,0,0,0,0,0,...,1,0,0,0,0,0,0,0,16.262576,23.2228
2,0.0,2,4,30.36,8.2,1,0,0,0,0,...,0,0,0,0,0,1,0,0,28.924387,27.3552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,0.0,2,3,3.36,15.2,0,0,0,0,0,...,1,0,0,0,0,0,0,0,9.656323,8.8632
260,1.0,3,4,2.76,4.3,0,0,0,0,0,...,1,0,0,0,0,0,0,0,9.636458,6.2656
254,1.0,2,3,1.20,10.8,0,0,0,1,0,...,0,0,0,0,0,0,0,0,8.157484,6.0992
227,1.0,3,3,0.04,4.6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8.909525,9.1692


In [330]:
# looking at just passing points and predictions
print(game[['pass_pts','pass_preds_lr','pass_preds_rfr']])

     pass_pts  pass_preds_lr  pass_preds_rfr
0       29.84      19.525550         24.6640
1       26.04      15.137735         22.7644
2       30.36      28.924387         27.3552
3       37.48      28.462786         33.0308
4       27.00      21.219335         24.1656
..        ...            ...             ...
267      6.52      20.946536         19.4620
268      6.44      13.955630          9.4576
269     12.88      16.660505         12.8852
270     10.44      21.119415         16.2072
271      7.68      12.693196          9.9276

[272 rows x 3 columns]


#### The models have a hard time making accurate predictions on data it has seen let alone unseen week 10 data. Let's check the rushing data predictions.

In [331]:
# Next, let's predict the amount of rushing points
X = game.drop(columns=['rush_pts','pass_preds_lr','pass_preds_rfr'])
y = game['rush_pts']

In [332]:
X.shape, y.shape

((272, 66), (272,))

In [333]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [334]:
# set up linear regression, fit, and score the data
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.4567707434841428, -0.6379892565510219)

#### These scores are similar to the passing modeling and a bit worse on the testing data (0.4626, -0.4744).

In [335]:
# set up Random Forest Regression model, fit and score the data
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_train, y_train), rfr.score(X_test, y_test)

(0.8530839107373711, -0.03917425133165042)

#### Again, the scores for Random Forest are also slightly worse than for the passing model (0.8659, 0.0544). Let's look at both against the true values.

In [336]:
# add new columns to the data frame with our LinReg and Random Forest predictions
game['rush_preds_lr'] = lr.predict(X)
game['rush_preds_rfr'] = rfr.predict(X)
game.sort_values(by='rush_pts', ascending=False)

Unnamed: 0,opp_home,week,slot,pass_pts,rush_pts,opp_ATL,opp_BAL,opp_BUF,opp_CAR,opp_CHI,...,team_PIT,team_SEA,team_SFO,team_TAM,team_TEN,team_WAS,pass_preds_lr,pass_preds_rfr,rush_preds_lr,rush_preds_rfr
266,1.0,8,3,4.56,47.6,0,0,0,0,0,...,0,0,0,0,0,0,9.730880,8.6204,29.430654,37.295
29,0.0,9,1,22.88,44.0,0,0,0,0,0,...,0,0,0,0,0,0,18.092542,20.3592,27.814836,32.431
202,0.0,2,5,11.20,43.1,0,0,0,0,0,...,0,0,0,0,0,0,12.125959,13.3496,24.532053,32.852
73,1.0,5,4,20.04,41.0,0,0,0,0,0,...,0,0,0,0,0,0,5.055215,11.8748,30.222940,21.389
231,1.0,1,4,11.16,39.3,0,0,0,0,0,...,0,0,0,0,0,0,11.992223,13.1360,26.165817,32.841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,0.0,9,4,18.92,3.9,0,0,0,0,0,...,0,0,1,0,0,0,15.697071,15.9004,13.416340,8.309
101,1.0,5,3,17.48,3.9,0,0,0,0,0,...,0,0,0,0,0,0,16.652115,15.3968,3.760402,10.979
100,0.0,4,3,14.72,3.5,0,0,0,0,0,...,0,0,0,0,0,0,16.092467,14.4860,4.128556,6.609
209,1.0,6,3,10.28,2.6,0,1,0,0,0,...,0,0,0,0,0,0,25.784086,19.5004,24.395074,14.624


In [337]:
# looking at just the passing and rushing points with their predictions
print(game[['pass_pts','pass_preds_lr','pass_preds_rfr']])
print(game[['rush_pts','rush_preds_lr','rush_preds_rfr']])

     pass_pts  pass_preds_lr  pass_preds_rfr
0       29.84      19.525550         24.6640
1       26.04      15.137735         22.7644
2       30.36      28.924387         27.3552
3       37.48      28.462786         33.0308
4       27.00      21.219335         24.1656
..        ...            ...             ...
267      6.52      20.946536         19.4620
268      6.44      13.955630          9.4576
269     12.88      16.660505         12.8852
270     10.44      21.119415         16.2072
271      7.68      12.693196          9.9276

[272 rows x 3 columns]
     rush_pts  rush_preds_lr  rush_preds_rfr
0        26.0      16.030684          21.060
1        17.1      15.703684          16.052
2         8.2      14.403928          11.717
3        18.1      18.881559          16.671
4         9.9      11.275007          11.794
..        ...            ...             ...
267      13.1      13.860225          14.492
268      21.2      15.449295          18.692
269      32.5      20.633686   

---
#### These predictions are very poor against seen data. At this point, it would be best to move on to predicting player data which is our main goal. We will apply week 10 predictions against player data.
---