In [68]:
import numpy as np
import pandas as pd

from joblib import dump, load

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

In [144]:
# Load the cleaned modelling table; the first CSV column becomes the row index.
source_csv = 'data_cleaned/modelSource.csv'
xl1 = pd.read_csv(source_csv, index_col=0)

In [145]:
# Account for the way WRs are set in slot, left, and right: WR ranks are merged
# three at a time (WR1-3 -> WR1, WR4-6 -> WR2, WR7-8 -> WR3), and RB6/RB7 are
# folded into RB5. Every other listed rank maps to itself.
# NOTE(review): Series.map with a dict yields NaN for any value that is not a
# key, and this cell is not idempotent -- a second run would remap the already
# collapsed WR2/WR3 labels again. Run it once per load of xl1.
unchanged_ranks = [
    'QB1', 'QB2', 'QB3', 'QB4', 'QB5',
    'RB1', 'RB2', 'RB3', 'RB4', 'RB5',
    'TE1', 'TE2', 'TE3', 'TE4', 'TE5',
    'PK1', 'PK2', 'PK3',
    'DF1',
]
merged_ranks = {
    'RB6': 'RB5', 'RB7': 'RB5',
    'WR1': 'WR1', 'WR2': 'WR1', 'WR3': 'WR1',
    'WR4': 'WR2', 'WR5': 'WR2', 'WR6': 'WR2',
    'WR7': 'WR3', 'WR8': 'WR3',
}

pos_rank_lookup = {rank: rank for rank in unchanged_ranks}
pos_rank_lookup.update(merged_ranks)

xl1['posRank'] = xl1['posRank'].map(pos_rank_lookup)

In [146]:
# Define features and labels
# The stat columns below are the labels (y). The features (X) are every
# remaining column of xl1 except the four identifier columns, which are kept
# separately in droppedCols.
target_cols = [
    'passA', 'passC', 'passY', 'passT', 'passI', 'pass2',
    'rushA', 'rushY', 'rushT', 'rush2',
    'recC', 'recY', 'recT', 'rec2',
    'fum',
    'XPA', 'XPM', 'FGA', 'FGM', 'FG50',
    'defSack', 'defI', 'defSaf', 'defFum', 'defBlk', 'defT',
    'defPtsAgainst', 'defPassYAgainst', 'defRushYAgainst', 'defYdsAgainst',
]
id_cols = ['season', 'team', 'player', 'opponent']

y = xl1[target_cols]
X = xl1.drop(columns=id_cols + target_cols)
droppedCols = xl1[id_cols]

In [147]:
# Encode categorical features
# One-hot encode 'pos' and 'posRank'; all other columns pass through unchanged.
# NOTE(review): X is overwritten, so re-running this cell without first
# rebuilding X fails because the two source columns no longer exist.
categorical_cols = ['pos', 'posRank']
X = pd.get_dummies(data=X, columns=categorical_cols)
X

Unnamed: 0,week,passA_curr,passC_curr,passY_curr,passT_curr,passI_curr,pass2_curr,rushA_curr,rushY_curr,rushT_curr,...,posRank_RB4,posRank_RB5,posRank_TE1,posRank_TE2,posRank_TE3,posRank_TE4,posRank_TE5,posRank_WR1,posRank_WR2,posRank_WR3
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57014,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
57015,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
57016,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
57017,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts


In [149]:
# Create a baseline prediction using the player's previous year statistics:
# each target is "predicted" by its matching "<target>_prior1" feature column.
colList = [f"{target}_prior1" for target in y.columns]
base_pred = X_test[colList]

r2 = r2_score(y_test, base_pred)

# Adjusted R2 uses the held-out row count and the feature count of X.
n_rows = len(y_test)
n_features = len(X.columns)
r2_adjusted = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1)
mae = metrics.mean_absolute_error(y_test, base_pred)
mse = metrics.mean_squared_error(y_test, base_pred)

print("R2: ", r2)
print("R2 Adjusted: ", r2_adjusted)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)

R2:  0.37475108889716563
R2 Adjusted:  0.36692298585458893
Mean Absolute Error (MAE): 1.892418935461194
Mean Squared Error (MSE): 146.1189186816664


In [22]:
# Random forest with 70 trees capped at depth 6, scored on the held-out rows.
regressor = RandomForestRegressor(n_estimators=70, max_depth=6, random_state=1).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)

# Adjusted R2 uses the held-out row count and the feature count of X.
n_rows = len(y_test)
n_features = len(X.columns)
r2_adjusted = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print("R2: ", r2)
print("R2 Adjusted: ", r2_adjusted)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)

R2:  0.40067978787163055
R2 Adjusted:  0.3927989001421771
Mean Absolute Error (MAE): 1.7013323837354124
Mean Squared Error (MSE): 91.76656762983988


In [37]:
# Random forest with 35 trees capped at depth 12, scored on the held-out rows.
regressor = RandomForestRegressor(n_estimators=35, max_depth=12, random_state=1).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)

# Adjusted R2 uses the held-out row count and the feature count of X.
n_rows = len(y_test)
n_features = len(X.columns)
r2_adjusted = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print("R2: ", r2)
print("R2 Adjusted: ", r2_adjusted)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)

R2:  0.49592972158374954
R2 Adjusted:  0.48930134297818706
Mean Absolute Error (MAE): 1.499207436581209
Mean Squared Error (MSE): 86.51795773782285


In [38]:
# Random forest with 150 trees capped at depth 12, scored on the held-out rows.
regressor = RandomForestRegressor(n_estimators=150, max_depth=12, random_state=1).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)

# Adjusted R2 uses the held-out row count and the feature count of X.
n_rows = len(y_test)
n_features = len(X.columns)
r2_adjusted = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print("R2: ", r2)
print("R2 Adjusted: ", r2_adjusted)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)

R2:  0.49774126596261586
R2 Adjusted:  0.491136708642533
Mean Absolute Error (MAE): 1.4937951194156518
Mean Squared Error (MSE): 85.76441487675659


In [39]:
# Randomised hyper-parameter search over the random forest settings below.
# RandomizedSearchCV is imported in the notebook's import cell.
# NOTE(review): 100 candidates x 5 folds is expensive on this data; the
# recorded run of this cell was interrupted before it finished.

n_estimators = [5, 20, 50, 100]  # number of trees in the random forest
# Number of features in consideration at every split. 1.0 means "all
# features", which is what the 'auto' alias meant for regressors; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, so it is spelled out here.
max_features = [1.0, 'sqrt']
max_depth = list(range(10, 121, 10))  # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10]  # minimum sample number to split a node
min_samples_leaf = [1, 3, 4]  # minimum sample number that can be stored in a leaf node
bootstrap = [True, False]  # method used to sample data points

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

# Use a fresh, explicitly configured estimator so the search does not depend on
# whichever earlier cell last assigned `regressor`.
rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=35,
    n_jobs=-1,
)

rf_random.fit(X_train, y_train)

print ('Random grid: ', random_grid, '\n')
# print the best parameters
print ('Best Parameters: ', rf_random.best_params_, ' \n')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=20; total time=   4.6s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=20; total time=   4.6s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=20; total time=   4.6s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=20; total time=   4.7s
[CV] END bootstrap=True, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=5; total time=   2.7s
[CV] END bootstrap=True, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=5; total time=   2.7s
[CV] END bootstrap=True, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=5; total t

KeyboardInterrupt: 

In [41]:
# Refit the 70-tree, depth-6 forest and display its score on the training rows.
regressor = RandomForestRegressor(n_estimators=70, max_depth=6, random_state=1).fit(X_train, y_train)
y_pred = regressor.predict(X_test)
train_score = regressor.score(X_train, y_train)
train_score

0.4089796365804355

In [52]:
# 100 trees, depth capped at 6: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.4088380661692439
0.40051586384672755


In [53]:
# 100 trees, depth capped at 12: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=12, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5916580680003702
0.49757878849233045


In [None]:
# 100 trees, depth capped at 24: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=24, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.8865353555100816
0.4910426854318589


In [54]:
# 100 trees, depth capped at 18: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=18, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.77360914727663
0.49488156269258476


In [55]:
# 100 trees, depth capped at 15: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.6824089618202811
0.4968218651007954


In [56]:
# 100 trees, depth capped at 9: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=9, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5275459363914689
0.49478753712926177


In [57]:
# 100 trees, depth capped at 11: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=11, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5665687680914301
0.49751379830198234


In [58]:
# 100 trees, depth capped at 13: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=13, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.620285802616434
0.49765475817884824


In [59]:
# 100 trees, depth capped at 14: print the training score, then the test score.
regressor = RandomForestRegressor(n_estimators=100, max_depth=14, random_state=1).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.6510269770332092
0.49765596105828325


In [60]:
# Depth-14, 100-tree forest with min_samples_split=4: print the training
# score, then the test score.
regressor = RandomForestRegressor(
    min_samples_split=4, n_estimators=100, max_depth=14, random_state=1
).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.6120794780648856
0.4981907287531276


In [61]:
# Depth-14, 100-tree forest with min_samples_split=8: print the training
# score, then the test score.
regressor = RandomForestRegressor(
    min_samples_split=8, n_estimators=100, max_depth=14, random_state=1
).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5844140635195508
0.498332679074216


In [62]:
# Depth-14, 100-tree forest with min_samples_split=16: print the training
# score, then the test score.
regressor = RandomForestRegressor(
    min_samples_split=16, n_estimators=100, max_depth=14, random_state=1
).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5635026875044423
0.4985052341323545


In [63]:
# Depth-14, 100-tree forest with min_samples_split=32: print the training
# score, then the test score.
regressor = RandomForestRegressor(
    min_samples_split=32, n_estimators=100, max_depth=14, random_state=1
).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5468748943670695
0.4986089431535233


In [64]:
# Depth-14, 100-tree forest with min_samples_split=64: print the training
# score, then the test score.
regressor = RandomForestRegressor(
    min_samples_split=64, n_estimators=100, max_depth=14, random_state=1
).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5331422236336385
0.49868539614909063


In [65]:
# Same depth-14 / min_samples_split=64 forest but with bootstrap sampling
# turned off: print the training score, then the test score.
regressor = RandomForestRegressor(
    bootstrap=False, min_samples_split=64, n_estimators=100, max_depth=14, random_state=1
).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.5263367168348195
0.47409286087531943


In [150]:
# Best model: 100 trees, depth capped at 14, min_samples_split=64, default
# bootstrap. Prints the training score, then the test score.
# NOTE(review): this configuration appears to have been picked by comparing
# scores on X_test in the tuning cells, so the test score printed here is
# probably optimistic -- confirm on data the tuning never saw.
regressor = RandomForestRegressor(
    min_samples_split=64, n_estimators=100, max_depth=14, random_state=1
).fit(X_train, y_train)

for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, targets))


0.528788165968977
0.4937007411714876


In [151]:
# Serialise the fitted forest to disk with joblib. The call is left as the last
# expression so its return value (the list of files written) is the cell output.
model_path = '20220818_randomforest_model.joblib'
dump(regressor, model_path)

['20220818_randomforest_model.joblib']

In [153]:
# List every feature column of X, one per line. A plain loop is used instead of
# a list comprehension so the cell does not also echo a list of None values
# (one per column) as its output.
for column_name in X.columns:
    print(column_name)

week
passA_curr
passC_curr
passY_curr
passT_curr
passI_curr
pass2_curr
rushA_curr
rushY_curr
rushT_curr
rush2_curr
recC_curr
recY_curr
recT_curr
rec2_curr
fum_curr
XPA_curr
XPM_curr
FGA_curr
FGM_curr
FG50_curr
defSack_curr
defI_curr
defSaf_curr
defFum_curr
defBlk_curr
defT_curr
defPtsAgainst_curr
defPassYAgainst_curr
defRushYAgainst_curr
defYdsAgainst_curr
gamesPlayed_curr
gamesPlayed_prior1
passA_prior1
passC_prior1
passY_prior1
passT_prior1
passI_prior1
pass2_prior1
rushA_prior1
rushY_prior1
rushT_prior1
rush2_prior1
recC_prior1
recY_prior1
recT_prior1
rec2_prior1
fum_prior1
XPA_prior1
XPM_prior1
FGA_prior1
FGM_prior1
FG50_prior1
defSack_prior1
defI_prior1
defSaf_prior1
defFum_prior1
defBlk_prior1
defT_prior1
defPtsAgainst_prior1
defPassYAgainst_prior1
defRushYAgainst_prior1
defYdsAgainst_prior1
gamesPlayed_prior2
passA_prior2
passC_prior2
passY_prior2
passT_prior2
passI_prior2
pass2_prior2
rushA_prior2
rushY_prior2
rushT_prior2
rush2_prior2
recC_prior2
recY_prior2
recT_prior2
rec2_p

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]