# Random Forest Models ~6 Minute Run

In [1]:
# load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy.signal import savgol_filter
import statsmodels.api as sm
import pymc3 as pm
import statsmodels.api as sm
from statsmodels.tools import add_constant
from itertools import combinations
# settings for seaborn plotting style
sns.set(color_codes=True)
# settings for seaborn plot sizes
sns.set(rc={'figure.figsize':(12,6)})
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [2]:
# Load data
df = pd.read_pickle('nfl_df_averages.pkl')

# remove home_homeAvg and home_awayAvg
df = df.drop(['home_homeAvg', 'home_awayAvg'], axis=1)

In [3]:
# create an X dataframe that is all columns from df with 'Avg' in it
X = df[[col for col in df.columns if 'Avg' in col]]

# create a y dataframe that is the 'winner' column
y = df['winner']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=621)

In [4]:
# create a random forest classifier
rf = RandomForestClassifier(n_estimators=200, random_state=621)

# fit the model to the training data
rf.fit(X_train, y_train)

# predict the labels of the test set
y_pred = rf.predict(X_test)

# calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6156716417910447

In [5]:
# perform hyperparameter tuning on n_estimators, max_depth, and min_samples_leaf
# create a list of values for n_estimators
estimators = [100, 200, 300]

# create a list of values for max_depth
depth = [5, 10, 15]

# create a list of values for min_samples_leaf
leaf = [1, 2, 3]

# create a list of hyperparameter options
hyperparameters = dict(n_estimators=estimators, max_depth=depth, min_samples_leaf=leaf)

# create a random forest classifier
rf = RandomForestClassifier(random_state=621)

# use GridSearch to search for the best hyperparameters
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=0)

# fit the model to the training data
best_model = clf.fit(X_train, y_train)

# print the best hyperparameters
print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'])
print('Best max_depth:', best_model.best_estimator_.get_params()['max_depth'])
print('Best min_samples_leaf:', best_model.best_estimator_.get_params()['min_samples_leaf'])


Best n_estimators: 200
Best max_depth: 5
Best min_samples_leaf: 1


In [6]:
# report bestmodel results
print('Best Model Train Accuracy:', best_model.score(X_train, y_train))

Best Model Train Accuracy: 0.7143523920653442


In [10]:
# create a random forest classifier using n_estimators=200, max_depth=5, and min_samples_leaf=1
rf = RandomForestClassifier(n_estimators=200, max_depth=5, min_samples_leaf=1, random_state=621)

# fit the model to the training data
rf.fit(X_train, y_train)

# predict the labels of the test set
y_pred = rf.predict(X_test)

# calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6147388059701493

In [11]:
# report feature importances from best model
feature_importances = pd.DataFrame(best_model.best_estimator_.feature_importances_,
                                      index = X_train.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head(10)


Unnamed: 0,importance
score_diff_homeAvg,0.08024
score_diff_awayAvg,0.07881
winner_homeAvg,0.059656
winner_awayAvg,0.048061
score_awayAvg,0.039329
score_homeAvg,0.028279
score_opp_homeAvg,0.025892
redzone_sucess_awayAvg,0.022518
sacks_awayAvg,0.01835
turnovers_awayAvg,0.017645
