In [1]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import KFold

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [4]:
from src.split_dataframe import split_df
from src.remove_correlated_stats import remove_corr_stats
from src.my_predicted_stats import my_pred_stats

In [None]:
df = pd.read_csv('data/aggregated_2014_to_2019.csv')

In [None]:
df = df.sort_values(by=['season','week','game_id']).reset_index(drop=True)

In [None]:
df_train, df_test = split_df(df, 0.3)

# EDA

In [None]:
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
df.describe()

In [None]:
df.info()

### Split df_train in half to help with visualizing all of the data

In [None]:
df_train_1 = df_train.iloc[:,3:12]
df_train_2 = df_train.iloc[:,12:20]
df_train_3 = df_train.iloc[:,20:30]
df_train_4 = df_train.iloc[:,30:40]
df_train_5 = df_train.iloc[:,40:]

In [None]:
fig,ax = plt.subplots(figsize=(10,10))

ax = sns.heatmap(df_train_1.corr(), annot=True, cmap='coolwarm')
ax.set_title('Correlation Map')
plt.savefig('images/heat_map_1.png')

In [None]:
fig,ax = plt.subplots(figsize=(10,10))

ax = sns.heatmap(df_train_2.corr(), annot=True, cmap='coolwarm')
ax.set_title('Correlation Map')
plt.savefig('images/heat_map_2.png')

In [None]:
fig,ax = plt.subplots(figsize=(10,10))

ax = sns.heatmap(df_train_3.corr(), annot=True, cmap='coolwarm')
ax.set_title('Correlation Map')
plt.savefig('images/heat_map_3.png')

In [None]:
fig,ax = plt.subplots(figsize=(10,10))

ax = sns.heatmap(df_train_4.corr(), annot=True, cmap='coolwarm')
ax.set_title('Correlation Map')
plt.savefig('images/heat_map_4.png')

In [None]:
fig,ax = plt.subplots(figsize=(10,10))

ax = sns.heatmap(df_train_5.corr(), annot=True, cmap='coolwarm')
ax.set_title('Correlation Map')
plt.savefig('images/heat_map_5.png')

## Based off of the EDA, decided to drop the following columns:
columns_to_remove = ['passAttempts', 'passYardPerAtt', 'passIntPct', 'pass40Plus', 'sacks_allowed_yard','rush1stdowns', 'rush40plus']

# Random Forest Classifier - My Stats

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import KFold

In [12]:
df = pd.read_csv('data/aggregated_2014_to_2019.csv')
df = df.sort_values(by=['season','week','game_id']).reset_index(drop=True)
df = remove_corr_stats(df)
my_df = my_pred_stats(df)

In [13]:
#df

In [14]:
my_df = my_df.sort_values(by=['season','week','game_id']).reset_index(drop=True)
df_train, df_test = split_df(my_df, 0.3)
y_train = np.array(df_train.pop('win_game'))
X_train = np.array(df_train.iloc[:,5:])
y_test = np.array(df_test.pop('win_game'))
X_test = np.array(df_test.iloc[:,5:])

### 10 fold split

In [15]:
kfold = KFold(n_splits=10)
accuracies = []
precisions = []
recalls = []
for train_index, test_index in kfold.split(X_train):
    model = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
    model.fit(X_train[train_index], y_train[train_index])
    y_predict = model.predict(X_train[test_index])
    y_true = y_train[test_index]
    accuracies.append(accuracy_score(y_true, y_predict))
    precisions.append(precision_score(y_true, y_predict))
    recalls.append(recall_score(y_true, y_predict))
print("Accuracy:", np.average(accuracies))
print("Precision:", np.average(precisions))
print("Recall:", np.average(recalls))

Accuracy: 0.6
Precision: 0.6054403226602518
Recall: 0.5690980943455171


### Full model

In [16]:
model = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
model.fit(X_train, y_train)
df_test['win_perc'] = model.predict_proba(X_test)[:,1]
df_test['win_game'] = y_test
summarized_df = df_test[['game_id','season', 'week','team','opponent','win_perc','win_game']]
summarized_df = summarized_df.sort_values(by='game_id').reset_index(drop=True)
summarized_df.reset_index(drop=True, inplace=True)
summarized_df['predicted_win'] = None
for i in range(0, len(summarized_df), 2):
    if summarized_df.loc[i, 'win_perc'] > summarized_df.loc[i+1, 'win_perc']:
        summarized_df.loc[i, 'predicted_win'] = 1
        summarized_df.loc[i+1, 'predicted_win'] = 0
    else:
        summarized_df.loc[i, 'predicted_win'] = 0
        summarized_df.loc[i+1, 'predicted_win'] = 1
y_true = np.array(summarized_df['win_game'])
y_predict = np.array(summarized_df['predicted_win'])
print("Accuracy:", accuracy_score(y_true, y_predict))
print("Precision:", precision_score(y_true, y_predict))
print("Recall:", recall_score(y_true, y_predict))

Accuracy: 0.6147342995169082
Precision: 0.6111111111111112
Recall: 0.6155717761557178


In [17]:
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = df_train.columns[5:],
                                    columns=['importance']).sort_values('importance',ascending=False)

In [18]:
feature_importances

Unnamed: 0,importance
team_score,0.212932
wins_past_games,0.155317
qb_rating,0.140982
opponent_score,0.11104
playing_at_home,0.108067
fumbles,0.070883
passInt,0.056935
sacks,0.055226
passTD,0.0466
interceptions,0.042019


# Random Forest Classifier - All Stats

In [19]:
df = pd.read_csv('data/aggregated_2014_to_2019.csv')
df = df.sort_values(by=['season','week','game_id']).reset_index(drop=True)
df = remove_corr_stats(df)

In [20]:
df = df.sort_values(by=['season','week','game_id']).reset_index(drop=True)
df_train, df_test = split_df(df, 0.3)
y_train = np.array(df_train.pop('win_game'))
X_train = np.array(df_train.iloc[:,5:])
y_test = np.array(df_test.pop('win_game'))
X_test = np.array(df_test.iloc[:,5:])

### 10 fold split

In [21]:
kfold = KFold(n_splits=10)
accuracies = []
precisions = []
recalls = []
for train_index, test_index in kfold.split(X_train):
    model = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
    model.fit(X_train[train_index], y_train[train_index])
    y_predict = model.predict(X_train[test_index])
    y_true = y_train[test_index]
    accuracies.append(accuracy_score(y_true, y_predict))
    precisions.append(precision_score(y_true, y_predict))
    recalls.append(recall_score(y_true, y_predict))
print("Accuracy:", np.average(accuracies))
print("Precision:", np.average(precisions))
print("Recall:", np.average(recalls))

Accuracy: 0.5988309490847562
Precision: 0.5997123627023161
Recall: 0.5859019396574625


### Full model

In [22]:
model = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
model.fit(X_train, y_train)
df_test['win_perc'] = model.predict_proba(X_test)[:,1]
df_test['win_game'] = y_test
summarized_df = df_test[['game_id','season', 'week','team','opponent','win_perc','win_game']]
summarized_df = summarized_df.sort_values(by='game_id').reset_index(drop=True)
summarized_df.reset_index(drop=True, inplace=True)
summarized_df['predicted_win'] = None
for i in range(0, len(summarized_df), 2):
    if summarized_df.loc[i, 'win_perc'] > summarized_df.loc[i+1, 'win_perc']:
        summarized_df.loc[i, 'predicted_win'] = 1
        summarized_df.loc[i+1, 'predicted_win'] = 0
    else:
        summarized_df.loc[i, 'predicted_win'] = 0
        summarized_df.loc[i+1, 'predicted_win'] = 1
y_true = np.array(summarized_df['win_game'])
y_predict = np.array(summarized_df['predicted_win'])
print("Accuracy:", accuracy_score(y_true, y_predict))
print("Precision:", precision_score(y_true, y_predict))
print("Recall:", recall_score(y_true, y_predict))

Accuracy: 0.6457345971563981
Precision: 0.6445497630331753
Recall: 0.6460807600950119


In [23]:
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = df_train.columns[5:],
                                    columns=['importance']).sort_values('importance',ascending=False)

In [24]:
feature_importances

Unnamed: 0,importance
wins_past_games,0.085259
team_score,0.079887
qb_rating,0.058771
sacks_allowed,0.050632
opponent_score,0.048091
third_down_pct,0.040742
punt_inside_20_pct,0.039222
passPct,0.038062
passInt,0.034134
rushAttempts,0.029067
