In [1]:
# load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy.signal import savgol_filter
import statsmodels.api as sm
import pymc3 as pm
import statsmodels.api as sm
from statsmodels.tools import add_constant
from itertools import combinations
# settings for seaborn plotting style
sns.set(color_codes=True)
# settings for seaborn plot sizes
sns.set(rc={'figure.figsize':(12,6)})
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [2]:
df = pd.read_pickle('nfl_df_averages.pkl')
# create X and y
# define X to be df with all columns that have Avg in the name
X = df[[col for col in df.columns if 'Avg' in col]]
# remove home_homeAvg and home_awayAvg
X = X.drop(['home_homeAvg', 'home_awayAvg'], axis=1)
# y is the winner column
y = df['winner']
# do an 80/20 train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# create a Naive Bayes model
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# fit the model
gnb.fit(X_train, y_train)
# predict on the test set
y_pred = gnb.predict(X_test)
# calculate the accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6277985074626866


In [4]:
# do hyperparameter tuning on the Naive Bayes model
from sklearn.model_selection import GridSearchCV
# define the parameter values that should be searched
param_grid = {'var_smoothing': np.logspace(0,-9, num=10)}
# instantiate the grid
grid = GridSearchCV(gnb, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid.fit(X, y)
# view the complete results
grid.cv_results_
# examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
# predict on the test set
y_pred = grid.predict(X_test)
# calculate the accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))


0.6316961919375086
{'var_smoothing': 1e-05}
GaussianNB(var_smoothing=1e-05)
Accuracy: 0.644589552238806


In [5]:
# count how many time y_pred is equal to y_test
np.sum(y_pred == y_test)
# count how many times y_pred is not equal to y_test
np.sum(y_pred != y_test)
# calculate the accuracy
np.sum(y_pred == y_test) / len(y_test)

0.644589552238806