In [48]:
import sys
import pandas as pd
import numpy as np
sys.path.append('c:\\Users\\tyler\\OneDrive\\Documents\\Python\\NFL')
from backend.preprocess.preprocess import main as load_data
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import os
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [49]:
os.chdir('c:\\Users\\tyler\\OneDrive\\Documents\\Python\\NFL')
os.getcwd()

'c:\\Users\\tyler\\OneDrive\\Documents\\Python\\NFL'

In [50]:
# Load data
df = load_data()

last_season = df[[index[0].year >= 2021 for index in df.index]]
df = df[[index[0].year < 2021 for index in df.index]]
X = df.drop(['y'], axis=1)
y = df[['y']]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

# Pipeline
pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('nn', MLPClassifier(random_state=1))
    ]
)


Executing load_data ...
	Done: 4.8376s
Executing preprocess ...
	Done: 2.1822s
Executing load_target_data ...
	Done: 0.9124s
Executing merge_x_y ...
	Done: 0.0490s


In [51]:
parameters = {
    'nn__hidden_layer_sizes': [
        (100, ), (200, ), (300, ),
        (100, 100), (200, 200), (300, 300), 
        (200, 100), (300, 100), (300, 200), (300, 200, 100), 
        (100, 100, 100), (200, 200, 200), (300, 300, 300), 
        (200, 100, 100), (200, 200, 100), (300, 200, 100), (300, 300, 200), (300, 300, 100), (300, 200, 200)
    ],
    'nn__activation': ['tanh'],
    'nn__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'nn__learning_rate_init': [.1, .01, .001, .0001, .00001]
}

In [52]:
search = GridSearchCV(
    pipe,
    parameters,
    scoring='accuracy',
    n_jobs=-1,
    cv=KFold(n_splits=5, shuffle=True, random_state=1)
).fit(X_train, y_train)

In [57]:
results_df = pd.DataFrame(search.cv_results_)
results_df.sort_values(by=['rank_test_score'], inplace=True)
results_df.set_index(['param_nn__activation', 'param_nn__hidden_layer_sizes', 'param_nn__learning_rate', 'param_nn__learning_rate_init'], inplace=True)
results_df[['mean_test_score', 'mean_test_score', 'rank_test_score']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean_test_score,mean_test_score,rank_test_score
param_nn__activation,param_nn__hidden_layer_sizes,param_nn__learning_rate,param_nn__learning_rate_init,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tanh,"(200, 200, 100)",constant,0.00001,0.627128,0.627128,1
tanh,"(200, 200, 100)",invscaling,0.00001,0.627128,0.627128,1
tanh,"(200, 200, 100)",adaptive,0.00001,0.627128,0.627128,1
tanh,"(100, 100)",constant,0.00001,0.624468,0.624468,4
tanh,"(100, 100)",invscaling,0.00001,0.624468,0.624468,4
tanh,...,...,...,...,...,...
tanh,"(300,)",constant,0.10000,0.559043,0.559043,280
tanh,"(300,)",invscaling,0.10000,0.559043,0.559043,280
tanh,"(300, 200, 200)",adaptive,0.01000,0.555851,0.555851,283
tanh,"(300, 200, 200)",constant,0.01000,0.555851,0.555851,283


In [54]:
search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('nn',
                 MLPClassifier(activation='tanh',
                               hidden_layer_sizes=(200, 200, 100),
                               learning_rate_init=1e-05, random_state=1))])

In [55]:
scores = cross_val_score(
    search.best_estimator_, 
    X_train, y_train, 
    cv=KFold(n_splits=5, shuffle=True, random_state=1)
)
print(f'Mean: {scores.mean()} Std: {scores.std()}')
print(search.best_estimator_.score(X_test, y_test))

Mean: 0.6271276595744681 Std: 0.018985710954866737
0.6148936170212767
