<h1><center>Predicting NBA Game Attendance Using Numerous Regression Techniques </center></h1>

In [None]:
%reload_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor

import src.data.datasets as ds
import src.data.train_test_split as split
import src.features.clustering as clustering
import src.features.decomposition as decomposition
import src.features.statistical_tests as st
import src.initialize_jupyter
import src.models.ensemble_models as ensembles
import src.models.linear_models as linear_models
import src.models.metrics as metrics
import src.models.neural_networks as nn
import src.models.other_models as other_models
import src.visualization.data_exploration as de

## Load data
---
This data was originally scraped from multiple sources and includes game data since the 1998-1999 season.\
Three datasets have been created for use with many regression techniques:
- dataset_1: Game data since January 2004, including Google Trends monthly popularity data per team, filtering of games based on usage of current-day stadia, and stadium capacities
- dataset_2: Game data since Fall, 1998 not including popularity, filtering, or capacities
- dataset_3: Game data since 1990 including filtering and capacities, but not popularity

In [None]:
# Build the datasets described above; they are read back with ds.load_datasets() in the next cell.
# NOTE(review): presumably this regenerates the datasets on every run — confirm whether it should be cached.
ds.create_datasets()

In [None]:
# Load the prepared datasets into a mapping indexed by dataset number as a string ('3' is used below).
datasets = ds.load_datasets()
# Display dataset_3, the dataset used for the rest of this notebook.
datasets['3']

## Create train-test split of dataset and one hot encode categorical features
---

In [None]:
# Split dataset_3 into train/test partitions. `train` is the extra frame returned by
# split.split and is passed to the exploration plots further down.
X_train, X_test, y_train, y_test, train = split.split(datasets['3'])

# Each format string has a single placeholder, so only the value it actually prints is passed.
print("{} observations in training set".format(len(X_train)))
print("{} observations in test set".format(len(X_test)))
# NOTE(review): the 6 / 84 breakdown is hard-coded text; only the total feature count is computed.
print("Features: {}: 6 numerical, 84 binary categorical; Response: 1 numerical".format(len(X_test.columns)))
X_train.head()

## Visualizations for data exploration and feature engineering/selection
---

In [None]:
# Produce the data-exploration plots for the `train` frame returned by split.split.
# NOTE(review): 'dataset_3' looks like a label used for titles/output names — confirm in src.visualization.
de.create_all_plots('dataset_3', train)

In [None]:
# Run the PCA component analysis on the training features only (the test split is not passed).
decomposition.pca_component_analysis('dataset_3', X_train)

In [None]:
# Run the PCA cross-validation helper with both the train and test splits.
# NOTE(review): any return value is discarded here — confirm whether it is needed later.
decomposition.pca_cv('dataset_3',X_train,X_test,y_train,y_test)

In [None]:
# K-means elbow analysis on the training features using the helper's default settings.
clustering.elbow_method_kmeans('dataset_3', X_train)

In [None]:
# Repeat the elbow analysis with two explicit positional settings (25 and 6).
# NOTE(review): the meaning of 25 and 6 is not visible here — confirm against
# clustering.elbow_method_kmeans and consider passing them by keyword.
clustering.elbow_method_kmeans('dataset_3', X_train, 25,6)

In [None]:
# Silhouette analysis of clusterings of the training features.
clustering.silhouettes('dataset_3', X_train)

## Further consideration of feature selection through analytic metrics
---

In [None]:
# Run the statistical feature tests on the training features against the training response.
st.collect_tests('dataset_3', X_train, y_train)

### Creating a subset of dataset_1 based on a best guess of the important features, and preparing it for modeling

In [None]:
# Build the reduced-feature variant of dataset_1 and split it for modeling.
# The original cell read an undefined `dataset_1` and bound its results to `dataset_1_0` /
# `X_train_0` while the following lines used `dataset_1_1` / `X_train_1` (NameError on a fresh kernel).
# NOTE(review): assumes dataset_1 is stored under key '1', mirroring datasets['3'] — confirm.
dataset_1_1 = ds.create_dataset_1_1(datasets['1'])
print("dataset_1_1 contains: {} observations; {} features: 4 numerical, 3 multiclass categorical, 2 binary categorical; 1 Response".format(len(dataset_1_1), len(dataset_1_1.columns)-1))

X_train_1, X_test_1, y_train_1, y_test_1, train_1 = split.split(dataset_1_1)
print("After split contains:")
print("{} observations in training set".format(len(X_train_1)))
print("{} observations in test set".format(len(X_test_1)))
# NOTE(review): the feature counts below are hard-coded text, not computed from the frames.
print("28 features: 4 numerical, 24 binary categorical; 1 Response")

In [None]:
# Split dataset_1_1 into train/test partitions; this binds X_train_1 / X_test_1 / y_train_1 / y_test_1 / train_1.
X_train_1, X_test_1, y_train_1, y_test_1, train_1 = split.split(dataset_1_1)

## Modeling
---

### Start with linear models that generally have simpler hyperparameters to tune

In [None]:
# Fit the linear models on dataset_3 and collect their test-set statistics.
# reset_index() moves the existing index into an 'index' column so it can be selected like any other column.
linear_statistics = linear_models.collect_statistics('dataset_3', X_train, X_test, y_train, y_test).reset_index()

# Optional per-metric leaderboards (left disabled; column names to be confirmed against collect_statistics):
# display(linear_statistics.sort_values(['Mean Absolute Error'])[['index', 'Mean Absolute Error']].head())
# display(linear_statistics.sort_values(['R^2'], ascending=False)[['index', 'R^2']].head())
# linear_statistics.sort_values(['Root Mean Square Error'])[['index', 'Root Mean Square Error']].head()

In [None]:
# Display the collected linear-model statistics.
# The frame is already reset_index()-ed where it is created; resetting it again here added a
# redundant 'level_0' column and raised ValueError when the cell was run a second time.
linear_statistics

### Next, ensemble methods 

In [None]:
# Baseline (untuned) ensemble regressors on dataset_3: fit each on the training split,
# predict the test split, and collect the project metrics for every model into one frame.
# NOTE(review): criterion='mae' is the pre-1.0 scikit-learn spelling; scikit-learn >= 1.2 only
# accepts 'absolute_error' — adjust to the installed version.
baseline_models = {
    'rf': RandomForestRegressor(n_jobs=-1, random_state=18, criterion='mae'),
    # AdaBoostRegressor has no n_jobs parameter; passing it raised TypeError.
    'ab': AdaBoostRegressor(random_state=18),
    'gbr': GradientBoostingRegressor(random_state=18),
    # The original never fitted this estimator and called .ravel() on the estimator itself.
    'et': ExtraTreesRegressor(n_jobs=-1, random_state=18),
}

baseline_metrics = []
for label, model in baseline_models.items():
    predictions = model.fit(X_train, y_train).predict(X_test)
    baseline_metrics.append(
        metrics.apply_metrics('dataset_3: {}'.format(label), y_test, predictions.ravel())
    )

# One row block per model, in the order rf, ab, gbr, et.
df = pd.concat(baseline_metrics, axis=0)

In [None]:
# Show the baseline ensemble metrics assembled in the previous cell.
df

In [None]:
# Hyperparameter search for the random forest on the training split, with cv = 5.
# The returned object exposes cv_results_, which is tabulated in the next cell.
random_forest_cv = ensembles.random_forest_grid_cv(X_train, y_train, cv = 5)

In [None]:
# Tabulate the random-forest search results: keep the parameter sets plus the rank/mean of every
# scorer, then order the candidates by rank (Mean Absolute Error first, remaining ranks as tie-breakers).
rf_result_columns = [
    'params',
    'rank_test_R^2', 'mean_test_R^2',
    'rank_test_Explained Variance Score', 'mean_test_Explained Variance Score',
    'rank_test_Mean Absolute Error', 'mean_test_Mean Absolute Error',
    'rank_test_Root Mean Square Error', 'mean_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error', 'mean_test_Mean Absolute Percent Error',
]
rf_rank_priority = [
    'rank_test_Mean Absolute Error',
    'rank_test_R^2',
    'rank_test_Root Mean Square Error',
    'rank_test_Explained Variance Score',
    'rank_test_Mean Absolute Percent Error',
]

rf_all_results = pd.DataFrame.from_dict(random_forest_cv.cv_results_)
random_forest_cv_df = rf_all_results[rf_result_columns].sort_values(rf_rank_priority)

# Parameter sets of the ten best-ranked candidates.
random_forest_cv_df.head(10)['params'].values

In [None]:
# Persist the ranked random-forest table under <current working dir>/models/cross_validation_outcomes/.
to_save = Path().resolve().joinpath('models', 'cross_validation_outcomes', '{}.csv'.format('random_forest_random_cv_1'))
# DataFrame.to_csv does not create missing directories, so make sure the target folder exists.
to_save.parent.mkdir(parents=True, exist_ok=True)
random_forest_cv_df.to_csv(to_save)

In [None]:
# Randomized hyperparameter search for AdaBoost on the training split (n_iter = 25, cv = 5).
adaboost_cv = ensembles.adaboost_randomized_cv(X_train, y_train, n_iter = 25, cv = 5)

In [None]:
# Raw search results for AdaBoost; the next cell reduces them to the ten best-ranked parameter sets.
adaboost_cv.cv_results_

In [None]:
# Ten best AdaBoost parameter sets, ordered by Mean Absolute Error rank with the other ranks as tie-breakers.
ab_rank_columns = [
    'params',
    'rank_test_R^2',
    'rank_test_Explained Variance Score',
    'rank_test_Mean Absolute Error',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]
ab_rank_priority = [
    'rank_test_Mean Absolute Error',
    'rank_test_R^2',
    'rank_test_Explained Variance Score',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]

ab_ranked = pd.DataFrame.from_dict(adaboost_cv.cv_results_)[ab_rank_columns].sort_values(ab_rank_priority)
top_ten_by_all_rank = ab_ranked.head(10)['params'].values
top_ten_by_all_rank

In [None]:
# Randomized hyperparameter search for gradient boosting on the training split (n_iter = 25, cv = 5).
gradient_boosting_cv = ensembles.gradient_boosting_randomized_cv(X_train, y_train, n_iter = 25, cv= 5)

In [None]:
# Raw search results for gradient boosting.
# (The original referenced `gradient_boosting_cv_boosting_cv`, a name that is never defined.)
gradient_boosting_cv.cv_results_

In [None]:
# Ten best gradient-boosting parameter sets, ordered by Mean Absolute Error rank with the other ranks as tie-breakers.
gb_rank_columns = [
    'params',
    'rank_test_R^2',
    'rank_test_Explained Variance Score',
    'rank_test_Mean Absolute Error',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]
gb_rank_priority = [
    'rank_test_Mean Absolute Error',
    'rank_test_Explained Variance Score',
    'rank_test_R^2',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]

gb_ranked = pd.DataFrame.from_dict(gradient_boosting_cv.cv_results_)[gb_rank_columns].sort_values(gb_rank_priority)
top_ten_by_all_rank = gb_ranked.head(10)['params'].values
top_ten_by_all_rank

In [None]:
# Randomized hyperparameter search for extra trees on the training split (n_iter = 25, cv = 5).
extra_trees_cv = ensembles.extra_trees_randomized_cv(X_train, y_train, n_iter = 25, cv = 5)

In [None]:
# Raw search results for extra trees; the next cell reduces them to the ten best-ranked parameter sets.
extra_trees_cv.cv_results_

In [None]:
# Ten best extra-trees parameter sets, ordered by Mean Absolute Error rank with the other ranks as tie-breakers.
et_rank_columns = [
    'params',
    'rank_test_R^2',
    'rank_test_Explained Variance Score',
    'rank_test_Mean Absolute Error',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]
et_rank_priority = [
    'rank_test_Mean Absolute Error',
    'rank_test_Explained Variance Score',
    'rank_test_R^2',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]

et_ranked = pd.DataFrame.from_dict(extra_trees_cv.cv_results_)[et_rank_columns].sort_values(et_rank_priority)
top_ten_by_all_rank = et_ranked.head(10)['params'].values
top_ten_by_all_rank

### Other Models 

In [None]:
# Randomized hyperparameter search for k-nearest neighbors on the training split.
# NOTE(review): 25 and 5 are passed positionally; presumably n_iter and cv as in the ensemble
# searches above — confirm against other_models.k_neighbors_randomized_cv.
k_neighbors_randomized = other_models.k_neighbors_randomized_cv(X_train, y_train, 25, 5)

In [None]:
# Rank every k-neighbors candidate by Mean Absolute Error rank (other ranks break ties)
# and show the ordered parameter sets as a 2-D array with one 'params' column.
knn_columns = [
    'params',
    'mean_test_R^2',
    'rank_test_R^2',
    'rank_test_Explained Variance Score',
    'rank_test_Mean Absolute Error',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]
knn_rank_priority = [
    'rank_test_Mean Absolute Error',
    'rank_test_Explained Variance Score',
    'rank_test_R^2',
    'rank_test_Root Mean Square Error',
    'rank_test_Mean Absolute Percent Error',
]

df = pd.DataFrame.from_dict(k_neighbors_randomized.cv_results_)
knn_ranked = df[knn_columns].sort_values(knn_rank_priority)
knn_ranked[['params']].values

### Moving on to neural networks

In [None]:
# Grid search over the single-layer network on the training split.
# NOTE(review): nn_cv is not read by any later cell visible in this notebook.
nn_cv = nn.single_layer_network_grid_cv(X_train, y_train)

In [None]:
# Show the five rows with the highest R^2.
# NOTE(review): `nn_statistics` is not defined in any cell visible in this notebook, so this raises
# NameError on a fresh kernel — confirm where it should come from (possibly derived from nn_cv).
nn_statistics.sort_values(['R^2'], ascending = False).head()

In [None]:
# Candidate neuron counts for a parameter grid: 1, then 5 through 30 in steps of 5.
neurons = [1] + list(range(5, 31, 5))
param_grid = {'neurons': neurons}
param_grid

In [None]:
# Train the single-layer network with its default settings and predict the test split.
# y_test is not passed; scoring happens separately via metrics.apply_metrics.
preds = nn.single_layer_network('dataset_3',X_train, X_test, y_train)

In [None]:
# Inspect the predictions flattened to one dimension, the form handed to metrics.apply_metrics.
preds.ravel()

In [None]:
# Score the flattened predictions against the test response.
# NOTE(review): the label '30' is presumably a neuron count — confirm it matches the network that produced `preds`.
metrics.apply_metrics('30', y_test, preds.ravel())

In [None]:
# Display `df`.
# NOTE(review): in top-to-bottom order `df` still holds the k-neighbors cv_results_ frame at this
# point; the neuron-sweep results are only assigned to `df` in the next cell.
df

In [None]:
# Sweep the neuron count from 5 to 130 in steps of 5: train the single-layer network for each
# setting, score its test-set predictions, and stack the per-setting metrics into one frame.
neuron_metrics = []
for i in range(5, 131, 5):
    # The dataset label goes first, matching the other nn.single_layer_network calls in this
    # notebook; without it every argument was shifted one position to the left.
    preds = nn.single_layer_network('dataset_3', X_train, X_test, y_train, i)
    neuron_metrics.append(
        metrics.apply_metrics('dataset_3: {} neurons'.format(i), y_test, preds.ravel())
    )

# Concatenate once instead of growing the frame inside the loop.
df = pd.concat(neuron_metrics, axis = 0)
df

In [None]:
# Persist the neuron-sweep metrics under <current working dir>/models/cross_validation_outcomes/.
to_save = Path().resolve().joinpath('models', 'cross_validation_outcomes', '{}.csv'.format('single_layer_network_neuron_cv'))
# DataFrame.to_csv does not create missing directories, so make sure the target folder exists.
to_save.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(to_save)

In [None]:
# Train the single-layer network with the extra setting 10, predict the test split, and keep
# only the resulting metrics in `preds`.
network_output = nn.single_layer_network('dataset_3', X_train, X_test, y_train, 10)
flat_output = network_output.ravel()
preds = metrics.apply_metrics('dataset_3', y_test, flat_output)

In [None]:
# Display `preds`, which after the previous cell holds the output of metrics.apply_metrics rather than raw predictions.
preds

In [None]:
# The original cell was an unfinished assignment (`pca_cv = `), which is a SyntaxError.
# NOTE(review): assumed intent is to capture the result of the PCA cross-validation helper called
# earlier in the notebook — confirm that decomposition.pca_cv returns a value worth displaying.
pca_cv = decomposition.pca_cv('dataset_3', X_train, X_test, y_train, y_test)

In [None]:
# Display the value bound to `pca_cv` in the previous cell.
pca_cv