In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time as time_calc
from time import time

import category_encoders as ce
from scipy.stats import randint
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from BorutaShap import BorutaShap

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.utils.fixes import loguniform
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

import joblib
import operator

## Import data from nfl-data-py
##### https://pypi.org/project/nfl-data-py/

In [2]:
# Load the post-processed NFL play-by-play dataset used for multiclass play classification.
# NOTE(review): the previous comment described this file as "5-years ... (2020-2021)",
# which is self-contradictory — confirm which seasons the CSV actually covers.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
csv_path = r'/Users/ttas2/Documents/Python/nfl-machine-learning-models/play_predictions/output_files/nfl_post_processing_multiclass_play_classification_data.csv'
data = pd.read_csv(csv_path)
df = pd.DataFrame(data)

df.shape

(33548, 83)

In [3]:
# List every column that holds at least one missing value
columns_with_nulls = df.columns[df.isna().any()].tolist()
print(columns_with_nulls)

[]


In [4]:
# Convert binary columns to integers
# A column is treated as binary when every value in it is 0 or 1.
binary_columns = df.columns[df.isin([0,1]).all()].tolist()

# Downcast column-by-column (the default apply axis). The previous axis=1 call ran
# pd.to_numeric once per ROW, which is needlessly slow on tens of thousands of rows.
df[binary_columns] = df[binary_columns].apply(pd.to_numeric, downcast='integer', errors='coerce')

df.sample(3).style

Unnamed: 0,posteam,surface,drive_start,week,goal_to_go,shotgun,no_huddle,defteam_score,score_differential,no_score_prob,safety_prob,spread_line,div_game,play_sequence_game,run_formation,dtg_99to95,dtg_94to90,dtg_40to31,dtg_30to21,dtg_20to11,dtg_10to06,dtg_05to01,prev1_big_play_pass,prev2_big_play_pass,prev3_big_play_pass,prev1_big_play_run,prev2_big_play_run,prev3_big_play_run,prev1_negative_pass,prev2_negative_pass,prev3_negative_pass,prev1_negative_run,prev2_negative_run,prev3_negative_run,prev1_play_off_penalty,prev1_play_def_penalty,prev2_play_off_penalty,prev2_play_def_penalty,prev3_play_off_penalty,prev3_play_def_penalty,prev1_play_run_outside,prev1_play_run_inside,prev1_play_pass_deep,prev1_play_pass_short,prev2_play_run_outside,prev2_play_run_inside,prev2_play_pass_deep,prev2_play_pass_short,prev3_play_run_outside,prev3_play_run_inside,prev3_play_pass_deep,prev3_play_pass_short,prev1_incomplete_pass,prev2_incomplete_pass,prev3_incomplete_pass,prev3_yards_gained,prev1_wpa,prev2_wpa,prev3_wpa,prev1_shotgun,prev2_shotgun,prev3_shotgun,prev1_qb_hit,prev2_qb_hit,prev3_qb_hit,prev1_no_huddle,prev2_no_huddle,prev3_no_huddle,prev1_first_down_pass,prev2_first_down_pass,prev3_first_down_pass,prev1_first_down_run,prev2_first_down_run,prev3_first_down_run,prev1_effct_play,prev2_effct_play,prev3_effct_play,remaining_yards_per_down,two_min_warning,ep_sec_ratio,run_ratio_off_priors,run_ratio_def_priors,play_type
11311,DET,turf,kickoff,15,0,0,0,12,15,0.384871,0.000871,-13.0,0,146,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,3.5e-05,-0.000154,0.000896,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10.0,0,0.007285,0.066667,0.233333,outside
31992,TEN,turf,kickoff,9,0,0,0,3,18,0.003729,0.00436,7.0,0,119,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,2.0,0.01063,0.014166,-0.006187,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0,3.333333,0,0.001478,0.412698,0.527778,deep
22211,NE,turf,sudden_change,2,0,0,0,0,3,0.019669,0.001378,-5.5,1,21,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,5.0,5.9e-05,0.011849,-0.001239,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,3.333333,0,0.003579,0.5,0.4,short


In [None]:
# Relative frequency of each target class (normalize=True yields proportions, not raw counts)
target_count = df['play_type'].value_counts(normalize=True)
target_count

## Train test split data
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Separate the target column from the feature columns
y = df['play_type']
X = df.drop(columns='play_type')

# Names of the candidate features before any encoding or selection
initial_features = list(X.columns)

# Stratified 80/20 train/test split so both sets keep the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state=67)

print('Starting analysis with', X_train.shape[1], 'features')

## Encode categorical features
##### https://contrib.scikit-learn.org/category_encoders/leaveoneout.html

In [None]:
# Collect the names of the object-dtype (categorical) columns
categorical_feat = X_train.select_dtypes(include='object').columns.tolist()

# Alternative kept for reference: one-hot encoding of the categorical features
# encoder = ce.OneHotEncoder(return_df=True, cols=categorical_feat, use_cat_names=True)

# Leave-one-out target encoding: each category is replaced by a single column derived
# from the response over the other rows of that category, which limits direct leakage.
# NOTE(review): y_train holds string class labels (e.g. 'short', 'deep', 'outside');
# confirm that LeaveOneOutEncoder accepts a non-numeric multiclass target before relying on it.
encoder = ce.LeaveOneOutEncoder(cols=categorical_feat, return_df=True)

# Fit on the training split only, then apply the learned mapping to the test split
X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)

print(X_train.shape[1], 'features after categorical encoding')

## BorutaShap Feature selection
##### https://pypi.org/project/BorutaShap/
##### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [None]:
# Hyperparameters of the gradient-boosting model that BorutaShap uses to rank features
feature_model_params = {
    'loss': 'log_loss',
    'criterion': 'friedman_mse',
    'n_estimators': 100,
    'learning_rate': 0.05,
    'subsample': 0.5,
    'max_depth': 40,
    'max_leaf_nodes': 30,
    'max_features': 0.4,
    'min_samples_split': 0.05,
    'tol': 1e-06,
    'random_state': 67,
}

# Specify feature selection model
feature_model = GradientBoostingClassifier(**feature_model_params)

In [7]:
# Build the BorutaShap selector around the gradient-boosting `feature_model`
# (an explicit model IS passed here, so BorutaShap's default estimator is not used)
selector_options = dict(model=feature_model,
                        importance_measure='shap',
                        classification=True,
                        percentile=100,
                        pvalue=0.05)
Feature_Selector = BorutaShap(**selector_options)

# Run the selection on the training split only
fit_options = dict(n_trials=100,
                   sample=False,
                   train_or_test='test',
                   normalize=True,
                   verbose=True,
                   random_state=67)
Feature_Selector.fit(X=X_train, y=y_train, **fit_options)

118 features after categorical encoding


In [None]:
# Plot the BorutaShap importance of every feature considered by the selector
Feature_Selector.plot(which_features='all', figsize=(18, 8))

In [8]:
# Columns that BorutaShap identified as not important
features_to_remove = Feature_Selector.features_to_remove

# Remove those columns from both splits so train and test keep identical schemas
X_train, X_test = (split.drop(columns=features_to_remove) for split in (X_train, X_test))

# Report the shape of the reduced datasets
remaining_columns = X_train.columns.to_list()
print('Training features:', X_train.shape[1])
print('Testing features:', X_test.shape[1])
print('Remaining features:', remaining_columns)

## Baseline model for comparison
##### https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html

In [None]:
# Create and fit baseline model to compare performance.
# The 'most_frequent' strategy always predicts the majority class of y_train.
baseline_model = DummyClassifier(strategy='most_frequent', random_state=67)
baseline_model.fit(X_train, y_train)

# Calculate model accuracy on test data
y_baseline_pred = baseline_model.predict(X_test)
y_baseline_balanced_accuracy_score = balanced_accuracy_score(y_test, y_baseline_pred)
y_baseline_accuracy = accuracy_score(y_test, y_baseline_pred)

# Format as a percentage with one decimal. The previous `round(x, 3) * 100` multiplied
# AFTER rounding, so binary floating point could print e.g. 38.800000000000004.
print('Baseline scores:')
print(f"balanced accuracy score (test): {y_baseline_balanced_accuracy_score * 100:.1f} percent")
print(f"accuracy score (test): {y_baseline_accuracy * 100:.1f} percent")

## Model pipeline 
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingRandomSearchCV.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
##### https://www.statsmodels.org/dev/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
##### https://imbalanced-learn.org/stable/references/over_sampling.html

In [None]:
# Group the remaining columns by how they should be preprocessed
# - categorical: object dtype
# - ordinal: every value is one of 1..6
# - boolean: every value is 0 or 1
categorical_features = X_train.select_dtypes(include='object').columns.tolist()
ordinal_features = X_train.columns[X_train.isin([1, 2, 3, 4, 5, 6]).all()].tolist()
boolean_features = X_train.columns[X_train.isin([0, 1]).all()].tolist()

# Everything not claimed by one of the groups above is treated as numeric (original column order kept)
non_numeric = set(boolean_features) | set(categorical_features) | set(ordinal_features)
numeric_features = [col for col in X_train.columns if col not in non_numeric]

print('categorical features:', len(categorical_features), ':', categorical_features)
print('ordinal features:', len(ordinal_features), ':', ordinal_features)
print('numeric features:', len(numeric_features))
print('boolean features:', len(boolean_features))
print('total features:', len(X_train.columns))

In [None]:
# Outlier removal
def IQR_Outliers(X, features, return_index=False):
    """Find rows that are IQR outliers in at least one of the given columns.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column. Quartiles are computed
    with ``np.nanpercentile`` so NaN values are ignored (and never flagged).

    Parameters
    ----------
    X : pandas.DataFrame
        Data to inspect.
    features : iterable of column labels
        Columns of ``X`` to check.
    return_index : bool, default False
        When True, return the sorted, de-duplicated list of index labels that
        are outliers in any checked column. The default keeps the previous
        behaviour of returning ``None``: the result of this call is currently
        used directly as a pipeline step in this notebook, where ``None`` acts
        as a passthrough, and a list there would be rejected by the pipeline.

    Returns
    -------
    list or None
        Sorted outlier index labels when ``return_index`` is True, else None.
    """
    flagged = set()

    for col in features:
        # Using nanpercentile instead of percentile because of nan values
        q1 = np.nanpercentile(X[col], 25.)
        q3 = np.nanpercentile(X[col], 75.)

        cut_off = (q3 - q1) * 1.5
        lower, upper = q1 - cut_off, q3 + cut_off

        # Build the mask once; comparisons against NaN are False, so NaN rows are not flagged
        is_outlier = (X[col] < lower) | (X[col] > upper)
        flagged.update(is_outlier[is_outlier].index.tolist())

    if return_index:
        # The set removed duplicates across columns; sort for a deterministic result
        return sorted(flagged)
    return None

In [None]:
# Specify the transformations steps per category

# Numeric columns: median imputation -> Yeo-Johnson power transform -> standard scaling.
# The 'iqr_outlier' step is an explicit no-op. It used to be
# `IQR_Outliers(X_train, numeric_features)`, which ran the outlier scan eagerly while this
# cell was being defined and then handed its None result to the pipeline, where None means
# "passthrough" — so no outliers were ever removed. 'passthrough' keeps the step name and
# the fitted behaviour identical without the wasted computation.
# NOTE(review): a transformer inside a ColumnTransformer cannot drop rows; if outlier
# removal is really wanted it has to happen before the pipeline (or as a sampler step).
num_transform = Pipeline(steps=[('smpl_imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                                ('iqr_outlier', 'passthrough'),
                                ('power_trans', PowerTransformer(method='yeo-johnson', copy=False)),
                                ('standard_scaler', StandardScaler()),
                               ])

# Categorical columns: most-frequent imputation -> dense one-hot encoding (unknown categories ignored)
cat_transform = Pipeline(steps=[('smpl_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                                ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
                               ])

# Ordinal columns: most-frequent imputation -> ordinal encoding
ord_transform = Pipeline(steps=[('smpl_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                                ('ordinal_encoder', OrdinalEncoder()),
                               ])

# Route each column group to its pipeline; columns not listed are passed through unchanged
Column_Tranform = ColumnTransformer(transformers=[('numeric_trans', num_transform, numeric_features),
                                                  ('categorical_trans', cat_transform, categorical_features),
                                                  ('ordinal_trans', ord_transform, ordinal_features),
                                                  ], remainder='passthrough')

In [None]:
# Random forest whose feature importances drive the recursive feature elimination
feat_model = RandomForestClassifier(
    n_estimators=400,
    class_weight='balanced_subsample',
    random_state=67,
    n_jobs=-1,
)

# RFE wrapper that is embedded as the 'feat' step of the HalvingRandomSearchCV pipelines
Feat_Select_Eval = RFE(feat_model)

In [None]:
# Number of distinct target classes in the training labels
n_classes = y_train.nunique()

# Factor by which HalvingRandomSearchCV grows resources / shrinks candidates each round
halving_parameter = 2.0

# Resource budget for HalvingRandomSearchCV: the minimum is the maximum divided by
# resource_divisor, rounded to the nearest integer
max_resource = 1600
resource_divisor = 2.0
min_resource = int(round(max_resource / resource_divisor))

In [None]:
def _build_search_pipeline(classifier):
    """Return the shared preprocessing -> RFE -> ADASYN chain ending in `classifier`."""
    return Pipeline([
        ('col', Column_Tranform),
        ('feat', Feat_Select_Eval),
        ('smpl', ADASYN(n_neighbors=n_classes, sampling_strategy='not majority', n_jobs=-1, random_state=67)),
        ('clf', classifier),
    ])


def _selected_feature_names(best_estimator):
    """Return the names of the features kept by the fitted 'feat' (RFE) step.

    The RFE support mask is applied to the output names of the fitted 'col'
    (ColumnTransformer) step so the names line up with the classifier's
    feature importances. If those names cannot be produced, fall back to the
    names reported by RFE itself (which may be positional placeholders).
    """
    selector = best_estimator.named_steps['feat']
    try:
        transformed_names = np.asarray(best_estimator.named_steps['col'].get_feature_names_out())
        return transformed_names[selector.support_]
    except (AttributeError, IndexError, TypeError, ValueError):
        return selector.get_feature_names_out()


def random_search():
    """Tune, evaluate and report three tree-ensemble classifiers.

    For RandomForest, ExtraTrees and GradientBoosting in turn, runs a
    HalvingRandomSearchCV (resource = number of estimators, scored by balanced
    accuracy) over a pipeline of column transforms, RFE feature selection,
    ADASYN oversampling and the classifier. Each refitted best estimator is
    scored on the test split; the models are then printed from best to worst
    test balanced accuracy together with their confusion matrix and feature
    importances. Finally the best hyperparameters of the top-ranked model are
    written to 'multiclass_play_classifier_results.pkl'.

    Reads the notebook-level names X_train, X_test, y_train, y_test,
    initial_features, Column_Tranform, Feat_Select_Eval, n_classes,
    halving_parameter, min_resource and max_resource. Returns None.

    NOTE(review): models are ranked on the test split, so the reported test
    score of the winner is an optimistic estimate.
    """
    pips = [
        _build_search_pipeline(RandomForestClassifier()),
        _build_search_pipeline(ExtraTreesClassifier()),
        _build_search_pipeline(GradientBoostingClassifier()),
    ]

    # RandomForestClassifier
    parameters1 = {
    'feat__n_features_to_select': loguniform(0.40, 1.00),
    'feat__step': randint(2, 15),
    'clf__criterion': ['gini'],
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__max_depth': [None],
    'clf__max_samples': loguniform(0.60, 1.0),
    'clf__min_samples_split': randint(10, 70),
    'clf__min_samples_leaf': randint(10, 50),
    'clf__min_impurity_decrease': loguniform(1e-07, 1e-03),
    'clf__min_weight_fraction_leaf':  loguniform(1e-08, 1e-02),
    'clf__ccp_alpha':  loguniform(1e-05, 1e-01),
    'clf__bootstrap': [True],
    'clf__oob_score': [False],
    'clf__warm_start': [False],
    'clf__n_jobs': [6],
    'clf__random_state': [67],
    }

    # ExtraTreesClassifier
    parameters2 = {
    'feat__n_features_to_select': loguniform(0.50, 0.90),
    'feat__step': randint(2, 15),
    'clf__criterion': ['gini'],
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__max_depth': [None],
    'clf__max_leaf_nodes': [None],
    'clf__max_samples': loguniform(0.60, 1.0),
    'clf__min_samples_split': randint(10, 60),
    'clf__min_samples_leaf': randint(10, 60),
    'clf__min_weight_fraction_leaf': loguniform(1e-06, 1e-02),
    'clf__min_impurity_decrease': loguniform(1e-09, 1e-05),
    'clf__ccp_alpha': loguniform(1e-06, 1e-02),
    'clf__bootstrap': [True],
    'clf__oob_score': [False],
    'clf__warm_start': [False],
    'clf__n_jobs': [6],
    'clf__random_state': [67],
    }

    # GradientBoostingClassifier
    parameters3 = {
    'feat__n_features_to_select': loguniform(0.80, 1.00),
    'feat__step': randint(2, 15),
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__learning_rate': loguniform(1e-04, 1e-01),
    'clf__ccp_alpha': loguniform(1e-07, 1e-03),
    'clf__max_depth': randint(5, 20),
    'clf__max_leaf_nodes': randint(10, 80),
    'clf__min_samples_split': randint(40, 120),
    'clf__min_impurity_decrease': loguniform(1e-04, 1e-01),
    'clf__min_samples_leaf': randint(10, 70),
    'clf__n_iter_no_change': [150, 175, 200, None],
    'clf__tol': loguniform(1e-09, 1e-06),
    'clf__validation_fraction': loguniform(0.10, 0.30),
    'clf__warm_start': [False],
    'clf__random_state': [67],
    }

    pars = [parameters1, parameters2, parameters3]

    result = []

    for i, (pipeline, param_space) in enumerate(zip(pips, pars)):

        rs = HalvingRandomSearchCV(pipeline,
                                   param_space,
                                   factor=halving_parameter,
                                   resource='clf__n_estimators',
                                   n_candidates='exhaust',
                                   min_resources=min_resource,
                                   max_resources=max_resource,
                                   scoring='balanced_accuracy',
                                   aggressive_elimination=False,
                                   return_train_score=False,
                                   refit=True,
                                   cv=5,
                                   n_jobs=6,
                                   verbose=1,
                                   random_state=67,
                                  )

        search_started = time()

        # Fit models on training data
        rs = rs.fit(X_train, y_train)

        # Apply the refitted best estimator to the test data
        y_pred = rs.predict(X_test)

        print("Hyperparameter search completed in %.2f minutes" % ((time() - search_started)/ 60))
        print(' ')

        best_estimator = rs.best_estimator_
        selector = best_estimator.named_steps['feat']

        # Store everything the report needs PER MODEL, so the reporting loop below never
        # has to reach back to `rs` / `y_pred`, which only hold the last model fitted.
        result.append({
        'grid': rs,
        'cv results': rs.cv_results_,
        # best_score_ is the mean cross-validated score of the best candidate
        'train score': rs.best_score_,
        'best params': rs.best_params_,
        'best estimator': best_estimator,
        'feature importances': best_estimator.named_steps['clf'].feature_importances_,
        'selected feature count': selector.n_features_,
        'selected features alt': selector.get_feature_names_out(),
        'selected feature names': _selected_feature_names(best_estimator),
        'selected features': selector.support_,
        'test predictions': y_pred,
        'test balanced accuracy score': balanced_accuracy_score(y_test, y_pred),
        'test accuracy score': accuracy_score(y_test, y_pred),
        'test weighted f1 score': f1_score(y_test, y_pred, average='weighted'),
        # No target_names: the report labels rows with the (sorted) class labels themselves.
        # A hand-written name list in a different order would mislabel every class.
        'test classification report': classification_report(y_test, y_pred),
        'cv': rs.cv,
        'model #': i + 1
        })

    # sorting results by best test score
    result = sorted(result, key=operator.itemgetter('test balanced accuracy score'), reverse=True)

    print(' ')

    model_labels = {
        1: 'RandomForest classifier: ',
        2: 'ExtraTrees classifier: ',
        3: 'GradientBoosting classifier: ',
    }

    for element in result:
        print(model_labels.get(element['model #'], 'Other: '))

        print('Parameters:         ' + str(element['best params']))
        print(' ')
        print('Candidate features:', initial_features)
        print('')
        print(str(element['selected feature count']) + ' features selected during evaluation')
        print('Features:  ' + str(element['selected feature names']))
        print(' ')
        print('Train balanced accuracy score: ' + str(element['train score']))
        print('Test balanced accuracy score:  ' + str(element['test balanced accuracy score']))
        print('Test accuracy score:           ' + str(element['test accuracy score']))
        print('Test weighted f1 score:        ' + str(element['test weighted f1 score']))
        print(' ')
        print(str(element['test classification report']))

        # Confusion matrix for THIS model's predictions
        ConfusionMatrixDisplay.from_predictions(y_test, element['test predictions'], normalize='true')
        plt.show()

        # Most significant features for this model: the classifier is trained on the
        # RFE-selected columns, so importances align one-to-one with the selected names.
        importance_table = pd.DataFrame({
            'feature': np.asarray(element['selected feature names']),
            'index': np.asarray(element['selected features alt']),
            'importance': element['feature importances'],
        }).sort_values(by='importance', ascending=False)

        print('\n',"Cumulative Importance =", importance_table['importance'].sum())

        importance_table['cum_sum'] = importance_table['importance'].cumsum()
        print(importance_table.head(40))
        print(' ')
        print(' ')

    # Persist the best hyperparameters of the top-ranked model (highest test balanced
    # accuracy); previously the parameters of whichever model was fitted last were saved.
    joblib.dump(result[0]['best params'], 'multiclass_play_classifier_results.pkl', compress = 1)

In [None]:
# Define start time of this stage in the process
# (wall-clock seconds from time.time(); used after the search to report elapsed hours)
start = time_calc.time()

In [None]:
%%capture --no-stdout --no-display
# Run the full hyperparameter search and report; the cell magic above must stay on the first line of the cell
random_search()

In [None]:
# Record the end time and report the wall-clock duration of the tuning stage in hours
end = time_calc.time()
elapsed_hours = round((end - start)/3600, 2)
print(elapsed_hours, 'hours to complete hyperparameter tuning process')