In [1]:
import warnings
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
warnings.simplefilter('ignore', category=UserWarning)
warnings.filterwarnings('ignore',category=ImportWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

from scipy.stats import randint, loguniform
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN

from sklearn.impute import SimpleImputer
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier


In [2]:
# Train test split parameters
test_holdout_percentage = 0.25

# Leave One Out Encoder Sigma value - 0.04 is the top performing value.
sigma = 0.05                                                        # Tested parameters: 0.04, 0.05, 0.10, 0.30, 0.60

# Feature scaler
feature_scaler = StandardScaler()                                   # Tested parameters: MinMaxScaler(), StandardScaler(), MaxAbsScaler(), RobustScaler()

# HalvingRandomSearchCV parameters
scoring = 'f1_weighted'
n_cross_validation = 3

# Specify the HalvingRandomSearchCV parameters
halving_parameter = 2.0
max_resource = 1000
resource_divisor = 2.0
min_resource = int(round((max_resource / resource_divisor), 0))


In [3]:
# Create timer to calculate total workbook time in hours
start_time = time.time()

## IMPORT PROCESSED NFL-DATA-PY CSV FILE
##### https://pypi.org/project/nfl-data-py/

In [29]:
# Import csv file from nfl-data-py
df = pd.read_csv(r'/Users/ttas2/Documents/Python/nfl-machine-learning-models/output_files/nfl_post_processing_multiclass_play_classification_data.csv')

df.shape

(58083, 156)

In [30]:
# Print columns with missing values
print(df.columns[df.isnull().any()].tolist())

[]


In [31]:
# Convert binary columns to integers
binary_columns = df.columns[df.isin([0,1]).all()].tolist()
df[binary_columns] = df[binary_columns].apply(pd.to_numeric, downcast='integer', errors='coerce', axis=1)

df.sample(2)

Unnamed: 0,week,posteam,posteam_type,defteam,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,qtr,...,remain_yds_prod_def_dl_count,remain_yds_div_def_db_count,remain_yds_prod_def_db_count,remain_yds_div_score_diff,remain_yds_prod_score_diff,run_ratio_off_priors,run_ratio_def_priors,posteam_season,defteam_season,play_type
1237,4,ARI,away,SF,75.0,862,862,862,2,4,...,13.333333,0.833333,13.333333,8.030303,1.383648,0.424928,0.338324,ari_2023,sf_2023,outside
28838,12,KC,away,LV,36.0,600,600,2400,1,2,...,13.333333,0.666667,16.666667,8.412698,1.320755,0.356758,0.455181,kc_2023,lv_2023,short


In [32]:
# Target frequency
target_count = df.play_type.value_counts(normalize=True)

target_count

play_type
short      0.409070
inside     0.217534
outside    0.202125
deep       0.171272
Name: proportion, dtype: float64

## TRAIN TEST SPLIT
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [33]:
# split data into target and feature datasets
X, y = df.loc[:, df.columns != 'play_type'], df['play_type']

initial_features = X.columns.to_list()

# Print target labels
y.head(3)

0    outside
1     inside
2      short
Name: play_type, dtype: object

In [34]:
# Label encoding
le = LabelEncoder()

# Convert target labels to integers
y = le.fit_transform(y)


In [35]:

# Create train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_holdout_percentage, random_state=67)


## BASELINE MODEL
##### https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html

In [36]:
# Create and fit baseline model to compare performance
baseline_model = DummyClassifier(strategy='most_frequent', random_state=67)
baseline_model.fit(X_train, y_train)

# Calculate model accuracy on test data
y_baseline_pred = baseline_model.predict(X_test)

print(f"Baseline f1 score: {round(f1_score(y_test,y_baseline_pred, average='weighted')*100, 1)}%")


Baseline f1 score: 24.1%


## MODEL PIPELINE
##### https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
##### https://www.statsmodels.org/dev/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html
##### https://imbalanced-learn.org/stable/references/over_sampling.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingRandomSearchCV.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
##### https://xgboost.readthedocs.io/en/stable/parameter.html

In [37]:
# Create feature type lists for column transform stage of the pipeline
ordinal_features = X_train.columns[X_train.isin([1,2,3,4,5]).all()].tolist()
categorical_features = list(X_train.select_dtypes(include='object'))
boolean_features = X_train.columns[X_train.isin([0, 1]).all()].tolist()

# define numeric features as remaining features not in ordinal categorical or boolean lists
numeric_features = list(set(X_train.columns) - set(ordinal_features) - set(categorical_features) - set(boolean_features))

#print('categorical features:', len(categorical_features), ':', categorical_features)
print('ordinal features:', len(ordinal_features), ':', ordinal_features)
print(' ')
print('boolean features:', len(boolean_features), ':', boolean_features)
print(' ')
print('numeric features:', len(numeric_features), ':', numeric_features)
print(' ')
print('categorical features:', len(categorical_features), ':', categorical_features)
print(' ')
print('feature count:', len(initial_features))

ordinal features: 4 : ['game_half', 'qtr', 'down', 'remaining_downs']
 
 
numeric features: 69 : ['quarter_seconds_remaining', 'remain_yds_prod_def_dl_count', 'off_wr_count', 'off_te_count', 'prev3_yards_gained', 'remain_yds_prod_score_diff', 'n_offense', 'off_rb_count', 'off_ol_count', 'ol_to_dl_ratio', 'no_score_prob', 'defteam_timeouts_remaining', 'hb_to_lb_ratio', 'n_defense', 'total_line', 'prev3_wpa', 'game_humidity', 'remain_yds_div_def_db_count', 'game_wind_prod_game_temp', 'score_differential', 'game_temp', 'game_seconds_remaining', 'safety_prob', 'drive_play_count', 'score_differential_norm', 'remaining_yards_per_down', 'ep_game_sec_ratio', 'run_ratio_def_priors', 'yardline_100', 'posteam_score', 'prev2_yards_gained', 'wp', 'game_temp_prod_game_humidity', 'run_ratio_off_priors', 'remain_yds_div_def_dl_count', 'play_sequence_game', 'ydstogo', 'remain_yds_prod_off_hb_count', 'spread_line', 'game_temp_div_game_humidity', 'remain_yds_prod_def_db_count', 'remain_yds_div_def_box', 

In [38]:
# Custom transformer for IQR outlier exclusion
class IQRTransformer:
    def __init__(self, numerical_cols):
        self.numerical_cols = numerical_cols
        self.lower_bound = None
        self.upper_bound = None

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            # Calculate the IQR for each numerical column
            q1 = X[self.numerical_cols].quantile(0.25)
            q3 = X[self.numerical_cols].quantile(0.75)
            iqr = q3 - q1

            # Define the lower and upper bounds for outliers
            self.lower_bound = (q1 - 1.5 * iqr).to_dict()
            self.upper_bound = (q3 + 1.5 * iqr).to_dict()
        else:
            # Calculate the IQR for each numerical column
            q1 = np.quantile(X[:, :], 0.25, axis=0)
            q3 = np.quantile(X[:, :], 0.75, axis=0)
            iqr = q3 - q1

            # Define the lower and upper bounds for outliers
            self.lower_bound = (q1 - 1.5 * iqr).tolist()
            self.upper_bound = (q3 + 1.5 * iqr).tolist()

        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            # Exclude outliers based on the IQR for each numerical column
            x_outlier_removed = X.copy()
            for col in self.numerical_cols:
                if col in self.lower_bound and col in self.upper_bound:
                    x_outlier_removed = x_outlier_removed[
                        (x_outlier_removed[col] >= self.lower_bound[col]) & (x_outlier_removed[col] <= self.upper_bound[col])
                    ].dropna()
        else:
            # Exclude outliers based on the IQR for each numerical column
            x_outlier_removed = X.copy()
            for i, col in enumerate(self.numerical_cols):
                if col in self.lower_bound and col in self.upper_bound:
                    lower_bound = self.lower_bound[col]
                    upper_bound = self.upper_bound[col]
                    x_outlier_removed = x_outlier_removed[
                        (x_outlier_removed[:, i] >= lower_bound) & (x_outlier_removed[:, i] <= upper_bound)
                    ].dropna()

        return x_outlier_removed

In [39]:
# Specify the transformations per data type
num_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                            ('iqr_outlier', IQRTransformer(numerical_cols=numeric_features)),
                            ('scaler', feature_scaler),
                           ])

cat_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                            ('cat_encoder', LeaveOneOutEncoder(handle_missing='value', handle_unknown='value', sigma=sigma, random_state=67)),
                            ])

ord_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                            ('ordinal_encoder', LeaveOneOutEncoder(handle_missing='value', handle_unknown='value', sigma=sigma, random_state=67)),
                           ])

preprocessing = ColumnTransformer(transformers=[('numeric_transform', num_trans, numeric_features),
                                                ('categorical_transform', cat_trans, categorical_features),
                                                ('ordinal_transform', ord_trans, ordinal_features),
                                                ],
                                     remainder='passthrough',
                                    )

# Define the models
models = [
    ('RandomForest', RandomForestClassifier()),
    ('ExtraTrees', ExtraTreesClassifier()),
    #('GradientBoosting', GradientBoostingClassifier()),
    #('AdaBoost', AdaBoostClassifier()),
    #('XGBoost', XGBClassifier()),
]

# Create and run the pipeline
for model_name, model in models:
    pipeline = Pipeline([
        ('pre', preprocessing),
        ('select', SelectKBest()),
        ('smpl', ADASYN(sampling_strategy='not majority', random_state=67)),
        ('clf', model)
    ])
    
    params = {}
    
    if model_name == 'RandomForest':
        params = {
            'select__k': randint(20, 60),
            'smpl__n_neighbors': randint(4, 8),                             # Only for sampling_strategy='not majority'
            'clf__bootstrap': [True],
            'clf__ccp_alpha': loguniform(1e-06, 1e-01),                     # cost-complexity pruning is an algorithm used to prune a tree to avoid over-fitting
            'clf__criterion': ['gini','entropy'],
            'clf__max_depth': randint(5, 30),
            'clf__max_features': loguniform(0.10, 0.35), 
            'clf__min_impurity_decrease': loguniform(1e-09, 1e-04),
            'clf__max_samples': loguniform(0.02, 0.49),                     # Only for bootstrap=True
            'clf__min_samples_leaf': loguniform(0.005, 0.20),
            'clf__min_samples_split': loguniform(0.005, 0.20),
            'clf__min_weight_fraction_leaf': loguniform(0.005, 0.20),
            'clf__oob_score': [True, False],                                # Only for bootstrap=True
            'clf__warm_start': [True, False],
            'clf__n_jobs': [6],
            'clf__random_state': [67],
        }
    
    elif model_name == 'Extra Trees':
        params = {
            'select__k': randint(20, 60),
            'smpl__n_neighbors': randint(4, 8),                       # Only for sampling_strategy='not majority
            'clf__bootstrap': [True, False],
            'clf__ccp_alpha': loguniform(1e-06, 1e-01),               # cost-complexity pruning is an algorithm used to prune a tree to avoid over-fitting
            'clf__criterion': ['gini','entropy'],
            'clf__max_depth': randint(5, 80),
            'clf__max_features': loguniform(0.50, 0.95),
            'clf__max_leaf_nodes': randint(20, 70),
            #'clf__max_samples': loguniform(0.10, 0.50),               # Only for bootstrap=True
            'clf__min_impurity_decrease': loguniform(1e-05, 1e-01),
            'clf__min_samples_leaf': loguniform(0.05, 0.30),
            'clf__min_samples_split': loguniform(0.005, 0.15),
            'clf__min_weight_fraction_leaf': loguniform(0.05, 0.25),
            'clf__oob_score': [False],                                # Only for bootstrap=True
            'clf__warm_start': [True, False],
            'clf__n_jobs': [6],
            'clf__random_state': [67],
        }
    
    elif model_name == 'GradientBoosting':
        params = {
            'select__k': randint(20, 60),
            'smpl__n_neighbors': randint(4, 8),                       # Only for sampling_strategy='not majority
            'clf__criterion': ['friedman_mse'],
            'clf__ccp_alpha': loguniform(1e-06, 1e-01),  # cost-complexity pruning is an algorithm used to prune a tree to avoid over-fitting
            'clf__learning_rate': loguniform(1e-05, 1e-00),
            #'clf__loss': ['log_loss'],                                 # Not available for multiclass    
            'clf__max_depth': randint(25, 60),
            'clf__max_features': loguniform(0.45, 0.85), 
            'clf__max_leaf_nodes': randint(20, 50),
            'clf__min_weight_fraction_leaf': loguniform(0.30, 0.50),   # Must be <= 0.5
            'clf__min_impurity_decrease': loguniform(1e-08, 1e-04),
            'clf__min_samples_leaf': loguniform(0.01, 0.25),
            'clf__min_samples_split': loguniform(0.10, 0.35),
            'clf__n_iter_no_change': [200],
            'clf__tol': loguniform(1e-08, 1e-03),
            'clf__validation_fraction': loguniform(0.05, 0.15),
            'clf__warm_start': [True, False],
            'clf__subsample': loguniform(0.65, 1.0),
            'clf__random_state': [67],
        }
        
    elif model_name == 'AdaBoost':
        params = {
            'select__k': randint(20, 60),
            'smpl__n_neighbors': randint(4, 8),                       # Only for sampling_strategy='not majority
            'clf__algorithm': ['SAMME','SAMME.R'],
            'clf__learning_rate': loguniform(1e-08, 1e-01),
            'clf__random_state': [67],
        }
    
    elif model_name == 'XGBoost':
        params = {
            'select__k': randint(20, 60),
            'smpl__n_neighbors': randint(4, 8),                       # Only for sampling_strategy='not majority
            'clf__booster': ['gbtree','dart'],
            'clf__max_depth': [6],
            'clf__grow_policy': ['depthwise','lossguide'],
            'clf__objective': ['multi:softprob'],
            'clf__eval_metric': ['auc'],
            'clf__seed': [67],
        }

    search = HalvingRandomSearchCV(
        estimator=pipeline,
        param_distributions=params,
        scoring=scoring,
        factor=halving_parameter,
        resource='clf__n_estimators',
        n_candidates='exhaust',
        min_resources=min_resource,
        max_resources=max_resource,
        aggressive_elimination=False,
        return_train_score=True,
        refit=True,
        cv=n_cross_validation,
        n_jobs=6,
        error_score='raise',
        random_state=67,
        verbose=0,
    )
    
    search.fit(X_train, y_train)
    
    # Print results
    print(f"Best performance for {model_name}: {search.best_score_}")
    print(f"Best parameters: {search.best_params_}")

    print("\n")


Best performance for RandomForest: 0.354247353552716
Best parameters: {'clf__bootstrap': True, 'clf__ccp_alpha': 0.0005361140014825931, 'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__max_features': 0.23614145682895635, 'clf__max_samples': 0.05776517684749231, 'clf__min_impurity_decrease': 1.995214754838472e-09, 'clf__min_samples_leaf': 0.020787879125554312, 'clf__min_samples_split': 0.010976120471026344, 'clf__min_weight_fraction_leaf': 0.1559199188506055, 'clf__n_jobs': 6, 'clf__oob_score': False, 'clf__random_state': 67, 'clf__warm_start': True, 'select__k': 56, 'smpl__n_neighbors': 7, 'clf__n_estimators': 1000}

Best performance for ExtraTrees: 0.3708411941095795
Best parameters: {'clf__n_estimators': 500}



In [40]:
# Calculate workbook processing time in hours
end_time = time.time()
total_time = end_time - start_time

print('Total HalvingRandomSearchCV runtime:', round(total_time / 3600, 2), 'hours')

Total HalvingRandomSearchCV runtime: 0.16 hours
