In [1]:
import warnings
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
warnings.simplefilter('ignore', category=UserWarning)
warnings.filterwarnings('ignore',category=ImportWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

from scipy.stats import randint, loguniform
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from imblearn.over_sampling import ADASYN

from sklearn.impute import SimpleImputer
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

In [2]:
# Train test split parameters
test_holdout_percentage = 0.30

# Leave One Out Encoder Sigma value - 0.04 is the top performing value.
sigma = 0.04                                                        # Tested parameters: 0.04, 0.05, 0.10, 0.30, 0.60

# Feature scaler
feature_scaler = StandardScaler()                                    # Tested parameters: MinMaxScaler(), StandardScaler(), MaxAbsScaler(), RobustScaler()

# HalvingRandomSearchCV parameters
scoring = 'neg_mean_squared_error'
n_cross_validation = 3

# Specify the HalvingRandomSearchCV parameters
halving_parameter = 2.0
max_resource = 500
resource_divisor = 2.0
min_resource = int(round((max_resource / resource_divisor), 0))


In [3]:
# Create timer to calculate total workbook time in hours
start_time = time.time()

## IMPORT PROCESSED NFL-DATA-PY CSV FILE
##### https://pypi.org/project/nfl-data-py/

In [4]:
# Import csv file from nfl-data-py
df = pd.read_csv(r'/Users/ttas2/Documents/Python/nfl-machine-learning-models/output_files/nfl_post_processing_run_pass_classification_data.csv')

df.shape

(60331, 160)

In [5]:
# Print columns with missing values
print(df.columns[df.isnull().any()].tolist())

[]


In [6]:
# Convert binary columns to integers
binary_columns = df.columns[df.isin([0,1]).all()].tolist()
df[binary_columns] = df[binary_columns].apply(pd.to_numeric, downcast='integer', errors='coerce', axis=1)

df.sample(2)

Unnamed: 0,week,posteam,posteam_type,defteam,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,qtr,...,remain_yds_div_def_dl_count,remain_yds_prod_def_dl_count,remain_yds_div_def_db_count,remain_yds_prod_def_db_count,remain_yds_div_score_diff,remain_yds_prod_score_diff,run_ratio_off_priors,run_ratio_def_priors,posteam_season,defteam_season
19450,14,DET,home,MIN,56.0,481,481,481,2,4,...,3.0,3.0,0.6,15.0,0.25,36.0,0.194444,0.166667,det_2022,min_2022
935,16,ARI,home,TB,91.0,659,1559,1559,2,3,...,3.166667,12.666667,1.266667,31.666667,0.0,0.0,0.585714,0.528571,ari_2022,tb_2022


In [7]:
# Target frequency
target_count = df.play_type.value_counts(normalize=True)

target_count

play_type
pass    0.598382
run     0.401618
Name: proportion, dtype: float64

In [8]:
df['play_type'] = np.where(df['play_type'] == 'pass', 1, 0)


## TRAIN TEST SPLIT
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [9]:
# split data into target and feature datasets
X, y = df.loc[:, df.columns != 'play_type'], df['play_type']

initial_features = X.columns.to_list()

# Create train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_holdout_percentage, random_state=67)

# Specify number of target classes
n_classes = round(y_train.nunique() * adasyn_class_multiplier, 0)

# Used to balancing the effect of XGBClassifier weights on imbalanced dataset: scale_pos_weight = total_majority observations / total minority observations
target_count = y_train.value_counts()
scale_pos_weight = round(target_count[1] / target_count[0], 2)

print('XGBClassifier scale_pos_weight:', scale_pos_weight)

159 initial features before processing


## BASELINE MODEL
##### https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html

In [10]:
# Create and fit baseline model to compare performance
baseline_model = DummyRegressor(strategy='mean', )
baseline_model.fit(X_train, y_train)

# Calculate model accuracy on test data
y_baseline_pred = baseline_model.predict(X_test)

print(f"Baseline accuracy: {round(mean_squared_error(y_test, y_baseline_pred)*100, 1)}%")


Baseline accuracy: 59.8%
Baseline f1 score: 74.9%


## MODEL PIPELINE
##### https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
##### https://www.statsmodels.org/dev/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html
##### https://imbalanced-learn.org/stable/references/over_sampling.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingRandomSearchCV.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
##### https://xgboost.readthedocs.io/en/stable/parameter.html

In [11]:
# Create feature type lists for column transform stage of the pipeline
ordinal_features = X_train.columns[X_train.isin([1,2,3,4,5]).all()].tolist()
categorical_features = list(X_train.select_dtypes(include='object'))
boolean_features = X_train.columns[X_train.isin([0, 1]).all()].tolist()

# define numeric features as remaining features not in ordinal categorical or boolean lists
numeric_features = list(set(X_train.columns) - set(ordinal_features) - set(categorical_features) - set(boolean_features))

#print('categorical features:', len(categorical_features), ':', categorical_features)
print('ordinal features:', len(ordinal_features), ':', ordinal_features)
print(' ')
print('boolean features:', len(boolean_features), ':', boolean_features)
print(' ')
print('numeric features:', len(numeric_features), ':', numeric_features)
print(' ')
print('categorical features:', len(categorical_features), ':', categorical_features)
print(' ')
print('feature count:', len(initial_features))

ordinal features: 4 : ['game_half', 'qtr', 'down', 'remaining_downs']
 
 
numeric features: 73 : ['remain_yds_prod_def_dl_count', 'off_rb_count', 'td_prob', 'run_ratio_off_priors', 'hb_to_lb_ratio', 'remaining_yards_per_down', 'off_te_count', 'no_score_prob', 'drive_no_huddle_pcnt', 'wp', 'half_seconds_remaining', 'half_seconds_div_score_diff', 'wr_to_db_ratio', 'fg_prob', 'score_differential', 'game_temp_div_game_humidity', 'game_wind_prod_game_temp', 'drive_play_count', 'posteam_timeouts_remaining', 'ydstogo', 'game_temp_prod_game_humidity', 'ep', 'drive_incomplete_pass_pcnt', 'run_ratio_def_priors', 'prev1_yards_gained', 'safety_prob', 'remain_yds_div_off_hb_count', 'def_db_count', 'quarter_seconds_remaining', 'off_hb_count', 'prev4_wpa', 'defteam_score', 'week', 'ep_half_sec_ratio', 'remain_yds_div_def_db_count', 'drive_qb_hit_pcnt', 'game_wind_div_game_temp', 'remain_yds_prod_def_db_count', 'prev3_yards_gained', 'play_sequence_game', 'prev4_yards_gained', 'play_sequence_series', '

In [12]:
# Custom transformer for IQR outlier exclusion
class IQRTransformer:
    def __init__(self, numerical_cols):
        self.numerical_cols = numerical_cols
        self.lower_bound = None
        self.upper_bound = None

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            # Calculate the IQR for each numerical column
            q1 = X[self.numerical_cols].quantile(0.25)
            q3 = X[self.numerical_cols].quantile(0.75)
            iqr = q3 - q1

            # Define the lower and upper bounds for outliers
            self.lower_bound = (q1 - 1.5 * iqr).to_dict()
            self.upper_bound = (q3 + 1.5 * iqr).to_dict()
        else:
            # Calculate the IQR for each numerical column
            q1 = np.quantile(X[:, :], 0.25, axis=0)
            q3 = np.quantile(X[:, :], 0.75, axis=0)
            iqr = q3 - q1

            # Define the lower and upper bounds for outliers
            self.lower_bound = (q1 - 1.5 * iqr).tolist()
            self.upper_bound = (q3 + 1.5 * iqr).tolist()

        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            # Exclude outliers based on the IQR for each numerical column
            x_outlier_removed = X.copy()
            for col in self.numerical_cols:
                if col in self.lower_bound and col in self.upper_bound:
                    x_outlier_removed = x_outlier_removed[
                        (x_outlier_removed[col] >= self.lower_bound[col]) & (x_outlier_removed[col] <= self.upper_bound[col])
                    ].dropna()
        else:
            # Exclude outliers based on the IQR for each numerical column
            x_outlier_removed = X.copy()
            for i, col in enumerate(self.numerical_cols):
                if col in self.lower_bound and col in self.upper_bound:
                    lower_bound = self.lower_bound[col]
                    upper_bound = self.upper_bound[col]
                    x_outlier_removed = x_outlier_removed[
                        (x_outlier_removed[:, i] >= lower_bound) & (x_outlier_removed[:, i] <= upper_bound)
                    ].dropna()

        return x_outlier_removed

In [None]:
# Specify the transformations per data type
num_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                            ('iqr_outlier', IQRTransformer(numerical_cols=numeric_features)),
                            ('scaler', feature_scaler),
                           ])

cat_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                            ('cat_encoder', LeaveOneOutEncoder(handle_missing='value', handle_unknown='value', sigma=sigma, random_state=67)),
                            ])

ord_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                            ('ordinal_encoder', LeaveOneOutEncoder(handle_missing='value', handle_unknown='value', sigma=sigma, random_state=67)),
                           ])

preprocessing = ColumnTransformer(transformers=[('numeric_transform', num_trans, numeric_features),
                                                ('categorical_transform', cat_trans, categorical_features),
                                                ('ordinal_transform', ord_trans, ordinal_features),
                                                ],
                                     remainder='passthrough',
                                    )

# Define the models
models = [
    ('RandomForest', RandomForestRegressor()),
    ('ExtraTrees', ExtraTreesRegressor()),
    ('GradientBoosting', GradientBoostingRegressor()),
    ('AdaBoost', AdaBoostRegressor()),
    ('XGBoost', XGBRegressor()),
]

# Create and run the pipeline
for model_name, model in models:
    pipeline = Pipeline([
        ('pre', preprocessing),
        ('select', SelectKBest()),
        ('smpl', ADASYN(sampling_strategy='not majority', random_state=67)),
        ('clf', model)
    ])
    
    params = {}
    
    if model_name == 'RandomForest':
        params = {
            'select__k': randint(10, 80),
            'smpl__n_neighbors': randint(2, 15),                       # Only for sampling_strategy='not majority
            'clf__bootstrap': [True],
            'clf__ccp_alpha': loguniform(1e-06, 1e-01),   # cost-complexity pruning is an algorithm used to prune a tree to avoid over-fitting
            'clf__criterion': ['gini','entropy'],
            'clf__max_depth': randint(5, 30),
            'clf__max_features': loguniform(0.10, 0.35), 
            'clf__min_impurity_decrease': loguniform(1e-09, 1e-04),
            #'clf__max_samples': loguniform(0.02, 0.49),              # Only for bootstrap=True
            'clf__min_samples_leaf': loguniform(0.005, 0.20),
            'clf__min_samples_split': loguniform(0.005, 0.20),
            'clf__min_weight_fraction_leaf': loguniform(0.005, 0.20),
            'clf__oob_score': [True, False],                                # Only for bootstrap=True
            'clf__warm_start': [True, False],
            'clf__n_jobs': [6],
            'clf__random_state': [67],
        }
    
    elif model_name == 'ExtraTrees':
        params = {
            'select__k': randint(10, 80),
            'smpl__n_neighbors': randint(2, 15),                       # Only for sampling_strategy='not majority
            'clf__bootstrap': [False],
            'clf__ccp_alpha': loguniform(1e-06, 1e-01),               # cost-complexity pruning is an algorithm used to prune a tree to avoid over-fitting
            'clf__criterion': ['gini','entropy'],
            'clf__max_depth': randint(5, 80),
            'clf__max_features': loguniform(0.50, 0.95),
            'clf__max_leaf_nodes': randint(20, 70),
            #'clf__max_samples': loguniform(0.10, 0.50),               # Only for bootstrap=True
            'clf__min_impurity_decrease': loguniform(1e-05, 1e-01),
            'clf__min_samples_leaf': loguniform(0.05, 0.30),
            'clf__min_samples_split': loguniform(0.005, 0.15),
            'clf__min_weight_fraction_leaf': loguniform(0.05, 0.25),
            'clf__oob_score': [False],                                # Only for bootstrap=True
            'clf__warm_start': [True, False],
            'clf__n_jobs': [6],
            'clf__random_state': [67],
        }
    
    elif model_name == 'GradientBoosting':
        params = {
            'select__k': randint(10, 80),
            'smpl__n_neighbors': randint(2, 15),                       # Only for sampling_strategy='not majority
            'clf__criterion': ['friedman_mse'],
            'clf__ccp_alpha': loguniform(1e-06, 1e-01),  # cost-complexity pruning is an algorithm used to prune a tree to avoid over-fitting
            'clf__learning_rate': loguniform(1e-05, 1e-00),
            'clf__loss': ['log_loss','exponential'],
            'clf__max_depth': randint(25, 60),
            'clf__max_features': loguniform(0.45, 0.85), 
            'clf__max_leaf_nodes': randint(20, 50),
            'clf__min_weight_fraction_leaf': loguniform(0.30, 0.50),   # Must be <= 0.5
            'clf__min_impurity_decrease': loguniform(1e-08, 1e-04),
            'clf__min_samples_leaf': loguniform(0.01, 0.25),
            'clf__min_samples_split': loguniform(0.10, 0.35),
            'clf__n_iter_no_change': [200],
            'clf__tol': loguniform(1e-08, 1e-03),
            'clf__validation_fraction': loguniform(0.05, 0.15),
            'clf__warm_start': [True, False],
            'clf__subsample': loguniform(0.65, 1.0),
            'clf__random_state': [67],
        }
        
    elif model_name == 'AdaBoost':
        params = {
            'select__k': randint(10, 80),
            'smpl__n_neighbors': randint(2, 15),                       # Only for sampling_strategy='not majority
            'clf__algorithm': ['SAMME','SAMME.R'],
            'clf__learning_rate': loguniform(1e-08, 1e-01),
            'clf__random_state': [67],
        }
    
    elif model_name == 'XGBoost':
        params = {
            'select__k': randint(10, 80),
            'smpl__n_neighbors': randint(2, 15),                       # Only for sampling_strategy='not majority
            'clf__booster': ['gbtree','dart'],
            'clf__max_depth': randint(4, 8),
            'clf__grow_policy': ['depthwise','lossguide'],
            'clf__objective': ['reg:squarederror'],
            'clf__eval_metric': ['mape'],
            'clf__seed': [67],
        }
    
    search = HalvingRandomSearchCV(
        estimator=pipeline,
        param_distributions=params,
        scoring=scoring,
        factor=halving_parameter,
        resource='clf__n_estimators',
        n_candidates='exhaust',
        min_resources=min_resource,
        max_resources=max_resource,
        aggressive_elimination=False,
        return_train_score=True,
        refit=True,
        cv=n_cross_validation,
        n_jobs=6,
        error_score='raise',
        random_state=67,
        verbose=1,
    )
    
    search.fit(X_train, y_train)
    
    print(f"Best performance for {model_name}: {-search.best_score_}")     # Note the negative sign for mean squared error
    print(f"Best parameters: {search.best_params_}")

    print("\n")


n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 250
max_resources_: 500
aggressive_elimination: False
factor: 2.0
----------
iter: 0
n_candidates: 2
n_resources: 250
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [None]:
# Calculate workbook processing time in hours
end_time = time.time()
total_time = end_time - start_time

print('Total HalvingRandomSearchCV runtime:', round(total_time / 3600, 2), 'hours')