In [1]:
import pandas as pd
import numpy as np                 # maintain version 1.22.0
import time as time_calc
from time import time

from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.utils.fixes import loguniform
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from scipy.stats import randint
import joblib
import operator

## Import data from nfl-data-py
##### https://pypi.org/project/nfl-data-py/

In [2]:
# Read the post-processed NFL draft dataset
data = pd.read_csv('nfl_post_processing_draft_data.csv')

# Work on an independent copy so that later modifications of `df`
# leave the raw frame `data` untouched
df = data.copy()

df.shape

(4486, 29)

In [3]:
# Share of possible games played, assuming 16 games per season.
# The ratio is NaN when it cannot be computed (e.g. 0 games / 0 seasons); treat that as 0.
# Assigning the filled result directly avoids a chained in-place `fillna` on a column,
# which pandas deprecates and which does not update `df` under Copy-on-Write.
df['games_pcnt'] = (df['games'] / (df['nfl_years'] * 16)).fillna(0)

In [4]:
# Columns that are not used as model inputs (identifiers, career outcomes,
# and the raw fields already folded into games_pcnt)
columns_to_drop = [
    'pfr_player_id', 'player_name', 'category', 'position', 'side', 'college', 'hof', 'allpro',
    'pro_bowls', 'seasons_started', 'combine', 'comb_name', 'comb_school', 'nfl_years', 'games',
    'draft', 'round',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Convert binary columns (every value is 0 or 1) to the smallest integer dtype
binary_columns = df.columns[df.isin([0, 1]).all()].tolist()

# Downcast column by column: `apply` without axis=1 hands each column to pd.to_numeric,
# instead of invoking it once per row. Skip the assignment when nothing qualifies.
if binary_columns:
    df[binary_columns] = df[binary_columns].apply(pd.to_numeric, downcast='integer', errors='coerce')

df.sample(2).style

Unnamed: 0,pick,draft_team,age,comb_pos,comb_ht,comb_wt,comb_forty,comb_bench,comb_vert,comb_broad,comb_cone,comb_shut,games_pcnt
1749,22,MIN,21.0,WR,61.0,192.0,4.39,19.0,36.0,122.0,6.94,4.2,0.585938
3397,199,CIN,22.0,WR,75.0,205.0,4.47,14.0,31.5,119.0,6.94,4.2,0.796875


## Statistical distribution

In [6]:
# Summary statistics, one row per numeric column
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pick,4486.0,112.541239,70.377165,1.0,52.0,106.0,168.0,260.0
age,4486.0,22.452073,0.914298,20.0,22.0,22.0,23.0,29.0
comb_ht,4486.0,72.177441,6.780501,0.0,72.0,74.0,76.0,81.0
comb_wt,4486.0,245.440259,45.349512,155.0,207.0,237.0,288.0,375.0
comb_forty,4486.0,4.745265,0.297886,4.22,4.51,4.66,4.93,5.99
comb_bench,4486.0,21.077686,5.915244,2.0,16.0,21.0,25.0,49.0
comb_vert,4486.0,33.336781,3.900197,19.5,31.0,33.5,36.0,46.0
comb_broad,4486.0,115.39679,8.7705,82.0,110.0,117.0,122.0,147.0
comb_cone,4486.0,7.223107,0.364993,6.44,6.94,7.1,7.445,9.0
comb_shut,4486.0,4.357882,0.233378,3.73,4.19,4.3,4.49,5.38


## Train test split data
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [7]:
# Separate the target (games_pcnt) from the feature columns
y = df['games_pcnt']
X = df.drop(columns='games_pcnt')

# Names of every candidate feature, kept for the reporting step
initial_features = X.columns.to_list()

# Hold out 10% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=67)

print(X_train.shape[1], 'features before criteria evaluation')

12 features before criteria evaluation


## Baseline model
##### https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html

In [8]:
# Baseline that always predicts the training-set mean, used as a reference point
baseline_model = DummyRegressor(strategy='mean')
baseline_model.fit(X_train, y_train)

# Score the baseline on the held-out test data
y_baseline_pred = baseline_model.predict(X_test)
baseline_mse = mean_squared_error(y_test, y_baseline_pred)
baseline_r2 = r2_score(y_test, y_baseline_pred)
print(f"Baseline mean squared error: {round(baseline_mse, 10)}")
print(f"Baseline R-squared: {round(baseline_r2, 10)}")

Baseline mean squared error: 0.0851743661
Baseline R-squared: -0.0022207881


## Model pipeline 
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingRandomSearchCV.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
##### https://www.statsmodels.org/dev/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
##### https://imbalanced-learn.org/stable/references/over_sampling.html

In [9]:
# Group the training columns by how they will be pre-processed
categorical_features = X_train.select_dtypes(include='object').columns.tolist()  # string-typed columns
ordinal_features = X_train.columns[X_train.isin([1, 2, 3, 4, 5, 6]).all()].tolist()  # only values 1-6
boolean_features = X_train.columns[X_train.isin([0, 1]).all()].tolist()  # only values 0/1

# Everything not claimed by one of the groups above is treated as numeric
non_numeric = set(boolean_features) | set(categorical_features) | set(ordinal_features)
numeric_features = [col for col in X_train.columns if col not in non_numeric]

for label, group in (('categorical features:', categorical_features),
                     ('ordinal features:', ordinal_features),
                     ('numeric features:', numeric_features),
                     ('boolean features:', boolean_features),
                     ('total features:', X_train.columns)):
    print(label, len(group))

categorical features: 2
ordinal features: 0
numeric features: 10
boolean features: 0
total features: 12


In [10]:
# Outlier detection
def IQR_Outliers(X, features):
    """Locate rows whose values fall outside a percentile-based fence.

    For every column named in ``features`` the 10th and 90th percentiles are
    computed with NaNs ignored. A value is flagged when it lies more than
    1.5 x (P90 - P10) below the 10th or above the 90th percentile.

    Parameters
    ----------
    X : DataFrame-like indexed by column name; each ``X[col]`` must support
        comparisons and boolean-mask selection with an ``.index``.
    features : iterable of column names to inspect.

    Returns
    -------
    None

    NOTE(review): the sorted, de-duplicated list of flagged index labels is
    built but never returned, and ``X`` is not modified. The return value is
    deliberately left as ``None``: the transformation cell places the result of
    this call directly into the numeric Pipeline as a step, where a list would
    not be a valid step. Wire the detection in properly before returning it.
    """
    flagged = set()

    for col in features:
        values = X[col]

        # nanpercentile (not percentile) because the column may contain NaN
        low = np.nanpercentile(values, 10.)
        high = np.nanpercentile(values, 90.)

        fence = (high - low) * 1.5
        is_outlier = (values < low - fence) | (values > high + fence)

        # A set removes labels flagged by more than one column
        flagged.update(values[is_outlier].index.tolist())

    out_index_list = sorted(flagged)  # computed for future use; see NOTE above
    return None

In [11]:
# Specify the transformations per data type
num_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                            # Explicit no-op placeholder. This step used to be the result of
                            # IQR_Outliers(X_train, numeric_features); that call returns None,
                            # which a Pipeline skips just like 'passthrough', so no outlier
                            # handling was (or is) applied here.
                            ('iqr_outlier', 'passthrough'),
                            ('power_trans', PowerTransformer(method='yeo-johnson', copy=False)),
                            ('std_scaler', StandardScaler()),
                           ])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fit are encoded as all zeros
cat_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                            ('onehot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
                           ])

# Ordinal columns: fill gaps with the most frequent value, then encode as ordered integers
ord_trans = Pipeline(steps=[('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                            ('ordinal_encoder', OrdinalEncoder()),
                           ])

# Route each group of columns to its pipeline; columns in no group pass through unchanged
Column_Tranform = ColumnTransformer(transformers=[('numeric_transform', num_trans, numeric_features),
                                                  ('categorical_transform', cat_trans, categorical_features),
                                                  ('ordinal_transfrom', ord_trans, ordinal_features),
                                                  ], remainder='passthrough')

In [12]:
# Random forest whose importances drive the recursive feature elimination
feature_model = RandomForestRegressor(n_estimators=500, criterion='squared_error',
                                      n_jobs=-1, random_state=67)

# RFE step shared by every pipeline tuned with HalvingRandomSearchCV
Feature_Selector = RFE(feature_model)

In [13]:
# Number of distinct values in the training target
n_classes = y_train.nunique()

# HalvingRandomSearchCV elimination factor
halving_parameter = 2.0

# HalvingRandomSearchCV minimum/maximum resources:
# the minimum is the maximum divided by resource_divisor, rounded to an integer
max_resource = 3000
resource_divisor = 2.0
min_resource = int(round(max_resource / resource_divisor))

In [14]:
def random_search():
    """Tune three tree-ensemble regressors with HalvingRandomSearchCV and report the results.

    For RandomForest, ExtraTrees and GradientBoosting regressors in turn, a pipeline of
    ``Column_Tranform`` -> ``Feature_Selector`` -> regressor is tuned on ``X_train``/``y_train``
    with ``reg__n_estimators`` as the halving resource (``min_resource`` to ``max_resource``,
    factor ``halving_parameter``). Each refitted best pipeline is scored on
    ``X_test``/``y_test``, and the models are printed in order of ascending test MSE together
    with the importances of the features kept by the RFE step.

    Side effects: prints progress and a summary to stdout, and writes the hyperparameters of
    the best-ranked model to 'nfl_draft_prediction_results.pkl'.

    Returns:
        None
    """
    # RandomForestRegressor
    parameters1 = {
    'feat__n_features_to_select': loguniform(0.20, 1.0),
    'feat__step': randint(2, 10),
    'reg__criterion': ['squared_error'],
    'reg__max_features': ['sqrt', 'log2', None],
    'reg__max_depth': [None],
    'reg__min_samples_split': randint(2, 100),
    'reg__min_samples_leaf': randint(2, 100),
    'reg__min_impurity_decrease': loguniform(1e-09, 1e-01),
    'reg__min_weight_fraction_leaf':  loguniform(1e-08, 1e-01),
    'reg__ccp_alpha':  loguniform(1e-09, 1e-01),
    'reg__bootstrap': [True, False],
    'reg__oob_score': [False],
    'reg__warm_start': [True, False],
    'reg__n_jobs': [6],
    'reg__random_state': [67],
    }

    # ExtraTreesRegressor
    parameters2 = {
    'feat__n_features_to_select': loguniform(0.20, 1.0),
    'feat__step': randint(2, 10),
    'reg__criterion': ['squared_error'],
    'reg__max_depth': [None],
    'reg__max_features': ['sqrt', 'log2', None],
    'reg__max_leaf_nodes': [None],
    'reg__max_samples': [None],
    'reg__min_samples_split': randint(2, 100),
    'reg__min_samples_leaf': randint(2, 100),
    'reg__min_weight_fraction_leaf': loguniform(1e-09, 1e-02),
    'reg__min_impurity_decrease': loguniform(1e-09, 1e-02),
    'reg__ccp_alpha': loguniform(1e-09, 1e-02),
    'reg__bootstrap': [True, False],
    'reg__oob_score': [False],
    'reg__warm_start': [True, False],
    'reg__n_jobs': [6],
    'reg__random_state': [67],
    }

    # GradientBoostingRegressor
    parameters3 = {
    'feat__n_features_to_select': loguniform(0.20, 1.00),
    'feat__step': randint(2, 10),
    'reg__loss': ['squared_error', 'absolute_error', 'huber'],
    'reg__max_features': ['sqrt', 'log2', None],
    'reg__learning_rate': loguniform(1e-09, 1e-01),
    'reg__subsample': loguniform(0.01, 1.0),
    'reg__criterion': ['friedman_mse'],
    'reg__alpha': loguniform(0.01, 1.0),
    'reg__ccp_alpha': loguniform(1e-09, 1e-02),
    'reg__max_depth': randint(5, 30),
    # max_leaf_nodes must be at least 2; a lower bound of 1 could sample an invalid
    # value and abort the whole search because error_score='raise'
    'reg__max_leaf_nodes': randint(2, 120),
    'reg__min_samples_split': randint(10, 120),
    'reg__min_impurity_decrease': loguniform(1e-09, 1e-02),
    'reg__min_samples_leaf': randint(2, 100),
    'reg__n_iter_no_change': [100, 125, 150, 175, 200, None],
    'reg__tol': loguniform(1e-09, 1e-02),
    'reg__validation_fraction': loguniform(0.05, 0.30),
    'reg__warm_start': [True, False],
    'reg__random_state': [67],
    }

    # (report heading, final regressor, search space) for each model family
    candidates = [
        ('RandomForest Regressor: ', RandomForestRegressor(), parameters1),
        ('ExtraTrees Regressor: ', ExtraTreesRegressor(), parameters2),
        ('GradientBoosting Regressor: ', GradientBoostingRegressor(), parameters3),
    ]

    results = []

    for heading, regressor, param_distributions in candidates:

        # All model families share the same preprocessing and feature-selection steps
        pipeline = Pipeline([
            ('col', Column_Tranform),
            ('feat', Feature_Selector),
            ('reg', regressor),
        ])

        search = HalvingRandomSearchCV(pipeline,
                                       param_distributions,
                                       factor=halving_parameter,
                                       resource='reg__n_estimators',
                                       n_candidates='exhaust',
                                       min_resources=min_resource,
                                       max_resources=max_resource,
                                       scoring='neg_mean_squared_error',
                                       aggressive_elimination=False,
                                       return_train_score=True,
                                       refit=True,
                                       cv=5,
                                       n_jobs=6,
                                       verbose=1,
                                       random_state=67,
                                       error_score='raise',
                                      )

        start = time()

        # Fit models on training data
        search.fit(X_train, y_train)

        # Apply the refitted best model to test data to determine model performance
        y_pred = search.predict(X_test)

        print("HalvingRandomSearchCV required %.2f minutes to complete search" % ((time() - start)/ 60))
        print(" ")
        print(" ")

        best_pipeline = search.best_estimator_
        selector = best_pipeline.named_steps['feat']

        # Keep everything the report needs per model, so the summary below never
        # has to reach back into a search object from a different iteration
        results.append({
        'heading': heading,
        # best_score_ is the mean CV score under 'neg_mean_squared_error'; negate to get an MSE
        'cv score': -search.best_score_,
        'best params': search.best_params_,
        'feature importances': best_pipeline.named_steps['reg'].feature_importances_,
        'selected feature count': selector.n_features_,
        'selected features': selector.get_feature_names_out(),
        'test score': mean_squared_error(y_test, y_pred),
        'test score alt': r2_score(y_test, y_pred),
        })

    # sorting results by best (lowest) test score
    # NOTE(review): ranking on the held-out test set makes the winner's test MSE optimistic
    results.sort(key=operator.itemgetter('test score'))

    print('Best Models:')
    print(' ')
    for entry in results:
        print(entry['heading'])
        print('Parameters:  ' + str(entry['best params']))
        print(' ')
        print('Candidate features:', initial_features)
        print('')
        print(str(entry['selected feature count']) + ' features selected during evaluation')
        print('Features:  ' + str(entry['selected features']))
        print(' ')
        print('CV mean squared error:    ' + str(entry['cv score']))
        print('Test mean squared error:  ' + str(entry['test score']))
        print('Test r2 error:            ' + str(entry['test score alt']))

        # Importance of each feature kept by RFE for THIS model. The importances line up
        # one-to-one with the selector's output names (the regressor is fitted on exactly
        # those columns), not with the raw column names in initial_features.
        importances = entry['feature importances']
        print('\n',"Cumulative Importance =", importances.sum())

        importance_table = pd.DataFrame({'feature': entry['selected features'],
                                         'importance': importances,
                                        }).sort_values(by='importance', ascending=False)
        importance_table['cum_sum'] = importance_table['importance'].cumsum()
        print(importance_table.head(20))
        print(' ')
        print(' ')
        print(' ')

    # Save the hyperparameters of the best-ranked model (lowest test MSE) as a pickle file
    joblib.dump(results[0]['best params'], 'nfl_draft_prediction_results.pkl', compress = 1)

In [15]:
# Record the wall-clock start (seconds since the epoch) of the hyperparameter tuning stage;
# the final cell subtracts this from its own timestamp to report the elapsed time
start = time_calc.time()

In [16]:
%%capture --no-stdout --no-display
# Run the full search; stdout and rich display are left uncaptured so progress stays visible
random_search()

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 1500
max_resources_: 3000
aggressive_elimination: False
factor: 2.0
----------
iter: 0
n_candidates: 2
n_resources: 1500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
----------
iter: 1
n_candidates: 1
n_resources: 3000
Fitting 5 folds for each of 1 candidates, totalling 5 fits
HalvingRandomSearchCV required 2.16 minutes to complete search
 
 
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 1500
max_resources_: 3000
aggressive_elimination: False
factor: 2.0
----------
iter: 0
n_candidates: 2
n_resources: 1500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
----------
iter: 1
n_candidates: 1
n_resources: 3000
Fitting 5 folds for each of 1 candidates, totalling 5 fits
HalvingRandomSearchCV required 1.96 minutes to complete search
 
 
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 1500
max_resources_: 3000
aggressi

In [17]:
# Stop the clock and report how long the tuning stage took, in hours
end = time_calc.time()
elapsed_hours = (end - start) / 3600
print(round(elapsed_hours, 2), 'hours to complete hyperparameter tuning process')

0.09 hours to complete hyperparameter tuning process
