In [1]:
#Import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

import numpy as np

import sklearn as sks
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVR   
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [2]:
#Input file path for test and train data
train_data_file = "Resources/19332020Main.csv"
test_data_file = "Resources/2021Players.csv"

#Load the test and train data into dataframes
train_df = pd.read_csv(train_data_file, encoding='ISO-8859-1')
test_df = pd.read_csv(test_data_file, encoding='ISO-8859-1')


In [3]:
test_df.head()

Unnamed: 0,Rk,Player,AB,Year,Age,Tm,Lg,G,PA,R,...,SF,GDP,SB,CS,BA,OBP,SLG,OPS,Pos,All Star
0,1,Whit Merrifield,664,2021,32,KCR,AL,162,720,97,...,12,12,40,4,0.277,0.317,0.395,0.711,*49/7D,1
1,2,Marcus Semien,652,2021,30,TOR,AL,162,724,115,...,3,9,15,1,0.265,0.334,0.538,0.873,*46/D,1
2,3,Bo Bichette,640,2021,23,TOR,AL,159,690,121,...,4,10,25,1,0.298,0.344,0.484,0.828,*6D,1
3,4,Ozzie Albies,629,2021,24,ATL,NL,156,686,103,...,7,4,20,4,0.259,0.311,0.488,0.799,*4,1
4,5,Jonathan Schoop,623,2021,29,DET,AL,156,674,85,...,8,15,2,0,0.278,0.321,0.435,0.756,*34D/5,1


In [4]:
train_df.head()

Unnamed: 0,Rk,Player,AB,Year,Age,Tm,Lg,G,PA,R,...,SF,GDP,SB,CS,BA,OBP,SLG,OPS,Pos,All Star
0,1,Juan Soto,154,2020,21,WSN,NL,47,196,39,...,0.0,1.0,6,2,0.351,0.49,0.695,1.185,*7/9D,1
1,2,Nelson Cruz,185,2020,39,MIN,AL,53,214,33,...,0.0,8.0,0,0,0.303,0.397,0.595,0.992,*D/H,1
2,3,DJ LeMahieu,195,2020,31,NYY,AL,50,216,41,...,1.0,3.0,3,0,0.364,0.421,0.59,1.011,*435/HD,1
3,4,Mike Trout,199,2020,28,LAA,AL,53,241,41,...,4.0,1.0,1,1,0.281,0.39,0.603,0.993,*8/D,1
4,5,Freddie Freeman,214,2020,30,ATL,NL,60,262,51,...,0.0,6.0,2,0,0.341,0.462,0.64,1.102,*3/HD,1


In [5]:
#Check to see if there are any empty or null values in the test data
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 533 entries, 0 to 532
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Rk        533 non-null    int64  
 1   Player    533 non-null    object 
 2   AB        533 non-null    int64  
 3   Year      533 non-null    int64  
 4   Age       533 non-null    int64  
 5   Tm        533 non-null    object 
 6   Lg        533 non-null    object 
 7   G         533 non-null    int64  
 8   PA        533 non-null    int64  
 9   R         533 non-null    int64  
 10  H         533 non-null    int64  
 11  2B        533 non-null    int64  
 12  3B        533 non-null    int64  
 13  HR        533 non-null    int64  
 14  RBI       533 non-null    int64  
 15  BB        533 non-null    int64  
 16  IBB       533 non-null    int64  
 17  SO        533 non-null    int64  
 18  HBP       533 non-null    int64  
 19  SH        533 non-null    int64  
 20  SF        533 non-null    int64 

In [6]:
#Check to see if there are any empty or null values in the train data
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3254 entries, 0 to 3253
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Rk        3254 non-null   int64  
 1   Player    3254 non-null   object 
 2   AB        3254 non-null   int64  
 3   Year      3254 non-null   int64  
 4   Age       3254 non-null   int64  
 5   Tm        3254 non-null   object 
 6   Lg        3254 non-null   object 
 7   G         3254 non-null   int64  
 8   PA        3254 non-null   int64  
 9   R         3254 non-null   int64  
 10  H         3254 non-null   int64  
 11  2B        3254 non-null   int64  
 12  3B        3254 non-null   int64  
 13  HR        3254 non-null   int64  
 14  RBI       3254 non-null   int64  
 15  BB        3254 non-null   int64  
 16  IBB       3254 non-null   int64  
 17  SO        3254 non-null   int64  
 18  HBP       3254 non-null   int64  
 19  SH        3254 non-null   int64  
 20  SF        2589 non-null   floa

In [7]:
#Fill all the emapty values with 0
train_df = train_df.fillna(value=0)

In [8]:
#Check to see if the empty or null values are filled with 0s
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3254 entries, 0 to 3253
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Rk        3254 non-null   int64  
 1   Player    3254 non-null   object 
 2   AB        3254 non-null   int64  
 3   Year      3254 non-null   int64  
 4   Age       3254 non-null   int64  
 5   Tm        3254 non-null   object 
 6   Lg        3254 non-null   object 
 7   G         3254 non-null   int64  
 8   PA        3254 non-null   int64  
 9   R         3254 non-null   int64  
 10  H         3254 non-null   int64  
 11  2B        3254 non-null   int64  
 12  3B        3254 non-null   int64  
 13  HR        3254 non-null   int64  
 14  RBI       3254 non-null   int64  
 15  BB        3254 non-null   int64  
 16  IBB       3254 non-null   int64  
 17  SO        3254 non-null   int64  
 18  HBP       3254 non-null   int64  
 19  SH        3254 non-null   int64  
 20  SF        3254 non-null   floa

In [9]:
#Check to see if there any NaN values
train_df.isna()

Unnamed: 0,Rk,Player,AB,Year,Age,Tm,Lg,G,PA,R,...,SF,GDP,SB,CS,BA,OBP,SLG,OPS,Pos,All Star
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3249,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3250,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3251,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3252,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
#drop the insignificant columns from the test dataset
test_df.drop(columns=["Player", "Tm", "Pos", "Lg", "Year"], axis = 1, inplace=True)
test_df.head()

Unnamed: 0,Rk,AB,Age,G,PA,R,H,2B,3B,HR,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,All Star
0,1,664,32,162,720,97,184,42,3,10,...,0,12,12,40,4,0.277,0.317,0.395,0.711,1
1,2,652,30,162,724,115,173,39,2,45,...,0,3,9,15,1,0.265,0.334,0.538,0.873,1
2,3,640,23,159,690,121,191,30,1,29,...,0,4,10,25,1,0.298,0.344,0.484,0.828,1
3,4,629,24,156,686,103,163,40,7,30,...,0,7,4,20,4,0.259,0.311,0.488,0.799,1
4,5,623,29,156,674,85,173,30,1,22,...,0,8,15,2,0,0.278,0.321,0.435,0.756,1


In [11]:
#drop the insignificant columns from the test dataset
train_df.drop(columns=["Player", "Tm", "Pos", "Lg", "Year"], axis = 1, inplace=True)

train_df.head()
#train_df["GDP"].hist()

Unnamed: 0,Rk,AB,Age,G,PA,R,H,2B,3B,HR,...,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,All Star
0,1,154,21,47,196,39,54,14,0,13,...,0,0.0,1.0,6,2,0.351,0.49,0.695,1.185,1
1,2,185,39,53,214,33,56,6,0,16,...,0,0.0,8.0,0,0,0.303,0.397,0.595,0.992,1
2,3,195,31,50,216,41,71,10,2,10,...,0,1.0,3.0,3,0,0.364,0.421,0.59,1.011,1
3,4,199,28,53,241,41,56,9,2,17,...,0,4.0,1.0,1,1,0.281,0.39,0.603,0.993,1
4,5,214,30,60,262,51,73,23,1,13,...,0,0.0,6.0,2,0,0.341,0.462,0.64,1.102,1


In [12]:
#Rename the All Star column to Classification in both test and train dataset
train_df.rename(columns={"All Star":"Classification"}, inplace=True)
test_df.rename(columns={"All Star":"Classification"}, inplace=True)
print(train_df.columns)
print(test_df.columns)

Index(['Rk', 'AB', 'Age', 'G', 'PA', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
       'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG',
       'OPS', 'Classification'],
      dtype='object')
Index(['Rk', 'AB', 'Age', 'G', 'PA', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
       'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG',
       'OPS', 'Classification'],
      dtype='object')


In [13]:
#Function to execute the model that is passed in as a parameter along with the test and train data
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    #print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    #print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')

    print(f'Train score: {roc_auc_score(y_train, reg.predict_proba(X_train_scaled)[:,1])}')
    print(f'Test Score: {roc_auc_score(y_test, reg.predict_proba(X_test_scaled)[:,1])}\n')                   
    print(f'Classification Score: {classification_report(y_test, reg.predict(X_test_scaled))}\n')
    return reg

In [14]:
#Create the classifier and drop the column from the train dataset
X_train = train_df.drop(columns = ['Classification'])
y_train = train_df['Classification']

#Create the classifier and drop the column from the test dataset
X_test = test_df.drop(columns = ['Classification'])
y_test = test_df['Classification']

#Print the columns to check if the columns are dropped as expected
print(X_train.columns)
print(X_test.columns)

Index(['Rk', 'AB', 'Age', 'G', 'PA', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
       'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG',
       'OPS'],
      dtype='object')
Index(['Rk', 'AB', 'Age', 'G', 'PA', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
       'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG',
       'OPS'],
      dtype='object')


In [15]:
#Check to see the distribution of the classification values
y_train.value_counts()

0    1800
1    1454
Name: Classification, dtype: int64

In [16]:
#Scale the train and test data
train_scaler = StandardScaler()
train_scaler = train_scaler.fit(X_train)

X_train_scaled = train_scaler.transform(X_train)
X_test_scaled = train_scaler.transform(X_test)
scaled_data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [28]:

#Execute the models with the scaled data
rfc = test_model(RandomForestClassifier(), scaled_data)
lr = test_model(LogisticRegression(), scaled_data)
abc = test_model(AdaBoostClassifier(), scaled_data)

Model: RandomForestClassifier
Train score: 1.0
Test Score: 0.7586858137510879

Classification Score:               precision    recall  f1-score   support

           0       0.79      0.92      0.85       383
           1       0.65      0.37      0.47       150

    accuracy                           0.77       533
   macro avg       0.72      0.65      0.66       533
weighted avg       0.75      0.77      0.74       533


Model: LogisticRegression
Train score: 0.8039064649243466
Test Score: 0.6320104438642298

Classification Score:               precision    recall  f1-score   support

           0       0.78      0.75      0.76       383
           1       0.41      0.45      0.43       150

    accuracy                           0.66       533
   macro avg       0.59      0.60      0.60       533
weighted avg       0.67      0.66      0.67       533




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: AdaBoostClassifier
Train score: 0.97281484028733
Test Score: 0.840574412532637

Classification Score:               precision    recall  f1-score   support

           0       0.86      0.85      0.86       383
           1       0.64      0.65      0.64       150

    accuracy                           0.80       533
   macro avg       0.75      0.75      0.75       533
weighted avg       0.80      0.80      0.80       533




In [18]:
#Check the keys available in metrics to be used in RandomizedSearchCV and GridSearchCV
sks.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [19]:
#Define the parameters for the RandomizedSearchCV
params_grid = {'criterion' : ['gini','entropy'],
               'max_depth': [3,6,8,15],
               'max_leaf_nodes':[2,10,20,40,100],
               'max_features':['auto','log2'],
               'max_samples':[0.1,0.4,0.8,1],
               'min_samples_leaf': [5,50,100,200,400,1000],
               'min_samples_split': [5,10,100,1000,2000,3000],
               'n_estimators': [10,50,100,500]}

#Initialize the RandomizedSearchCV model with the params and scoring key as 'roc_auc'
rf_grid_new = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1,random_state=42),
                                 params_grid, n_iter= 400,  cv = 3, verbose=2, n_jobs = -1,
                                 scoring='roc_auc', return_train_score=True)

#Fit the model
search = rf_grid_new.fit(X_train_scaled, y_train)

#Print the best params 
search.best_params_

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.4min finished


{'n_estimators': 50,
 'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_samples': 0.8,
 'max_leaf_nodes': 100,
 'max_features': 'log2',
 'max_depth': 15,
 'criterion': 'gini'}

In [20]:
#Determine the best_estimator using in the predict_proba 
probability_scores = rf_grid_new.best_estimator_.predict_proba(X_test_scaled)[:,1]

In [21]:
#Determine and print the roc_auc_score for RandomizedSearchCV
roc_auc_score(y_test, probability_scores)

0.6715230635335074

In [22]:

#Initialize the logistic regression model
lr = LogisticRegression(n_jobs=-1, random_state=42)

#Define the parameters
grid_values = {'penalty': ['l1','none', 'l2'], 'C': [0.001,0.01,0.1,1,10,100], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

#Initialize the GridSearchCV model with the params and scoring key as 'roc_auc'
model_lr = GridSearchCV(lr, param_grid=grid_values, 
                        cv = 2, verbose=3, n_jobs = -1, 
                        scoring='roc_auc',
                        return_train_score=True)
#Fit the model
search = model_lr.fit(X_train_scaled, y_train)
#Print the best_params
search.best_params_

Fitting 2 folds for each of 90 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 157 out of 180 | elapsed:    0.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    6.3s finished


{'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}

In [23]:
#Determine the best_estimator using in the predict_proba 
probability_scores_lr = model_lr.best_estimator_.predict_proba(X_test_scaled)[:,1]
#Determine and print the roc_auc_score for GridSearchCV
roc_auc_score(y_test, probability_scores_lr)

0.6661792863359441

In [24]:
#scaled_df = pd.DataFrame(X_test_scaled)


#probability_scores_ind = model_lr.best_estimator_.predict_proba(scaled_df.iloc[:1,:])[:,1]
#probability_scores_ind

array([0.61254089])

In [None]:
#import pickle
#with open('lr.pkl', 'wb') as model:
#    pickle.dump(lr, model)
#with open('lr.pkl','rb') as model:
#    lr = pickle.load(model)
#https://github.com/ronaldpacheco/Model_Deployment

In [29]:
#Save the model, based on the test and train it was determined that the AdaBoostClassifier model had the best score

import pickle as pkl

with open('abc.pkl', 'wb') as model:
    pkl.dump(abc, model)