# Grid Search (AUC)
##### Authors: Czarina Luna, Weston Shuken, Justin Sohn

In [1]:
import pandas as pd
import numpy as np

# Name of the column the models predict.
target_variable = 'h1n1_vaccine'

# Read the label and feature tables, join them on the respondent id, then
# drop the id itself and the seasonal-vaccine label so only the H1N1 target
# and the feature columns remain.
labels = pd.read_csv('data/training_set_labels.csv')
features = pd.read_csv('data/training_set_features.csv')
data = (
    labels
    .merge(features, on='respondent_id')
    .drop(columns=['respondent_id', 'seasonal_vaccine'])
)

In [2]:
import time

from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
def grab_numeric(df):
    """Return only the ``float64`` and ``int64`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose column dtypes are ``float64`` or ``int64``.
    """
    wanted_dtypes = ['float64', 'int64']
    numeric_only = df.select_dtypes(include=wanted_dtypes)
    return numeric_only

# Wrap grab_numeric as a transformer so it can be used as a Pipeline step.
GrabNumeric = FunctionTransformer(func=grab_numeric)

In [4]:
# Numeric branch: keep float64/int64 columns, fill gaps with the column median
# (adding missing-value indicator columns), then standardise.
numeric_transformer = Pipeline(steps=[
    ('grab_num', GrabNumeric),
    ('num_impute', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler()),
])


def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    later releases removed the old name, so the new spelling is tried first
    and the old one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn: only the ``sparse`` keyword is accepted.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', _dense_one_hot_encoder()),
])

In [5]:
# Positional column indices handed to the ColumnTransformer: the first list is
# routed to the numeric pipeline, the second to the categorical pipeline.
num_features = [*range(21), 31, 32]
cat_features = [*range(21, 31), 33, 34]

# Route each group of columns through its own pipeline; any column not listed
# in either group is passed through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ],
)

In [6]:
# Untuned classifiers, keyed by the model name used throughout the notebook.
_baseline_classifiers = {
    'LogisticRegression': LogisticRegression(random_state=20220124, max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(random_state=20211122),
    'RandomForest': RandomForestClassifier(random_state=10151997),
    'ExtraTrees': ExtraTreesClassifier(random_state=20220125),
    'GradientBoost': GradientBoostingClassifier(random_state=20220126),
}

# Every model gets the same record layout: its classifier, the shared
# preprocessor, and three result slots that start out empty.
baseline = {
    model_name: {
        'classifier': classifier,
        'preprocessor': preprocessor,
        'output': None,
        'fit_classifier': None,
        'time_ran': None,
    }
    for model_name, classifier in _baseline_classifiers.items()
}

In [7]:
def data_split(df=data, target=target_variable):
    """Split ``df`` into train and test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target : str
        Name of the target column in ``df``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with 25% of the rows held out,
        using a fixed random seed.
    """
    predictors, outcome = df.drop(columns=target), df[target]
    return tuple(
        train_test_split(predictors, outcome, test_size=0.25, random_state=20211122)
    )

In [8]:
# Best parameters found by each search, keyed by the name given to run_gridsearch.
tuned_models = {}


def run_gridsearch(params, name, models=baseline, df=data, target=target_variable, score='precision'):
    """Run a cross-validated grid search for each model listed in ``params``.

    Parameters
    ----------
    params : dict
        Maps a model key of ``models`` to the ``param_grid`` to search.
    name : str
        Key under which the best parameters are saved in ``tuned_models``.
    models : dict
        Model records providing a ``'preprocessor'`` and a ``'classifier'``.
    df : pandas.DataFrame
        Data passed to ``data_split``.
    target : str
        Target column name passed to ``data_split``.
    score : str
        Scoring name forwarded to ``GridSearchCV``.

    Prints start/finish times, the mean test score of every candidate, the
    best score and the best parameters. Returns nothing.
    """
    # Only the training portion is searched; the held-out part is unused here.
    X_train, _, y_train, _ = data_split(df, target)

    for model_key in params:
        print(model_key, 'Grid Search:')
        print(f'Time Started: {time.asctime()}')

        record = models[model_key]
        pipeline = Pipeline(steps=[
            ('col_transformer', record['preprocessor']),
            ('classifier', record['classifier']),
        ])
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=params[model_key],
            scoring=score,
            cv=5,
        )
        search.fit(X_train, y_train)

        print(f'Time Finished: {time.asctime()}')
        mean_scores = search.cv_results_["mean_test_score"]
        print(f'Cross validation scores: {mean_scores}')
        print(f'Best cross validation score: {search.best_score_:.2%}')
        print(f'Optimal parameters: {search.best_params_}')

        tuned_models[name] = search.best_params_

#### Logistic Regression Grid Search


In [9]:
# First logistic-regression pass: coarse sweep over C only.
params_lr1 = {
    'LogisticRegression': [
        {'classifier__C': [0.001, 0.1, 1]},
    ],
}
run_gridsearch(params=params_lr1, name='LogisticRegression1_auc', score='roc_auc')

LogisticRegression Grid Search:
Time Started: Wed Jan 26 09:19:01 2022
Time Finished: Wed Jan 26 09:19:16 2022
Cross validation scores: [0.85363743 0.86061299 0.8602602 ]
Best cross validation score: 86.06%
Optimal parameters: {'classifier__C': 0.1}


In [10]:
# Second logistic-regression pass: same C values, now crossed with the solver
# and with/without an intercept.
params_lr2 = {
    'LogisticRegression': [
        {
            'classifier__C': [0.001, 0.1, 1],
            'classifier__solver': ['lbfgs', 'saga'],
            'classifier__fit_intercept': [True, False],
        },
    ],
}
run_gridsearch(params=params_lr2, name='LogisticRegression2_auc', score='roc_auc')

LogisticRegression Grid Search:
Time Started: Wed Jan 26 09:19:16 2022
Time Finished: Wed Jan 26 09:22:57 2022
Cross validation scores: [0.85363743 0.85363713 0.85203001 0.85203136 0.86061299 0.8606205
 0.86063918 0.86064038 0.8602602  0.86027003 0.86026155 0.86026657]
Best cross validation score: 86.06%
Optimal parameters: {'classifier__C': 0.1, 'classifier__fit_intercept': False, 'classifier__solver': 'saga'}


In [11]:
# Third logistic-regression pass: narrow the C range around 0.1 with the
# solver and intercept fixed, and raise the iteration cap.
params_lr3 = {
    'LogisticRegression': [
        {
            'classifier__C': [0.05, 0.1, 0.15],
            'classifier__solver': ['saga'],
            'classifier__fit_intercept': [False],
            'classifier__max_iter': [10000],
        },
    ],
}
run_gridsearch(params=params_lr3, name='LogisticRegression3_auc', score='roc_auc')

LogisticRegression Grid Search:
Time Started: Wed Jan 26 09:22:57 2022
Time Finished: Wed Jan 26 09:27:10 2022
Cross validation scores: [0.86058046 0.86058166 0.86058759 0.86058249 0.86061299 0.8606205
 0.86063918 0.86064038 0.86058267 0.86058462 0.86060181 0.86060331]
Best cross validation score: 86.06%
Optimal parameters: {'classifier__C': 0.1, 'classifier__fit_intercept': False, 'classifier__max_iter': 10000, 'classifier__solver': 'saga'}


In [12]:
# Take the parameters recorded by the second logistic-regression search and
# persist them with IPython's %store magic.
lr_best_params_auc = tuned_models['LogisticRegression2_auc']
%store lr_best_params_auc

Stored 'lr_best_params_auc' (dict)


#### Decision Tree Grid Search


In [13]:
# First decision-tree pass: split criterion crossed with a wide range of depths.
params_dt1 = {
    'DecisionTree': [
        {
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [2, 6, 10, 15],
        },
    ],
}
run_gridsearch(params=params_dt1, name='DecisionTree1_auc', score='roc_auc')

DecisionTree Grid Search:
Time Started: Wed Jan 26 09:27:56 2022
Time Finished: Wed Jan 26 09:28:22 2022
Cross validation scores: [0.76406813 0.83715147 0.77845353 0.6821526  0.76406813 0.83809465
 0.78603583 0.71568881]
Best cross validation score: 83.81%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 6}


In [14]:
# Second decision-tree pass: entropy only, depths around 6, plus the minimum
# split size. It is recorded under the same 'DecisionTree1_auc' key as the
# first decision-tree search, so it replaces that entry in tuned_models.
params_dt2 = {
    'DecisionTree': [
        {
            'classifier__criterion': ['entropy'],
            'classifier__max_depth': [4, 6, 8],
            'classifier__min_samples_split': [5, 10],
        },
    ],
}
run_gridsearch(params=params_dt2, name='DecisionTree1_auc', score='roc_auc')

DecisionTree Grid Search:
Time Started: Wed Jan 26 09:30:55 2022
Time Finished: Wed Jan 26 09:31:14 2022
Cross validation scores: [0.82483555 0.82483555 0.83809465 0.83809465 0.8186019  0.82218691]
Best cross validation score: 83.81%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 6, 'classifier__min_samples_split': 5}


In [None]:
# NOTE(review): this cell repeats the refined decision-tree search with an
# identical grid and the same 'DecisionTree1_auc' key - confirm whether the
# duplicate is still needed.
params_dt2 = {
    'DecisionTree': [
        {
            'classifier__criterion': ['entropy'],
            'classifier__max_depth': [4, 6, 8],
            'classifier__min_samples_split': [5, 10],
        },
    ],
}
run_gridsearch(params=params_dt2, name='DecisionTree1_auc', score='roc_auc')

In [None]:
# Every decision-tree search in this notebook records its result under the
# 'DecisionTree1_auc' key, so this picks up whichever of them ran last.
# The value is then persisted with IPython's %store magic.
dt_best_params_auc = tuned_models['DecisionTree1_auc']
%store dt_best_params_auc

#### Random Forest Grid Search


In [None]:
# First random-forest pass: broad sweep over forest size, criterion, depth
# and the split/leaf size limits.
params_rf1 = {
    'RandomForest': [
        {
            'classifier__n_estimators': [10, 30, 100],
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 2, 6, 10],
            'classifier__min_samples_split': [5, 10],
            'classifier__min_samples_leaf': [3, 6],
        },
    ],
}
run_gridsearch(params=params_rf1, name='RandomForest1_auc', score='roc_auc')

In [None]:
# Second random-forest pass: bounded depths, smaller leaves, larger forests.
params_rf2 = {
    'RandomForest': [
        {
            'classifier__max_depth': [5, 6, 8],
            'classifier__min_samples_leaf': [1, 2, 3],
            'classifier__min_samples_split': [10, 15],
            'classifier__n_estimators': [100, 150],
        },
    ],
}
run_gridsearch(params=params_rf2, name='RandomForest2_auc', score='roc_auc')

In [None]:
# Third random-forest pass: shallower trees, leaf size fixed at 1, split size
# around 10 and more estimators.
params_rf3 = {
    'RandomForest': [
        {
            'classifier__max_depth': [3, 4, 5],
            'classifier__min_samples_leaf': [1],
            'classifier__min_samples_split': [8, 10, 12],
            'classifier__n_estimators': [150, 200, 300],
        },
    ],
}
run_gridsearch(params=params_rf3, name='RandomForest3_auc', score='roc_auc')

In [None]:
# Fourth random-forest pass: everything fixed except the number of estimators.
params_rf4 = {
    'RandomForest': [
        {
            'classifier__max_depth': [4],
            'classifier__min_samples_leaf': [1],
            'classifier__min_samples_split': [10],
            'classifier__n_estimators': [200, 225, 250],
        },
    ],
}
run_gridsearch(params=params_rf4, name='RandomForest4_auc', score='roc_auc')

In [None]:
# Take the parameters recorded by the fourth random-forest search and persist
# them with IPython's %store magic.
rf_best_params_auc = tuned_models['RandomForest4_auc']
%store rf_best_params_auc

#### Extra Trees Grid Search


In [None]:
# First extra-trees pass: broad sweep over forest size, criterion, depth and
# the split/leaf size limits.
params_et1 = {
    'ExtraTrees': [
        {
            'classifier__n_estimators': [10, 30, 100],
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 2, 6, 10],
            'classifier__min_samples_split': [5, 10],
            'classifier__min_samples_leaf': [3, 6],
        },
    ],
}
run_gridsearch(params=params_et1, name='ExtraTrees1_auc', score='roc_auc')

In [None]:
# Second extra-trees pass: larger forests, bounded depths and finer
# split/leaf size limits.
# Keys use the 'classifier__' prefix because the estimator is the pipeline
# step named 'classifier' in run_gridsearch; any other prefix is rejected as
# an invalid parameter. This grid (not the first-pass one) is the one that is
# searched and recorded under 'ExtraTrees2_auc'.
params_extra2 = {
    'ExtraTrees': [
        {
            'classifier__n_estimators': [100, 200, 250],
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [3, 5, 6],
            'classifier__min_samples_split': [8, 10, 15],
            'classifier__min_samples_leaf': [1, 2, 3],
        },
    ],
}
run_gridsearch(params_extra2, name='ExtraTrees2_auc', score='roc_auc')

In [None]:
# Take the parameters recorded under the 'ExtraTrees2_auc' key and persist
# them with IPython's %store magic.
et_best_params_auc = tuned_models['ExtraTrees2_auc']
%store et_best_params_auc

#### Gradient Boost Grid Search


In [None]:
# First gradient-boosting pass: broad sweep over loss, ensemble size,
# criterion, depth and the split/leaf size limits.
# NOTE(review): newer scikit-learn releases are believed to have renamed the
# 'deviance' loss to 'log_loss' - confirm against the installed version.
params_gb1 = {
    'GradientBoost': [
        {
            'classifier__loss': ['deviance', 'exponential'],
            'classifier__n_estimators': [10, 30, 100],
            'classifier__criterion': ['friedman_mse', 'squared_error'],
            'classifier__max_depth': [2, 6, 10],
            'classifier__min_samples_split': [5, 10],
            'classifier__min_samples_leaf': [3, 6],
        },
    ],
}
run_gridsearch(params=params_gb1, name='GradientBoost1_auc', score='roc_auc')

In [None]:
# Second gradient-boosting pass: exponential loss and friedman_mse fixed,
# larger ensembles, and finer depth/split/leaf limits.
# Keys use the 'classifier__' prefix because the estimator is the pipeline
# step named 'classifier' in run_gridsearch; any other prefix is rejected as
# an invalid parameter.
params_gb2 = {
    'GradientBoost': [
        {
            'classifier__loss': ['exponential'],
            'classifier__n_estimators': [100, 200, 300],
            'classifier__criterion': ['friedman_mse'],
            'classifier__max_depth': [5, 6, 8],
            'classifier__min_samples_split': [8, 10, 13],
            'classifier__min_samples_leaf': [1, 3, 4],
        },
    ],
}
run_gridsearch(params_gb2, name='GradientBoost2_auc', score='roc_auc')

In [None]:
# Take the parameters recorded by the first gradient-boosting search and
# persist them with IPython's %store magic.
# NOTE(review): a second gradient-boosting search is recorded under
# 'GradientBoost2_auc' - confirm which result is meant to be stored.
gb_best_params_auc = tuned_models['GradientBoost1_auc']
%store gb_best_params_auc

# Contact <a class="anchor" id="Contact"></a>

For any inquiries, please contact the contributors of this analysis:   
>[Czarina Luna](https://www.linkedin.com/in/czarinagluna)  
[Justin Sohn](https://www.linkedin.com/in/justin-sohn-689901193/)  
[Weston Shuken](https://www.linkedin.com/in/westonshuken/)