# Grid Search 
[Predicting H1N1 Vaccination Status](https://github.com/westonshuken/h1n1-and-flu-vaccine-predictor/blob/main/index.ipynb)

Authors: Czarina Luna, Weston Shuken, Justin Sohn

In [1]:
import pandas as pd
import numpy as np

labels = pd.read_csv('data/training_set_labels.csv')
features = pd.read_csv('data/training_set_features.csv')
data = pd.merge(features, labels, on="respondent_id").drop(columns='respondent_id')

target_variable = 'h1n1_vaccine'
other_target = 'seasonal_vaccine'
data = data.drop(columns=other_target)

In [2]:
import time

from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
def grab_numeric(df):
    return df.select_dtypes(['float64', 'int64'])

GrabNumeric = FunctionTransformer(grab_numeric)

In [4]:
numeric_transformer = Pipeline(
    steps=[('grab_num', GrabNumeric), 
           ('num_impute', SimpleImputer(strategy='median', add_indicator=True)), 
           ('scaler', StandardScaler())])

categorical_transformer = Pipeline(
    steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
           ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [5]:
num_features = list(range(0, 21)) + [31, 32]
cat_features = list(range(21, 31)) + [33, 34]

preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, num_features),
                  ("cat", categorical_transformer, cat_features)], remainder='passthrough')

In [6]:
baseline = {'LogisticRegression':
              {'classifier': LogisticRegression(random_state=20220124, max_iter=1000),
               'preprocessor': preprocessor,
               'output': None,
               'fit_classifier': None,
               'time_ran' : None
              },
          'DecisionTree':
              {'classifier': DecisionTreeClassifier(random_state=20211122),
               'preprocessor': preprocessor,
               'output': None,
               'fit_classifier': None,
               'time_ran': None
              },
          'RandomForest':
              {'classifier': RandomForestClassifier(random_state=10151997),
               'preprocessor': preprocessor,
               'output': None,
               'fit_classifier': None,
               'time_ran': None
              },
          'ExtraTrees':
              {'classifier': ExtraTreesClassifier(random_state=20220125),
                'preprocessor': preprocessor,
                'output': None,
                'fit_classifier': None,
                'time_ran': None
              },
          'GradientBoost':
              {'classifier': GradientBoostingClassifier(random_state=20220126),
               'preprocessor': preprocessor,
               'output': None,
               'fit_classifier': None,
               'time_ran': None
              }
         }

In [7]:
def data_split(df=data, target=target_variable):
    X = df.drop(columns=target)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20211122)
    return X_train, X_test, y_train, y_test

In [8]:
tuned_params = {}

def run_gridsearch(params, name, models=baseline, df=data, target=target_variable):
    X_train, X_test, y_train, y_test = data_split(df, target)
    for model, grid in params.items():
        print(model, 'Grid Search:')
        print(f'Time Started: {time.asctime()}')
        pipe = Pipeline(steps=[('col_transformer', models[model]['preprocessor']), 
                               ('classifier', models[model]['classifier'])])
        gridsearch = GridSearchCV(estimator=pipe, param_grid=grid, scoring='precision', cv=5)
        gridsearch.fit(X_train, y_train)
        print(f'Time Finished: {time.asctime()}')
        print(f'Cross validation scores: {gridsearch.cv_results_["mean_test_score"]}')
        print(f'Best cross validation score: {gridsearch.best_score_ :.2%}')
        print(f'Optimal parameters: {gridsearch.best_params_}')
        tuned_params[name] = gridsearch.best_params_

#### Logistic Regression Grid Search

In [9]:
params_lr1 = {'LogisticRegression': [{
    'classifier__C':[0.001, 0.1, 1],
    'classifier__solver':['lbfgs', 'saga'],
    'classifier__fit_intercept':[True, False]
}]}
run_gridsearch(params_lr1, name='LogisticRegression1')

LogisticRegression Grid Search:
Time Started: Tue Jan 25 20:51:42 2022
Time Finished: Tue Jan 25 20:59:57 2022
Cross validation scores: [0.77332602 0.77332602 0.74772373 0.74772373 0.71925031 0.71916672
 0.71888022 0.71861734 0.71371979 0.71372367 0.71418345 0.71392797]
Best cross validation score: 77.33%
Optimal parameters: {'classifier__C': 0.001, 'classifier__fit_intercept': True, 'classifier__solver': 'lbfgs'}


In [10]:
lr_best_params = tuned_params['LogisticRegression1']
%store lr_best_params

Stored 'lr_best_params' (dict)


#### Decision Tree Grid Search

In [11]:
params_dt1 = {'DecisionTree': [{
    'classifier__criterion':['gini', 'entropy'],
    'classifier__splitter':['best', 'random'],
    'classifier__max_depth':[2, 6, 10, 15],
    'classifier__min_samples_split': [5, 10],
    'classifier__min_samples_leaf': [3, 6],
}]}
run_gridsearch(params_dt1, name='DecisionTree1')

DecisionTree Grid Search:
Time Started: Tue Jan 25 20:59:57 2022
Time Finished: Tue Jan 25 21:08:13 2022
Cross validation scores: [0.69209944 0.67199346 0.69209944 0.67199346 0.69209944 0.67199346
 0.69209944 0.67199346 0.69817178 0.68696053 0.69822572 0.68438469
 0.6990221  0.68339598 0.6990221  0.68339598 0.6440812  0.63539554
 0.65072552 0.65842927 0.65167811 0.65212296 0.65167811 0.65212296
 0.56625373 0.55760735 0.58655187 0.57344712 0.60218401 0.59621774
 0.60218401 0.59621774 0.69209944 0.67199346 0.69209944 0.67199346
 0.69209944 0.67199346 0.69209944 0.67199346 0.6934849  0.69443562
 0.6934849  0.6836569  0.6918169  0.68824494 0.6918169  0.68824494
 0.64120176 0.65311426 0.64325389 0.65469617 0.65419609 0.65335069
 0.65419609 0.65335069 0.56637738 0.56025256 0.58205751 0.57802871
 0.60047301 0.59532038 0.60047301 0.59532038]
Best cross validation score: 69.90%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'classifier__min_samples_leaf': 6, '

In [12]:
dt_best_params = tuned_params['DecisionTree1']
%store dt_best_params

Stored 'dt_best_params' (dict)


#### Random Forest Grid Search

In [13]:
params_rf1 = {'RandomForest': [{
    'classifier__n_estimators':[10, 30, 100],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[None, 2, 6, 10],
    'classifier__min_samples_split':[5, 10],
    'classifier__min_samples_leaf':[3, 6]
}]}
run_gridsearch(params_rf1, name='RandomForest1')

RandomForest Grid Search:
Time Started: Tue Jan 25 21:08:13 2022
Time Finished: Tue Jan 25 21:25:16 2022
Cross validation scores: [0.72834928 0.76945647 0.78667191 0.72927907 0.77673396 0.78931751
 0.74437969 0.79197061 0.7994756  0.74437969 0.79197061 0.7994756
 0.34021739 0.         0.         0.34021739 0.         0.
 0.34021739 0.         0.         0.34021739 0.         0.
 0.82585253 0.8525388  0.85711897 0.82382332 0.8502768  0.85781501
 0.80142828 0.84884995 0.85660424 0.80142828 0.84884995 0.85660424
 0.77910479 0.8073858  0.81451605 0.78164385 0.80951627 0.82229731
 0.78165751 0.80954904 0.81844962 0.78165751 0.80954904 0.81844962
 0.740037   0.77387887 0.78927022 0.75128236 0.78060724 0.79187341
 0.75816643 0.77927285 0.80074886 0.75816643 0.77927285 0.80074886
 0.16521739 0.         0.         0.16521739 0.         0.
 0.16521739 0.         0.         0.16521739 0.         0.
 0.82774048 0.84613908 0.85602812 0.824762   0.8524176  0.85560754
 0.8128994  0.85180262 0.8559094

In [14]:
params_rf2 = {'RandomForest': [{
    'classifier__max_depth': [5, 6, 8], 
    'classifier__min_samples_leaf':[1, 2, 3], 
    'classifier__min_samples_split': [10, 15], 
    'classifier__n_estimators': [100, 150]
}]}
run_gridsearch(params_rf2, name='RandomForest2')

RandomForest Grid Search:
Time Started: Tue Jan 25 21:25:16 2022
Time Finished: Tue Jan 25 21:35:03 2022
Cross validation scores: [0.86348025 0.86916877 0.8672399  0.86581484 0.85828228 0.86554586
 0.86589161 0.86438716 0.86019365 0.8665129  0.86366691 0.86765183
 0.85552997 0.86261221 0.85475675 0.86134594 0.85449185 0.85802168
 0.85270273 0.85877089 0.85781501 0.86193307 0.85666955 0.85954134
 0.82935601 0.83243864 0.82793798 0.83106339 0.82874916 0.83311377
 0.82681115 0.83609432 0.8247606  0.83442019 0.83131895 0.83922549]
Best cross validation score: 86.92%
Optimal parameters: {'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 150}


In [15]:
params_rf3 = {'RandomForest': [{
    'classifier__max_depth': [3, 4, 5], 
    'classifier__min_samples_leaf':[1], 
    'classifier__min_samples_split': [8, 10, 12], 
    'classifier__n_estimators': [150, 200, 300]
}]}
run_gridsearch(params_rf3, name='RandomForest3')

RandomForest Grid Search:
Time Started: Tue Jan 25 21:35:03 2022
Time Finished: Tue Jan 25 21:43:46 2022
Cross validation scores: [0.76       0.925      0.925      0.76       0.925      0.925
 0.76       0.925      0.925      0.94222222 0.93632279 0.93688187
 0.94088889 0.93544631 0.93941845 0.94252991 0.9368475  0.93649311
 0.86793955 0.86575031 0.8719752  0.86916877 0.86425856 0.87165903
 0.86896581 0.8657069  0.87282453]
Best cross validation score: 94.25%
Optimal parameters: {'classifier__max_depth': 4, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 12, 'classifier__n_estimators': 150}


In [16]:
params_rf4 = {'RandomForest': [{
    'classifier__max_depth': [4], 
    'classifier__min_samples_leaf':[1], 
    'classifier__min_samples_split': [10], 
    'classifier__n_estimators': [200, 225, 250]
}]}
run_gridsearch(params_rf4, name='RandomForest4')

RandomForest Grid Search:
Time Started: Wed Jan 26 07:57:38 2022
Time Finished: Wed Jan 26 07:58:42 2022
Cross validation scores: [0.93544631 0.92849587 0.93631117]
Best cross validation score: 93.63%
Optimal parameters: {'classifier__max_depth': 4, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 250}


In [20]:
rf_best_params = tuned_params['RandomForest4']
%store rf_best_params
rf_best_params

Stored 'rf_best_params' (dict)


{'classifier__max_depth': 4,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 10,
 'classifier__n_estimators': 250}

#### Extra Trees Grid Search

In [18]:
params_et1 = {'ExtraTrees': [{
    'classifier__n_estimators':[10, 30, 100],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[None, 2, 6, 10],
    'classifier__min_samples_split':[5, 10],
    'classifier__min_samples_leaf':[3, 6]
}]}
run_gridsearch(params_et1, name='ExtraTrees1')

ExtraTrees Grid Search:
Time Started: Tue Jan 25 21:44:45 2022
Time Finished: Tue Jan 25 22:01:43 2022
Cross validation scores: [0.73740968 0.77556846 0.7925857  0.74504837 0.782159   0.79571527
 0.76126053 0.79273261 0.80310892 0.76126053 0.79273261 0.80310892
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.82478932 0.87431847 0.86278711 0.80954916 0.86936681 0.87826212
 0.82737153 0.87755512 0.88941167 0.82737153 0.87755512 0.88941167
 0.78505397 0.81441699 0.82161139 0.77822105 0.81163727 0.82434729
 0.79394288 0.82157933 0.82642731 0.79394288 0.82157933 0.82642731
 0.74517303 0.77959971 0.78617052 0.74563427 0.78031464 0.79048635
 0.77092441 0.79505357 0.80671093 0.77092441 0.79505357 0.80671093
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.8425626  0.8760777  0.88039135 0.8284872  0.87258835 0.8910092
 0.82959337 0.88682954 0.89616994 

In [19]:
params_extra2 = {'ExtraTrees': [{
    'regressor__n_estimators':[100, 200, 250],
    'regressor__criterion':['gini', 'entropy'],
    'regressor__max_depth':[3, 5, 6],
    'regressor__min_samples_split':[8, 10, 15],
    'regressor__min_samples_leaf':[1, 2, 3]
    }]}
run_gridsearch(params_et1, name='ExtraTrees2')

ExtraTrees Grid Search:
Time Started: Tue Jan 25 22:01:43 2022
Time Finished: Tue Jan 25 22:18:49 2022
Cross validation scores: [0.73740968 0.77556846 0.7925857  0.74504837 0.782159   0.79571527
 0.76126053 0.79273261 0.80310892 0.76126053 0.79273261 0.80310892
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.82478932 0.87431847 0.86278711 0.80954916 0.86936681 0.87826212
 0.82737153 0.87755512 0.88941167 0.82737153 0.87755512 0.88941167
 0.78505397 0.81441699 0.82161139 0.77822105 0.81163727 0.82434729
 0.79394288 0.82157933 0.82642731 0.79394288 0.82157933 0.82642731
 0.74517303 0.77959971 0.78617052 0.74563427 0.78031464 0.79048635
 0.77092441 0.79505357 0.80671093 0.77092441 0.79505357 0.80671093
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.8425626  0.8760777  0.88039135 0.8284872  0.87258835 0.8910092
 0.82959337 0.88682954 0.89616994 

In [20]:
et_best_params = tuned_params['ExtraTrees2']
%store et_best_params

Stored 'et_best_params' (dict)


#### Gradient Boost Grid Search

In [9]:
params_gb1 = {'GradientBoost': [{
    'classifier__loss': ['deviance', 'exponential'],
    'classifier__n_estimators':[10, 30, 100],
    'classifier__criterion':['friedman_mse', 'squared_error'],
    'classifier__max_depth':[2, 6, 10],
    'classifier__min_samples_split':[5, 10],
    'classifier__min_samples_leaf':[3, 6]
}]}
run_gridsearch(params_gb1, name='GradientBoost1')

GradientBoost Grid Search:
Time Started: Wed Jan 26 01:49:49 2022
Time Finished: Wed Jan 26 02:48:26 2022
Cross validation scores: [0.85553137 0.77503382 0.74857846 0.85553137 0.77503382 0.74857846
 0.85553137 0.77503382 0.74938491 0.85553137 0.77503382 0.74938491
 0.79561829 0.73242356 0.72290551 0.79514218 0.72887847 0.71640555
 0.79210031 0.73703701 0.72484727 0.79210031 0.73703701 0.72484727
 0.74241109 0.69684654 0.69142404 0.7505717  0.70381195 0.69806752
 0.76188974 0.70538981 0.6881066  0.76188974 0.70538981 0.6881066
 0.         0.78394175 0.74465963 0.         0.78394175 0.74465963
 0.         0.78394175 0.74457971 0.         0.78394175 0.74457971
 0.8342102  0.73884904 0.72459949 0.83543923 0.74203365 0.72723842
 0.83621221 0.73775579 0.72622219 0.83621221 0.73775579 0.72622219
 0.77534416 0.71536566 0.70355581 0.78102891 0.72179612 0.71220472
 0.78685156 0.71706051 0.70559187 0.78685156 0.71706051 0.70559187
        nan        nan        nan        nan        nan        nan

In [10]:
params_gb2 = {'GradientBoost': [{
    'classifier__loss': ['exponential'],
    'classifier__n_estimators':[100, 200, 300],
    'classifier__criterion':['friedman_mse'],
    'classifier__max_depth':[5, 6, 8],
    'classifier__min_samples_split':[8, 10, 13],
    'classifier__min_samples_leaf':[1, 3, 4]
}]}
run_gridsearch(params_gb2, name='GradientBoost2')

GradientBoost Grid Search:
Time Started: Wed Jan 26 02:48:26 2022


ValueError: Invalid parameter regressor for estimator Pipeline(steps=[('col_transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('grab_num',
                                                                   FunctionTransformer(func=<function grab_numeric at 0x7ff233c67b80>)),
                                                                  ('num_impute',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 31, 32]),
                                                 ('cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  [21, 22, 23, 24, 25, 26, 27,
                                                   28, 29, 30, 33, 34])])),
                ('classifier',
                 GradientBoostingClassifier(random_state=20220126))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [11]:
gb_best_params = tuned_params['GradientBoost1']
%store gb_best_params

Stored 'gb_best_params' (dict)


# Contact <a class="anchor" id="Contact"></a>

Any inquiries, please contact the contributors of this analysis:   
>[Czarina Luna](https://www.linkedin.com/in/czarinagluna)  
[Justin Sohn](https://www.linkedin.com/in/justin-sohn-689901193/)  
[Weston Shuken](https://www.linkedin.com/in/westonshuken/)