# Grid Search 
[Predicting H1N1 Vaccination Status](https://github.com/westonshuken/h1n1-and-flu-vaccine-predictor/blob/main/index.ipynb)

Authors: Czarina Luna, Weston Shuken, Justin Sohn

In [5]:
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
labels = pd.read_csv('data/training_set_labels.csv')
features = pd.read_csv('data/training_set_features.csv')
data = pd.merge(features, labels, on="respondent_id").drop(columns='respondent_id')

data['health_insurance'] = data['health_insurance'].fillna(3)
data[['employment_industry', 'employment_occupation']] = data[['employment_industry', 'employment_occupation']].fillna('Unknown')

target_variable = 'h1n1_vaccine'
other_target = 'seasonal_vaccine'

In [3]:
def split_train_test(df=data, target=target_variable):
    X = df.drop(columns=[target, other_target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20211122, stratify=y)
    return X_train, X_test, y_train, y_test

In [6]:
def grab_numeric(df):
    return df.select_dtypes(['float64', 'int64'])

GrabNumeric = FunctionTransformer(grab_numeric)

numeric_transformer = Pipeline(
    steps=[('grab_num', GrabNumeric), 
           ('num_impute', SimpleImputer(strategy='median', add_indicator=True)), 
           ('scaler', StandardScaler())])

num_features = list(range(0, 21)) + [31, 32]
cat_features = list(range(21, 31)) + [33, 34]

categorical_transformer = Pipeline(
    steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
           ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, num_features),
                  ("cat", categorical_transformer, cat_features)], remainder='passthrough')

sm = SMOTE(sampling_strategy='minority', random_state=20211122)

In [7]:
def run_model(dict_model, df=data, target=target_variable):
    """
    Run model
    """
    X_train, X_test, y_train, y_test = split_train_test(df, target)
    
    metric_table = pd.DataFrame(columns=['Model', 'CV Score', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC Score'])
    
    for name, model in dict_model.items():
        print(f'Running... {name} Model:')
        pipeline = ImPipeline(steps=[('preprocessor', preprocessor), 
                                     ('smote', sm),
                                     ('classifier', model['classifier'])])
        
        pipeline.fit(X_train, y_train)
        
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
        cv_scores_mean = np.mean(cv_scores)
        cv_scores_median = np.median(cv_scores)
        
        y_pred = pipeline.predict(X_test)
        
        accuracy = round(accuracy_score(y_test, y_pred), 4)
        precision = round(precision_score(y_test, y_pred), 4)
        recall = round(recall_score(y_test, y_pred), 4)
        f1 = round(f1_score(y_test, y_pred), 4)
        auc_score = round(roc_auc_score(y_test, y_pred), 4)
        
        print(f'Cross Validation Scores: {cv_scores}.')
        print(f'Mean Cross Validation Score: {cv_scores_mean}.')
        print(f'Median Cross Validation Score: {cv_scores_median}.')
        print(f'Test Accuracy Score: {cv_scores}.\n')
        
        metric_table = metric_table.append({'Model': name, 'CV Score': cv_scores_median, 
                                            'Accuracy': accuracy, 'Precision': precision, 
                                            'Recall': recall, 'F1 Score': f1,
                                            'AUC Score': auc_score}, ignore_index=True) 
        
    return metric_table

In [17]:
tuned_params = {}

baseline_models = {'LogisticRegression': {'classifier': LogisticRegression(random_state=20211122, max_iter=1000)},
                   'DecisionTree': {'classifier': DecisionTreeClassifier(random_state=20211122)},
                   'RandomForest': {'classifier': RandomForestClassifier(random_state=20211122)},
                   'ExtraTrees': {'classifier': ExtraTreesClassifier(random_state=20211122)},
                   'GradientBoost': {'classifier': GradientBoostingClassifier(random_state=20211122)}}

def run_gridsearch(params, name, models=baseline_models, df=data, target=target_variable):
    """
    Run GridSearchCV
    """
    X_train, X_test, y_train, y_test = split_train_test(df, target)
    
    for model, grid in params.items():
        print(f'Running... {model} Grid Search:')
        print(f'Time Started: {time.asctime()}')
        
        pipeline = ImPipeline(steps=[('preprocessor', preprocessor), 
                                     ('smote', sm),
                                     ('classifier', models[model]['classifier'])])
        
        gridsearch = GridSearchCV(estimator=pipeline, param_grid=grid, scoring='accuracy', cv=5, error_score='raise')
        gridsearch.fit(X_train, y_train)
        
        print(f'Time Finished: {time.asctime()}\n')
        print(f'Mean Cross Validation Score: {np.mean(gridsearch.cv_results_["mean_test_score"])}')
        print(f'Median Cross Validation Score: {np.median(gridsearch.cv_results_["mean_test_score"])}')
        print(f'Optimal parameters: {gridsearch.best_params_}')
        
        tuned_params[name] = gridsearch.best_params_

#### Logistic Regression Grid Search

In [18]:
params_lr1 = {'LogisticRegression': [{
    'classifier__C':[0.001, 0.1, 1],
    'classifier__solver':['lbfgs', 'saga'],
    'classifier__fit_intercept':[True, False]
}]}
run_gridsearch(params_lr1, name='LogisticRegression1')

Running... LogisticRegression Grid Search:
Time Started: Wed Jan 26 22:44:52 2022
Time Finished: Wed Jan 26 22:55:58 2022

Mean Cross Validation Score: 0.7875811283075387
Median Cross Validation Score: 0.7884173739390914
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'lbfgs'}


In [19]:
params_lr2 = {'LogisticRegression': [{
    'classifier__C':[1, 1e10, 1e100],
    'classifier__solver':['lbfgs'],
    'classifier__fit_intercept':[True]
}]}
run_gridsearch(params_lr2, name='LogisticRegression2')

Running... LogisticRegression Grid Search:
Time Started: Wed Jan 26 22:58:52 2022
Time Finished: Wed Jan 26 23:00:39 2022

Mean Cross Validation Score: 0.7889831918788485
Median Cross Validation Score: 0.7889166250624065
Optimal parameters: {'classifier__C': 1, 'classifier__fit_intercept': True, 'classifier__solver': 'lbfgs'}


In [20]:
lr_best_params_cza = tuned_params['LogisticRegression2']
%store lr_best_params_cza

Stored 'lr_best_params_cza' (dict)


#### Decision Tree Grid Search

In [21]:
params_dt1 = {'DecisionTree': [{
    'classifier__criterion':['gini', 'entropy'],
    'classifier__splitter':['best', 'random'],
    'classifier__max_depth':[2, 6, 10, 15],
    'classifier__min_samples_split': [5, 10],
    'classifier__min_samples_leaf': [3, 6],
}]}
run_gridsearch(params_dt1, name='DecisionTree1')

Running... DecisionTree Grid Search:
Time Started: Wed Jan 26 23:02:34 2022
Time Finished: Wed Jan 26 23:24:48 2022

Mean Cross Validation Score: 0.7876333936595108
Median Cross Validation Score: 0.8087119321018472
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 5, 'classifier__splitter': 'best'}


In [22]:
params_dt2 = {'DecisionTree': [{
    'classifier__criterion':['gini'],
    'classifier__splitter':['best'],
    'classifier__max_depth':[8, 10, 12],
    'classifier__min_samples_split': [3, 5, 8],
    'classifier__min_samples_leaf': [1, 3, 4],
}]}
run_gridsearch(params_dt2, name='DecisionTree2')

Running... DecisionTree Grid Search:
Time Started: Wed Jan 26 23:29:54 2022
Time Finished: Wed Jan 26 23:40:13 2022

Mean Cross Validation Score: 0.8227251715020062
Median Cross Validation Score: 0.8233150274588118
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 3, 'classifier__splitter': 'best'}


In [24]:
dt_best_params_cza = tuned_params['DecisionTree2']
%store dt_best_params_cza

Stored 'dt_best_params_cza' (dict)


#### Random Forest Grid Search

In [25]:
params_rf1 = {'RandomForest': [{
    'classifier__n_estimators':[10, 30, 100],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[None, 2, 6, 10],
    'classifier__min_samples_split':[5, 10],
    'classifier__min_samples_leaf':[3, 6]
}]}
run_gridsearch(params_rf1, name='RandomForest1')

Running... RandomForest Grid Search:
Time Started: Wed Jan 26 23:46:51 2022
Time Finished: Thu Jan 27 00:32:46 2022

Mean Cross Validation Score: 0.8263302962223332
Median Cross Validation Score: 0.8323764353469794
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}


In [27]:
params_rf2 = {'RandomForest': [{
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[None, 1, 3], 
    'classifier__min_samples_leaf':[1, 2, 3], 
    'classifier__min_samples_split':[8, 10, 13], 
    'classifier__n_estimators':[100, 150]
}]}
run_gridsearch(params_rf2, name='RandomForest2')

Running... RandomForest Grid Search:
Time Started: Thu Jan 27 00:35:40 2022
Time Finished: Thu Jan 27 01:40:56 2022

Mean Cross Validation Score: 0.8184519517020766
Median Cross Validation Score: 0.8104593110334498
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 8, 'classifier__n_estimators': 100}


In [28]:
params_rf3 = {'RandomForest': [{
    'classifier__max_depth':[None], 
    'classifier__min_samples_leaf':[3], 
    'classifier__min_samples_split':[7, 8, 9], 
    'classifier__n_estimators':[100, 120]
}]}
run_gridsearch(params_rf3, name='RandomForest3')

Running... RandomForest Grid Search:
Time Started: Thu Jan 27 01:42:32 2022
Time Finished: Thu Jan 27 01:49:06 2022

Mean Cross Validation Score: 0.8498751872191713
Median Cross Validation Score: 0.8497503744383426
Optimal parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 9, 'classifier__n_estimators': 100}


In [29]:
params_rf4 = {'RandomForest': [{
    'classifier__max_depth': [None], 
    'classifier__min_samples_leaf':[3], 
    'classifier__min_samples_split':[9, 10, 11], 
    'classifier__n_estimators':[100]
}]}
run_gridsearch(params_rf4, name='RandomForest4')

Running... RandomForest Grid Search:
Time Started: Thu Jan 27 01:54:52 2022
Time Finished: Thu Jan 27 01:57:40 2022

Mean Cross Validation Score: 0.8504742885671491
Median Cross Validation Score: 0.8507738392411384
Optimal parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}


In [30]:
params_rf5 = {'RandomForest': [{
    'classifier__criterion':['entropy'],
    'classifier__max_depth':[None], 
    'classifier__min_samples_leaf':[3], 
    'classifier__min_samples_split': [1, 10], 
    'classifier__n_estimators':[100],
    'classifier__max_features':['sqrt', 'log2'],
    'classifier__class_weight':['balanced', 'balanced_subsample']
}]}
run_gridsearch(params_rf5, name='RandomForest5')

Running... RandomForest Grid Search:
Time Started: Thu Jan 27 02:03:10 2022
Time Finished: Thu Jan 27 02:09:36 2022

Mean Cross Validation Score: nan
Median Cross Validation Score: nan
Optimal parameters: {'classifier__class_weight': 'balanced', 'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}


In [31]:
rf_best_params_cza = tuned_params['RandomForest4']
%store rf_best_params_cza

Stored 'rf_best_params_cza' (dict)


#### Extra Trees Grid Search

In [32]:
params_et1 = {'ExtraTrees': [{
    'classifier__n_estimators':[10, 30, 100],
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[None, 2, 6, 10],
    'classifier__min_samples_split':[5, 10],
    'classifier__min_samples_leaf':[3, 6]
}]}
run_gridsearch(params_et1, name='ExtraTrees1')

Running... ExtraTrees Grid Search:
Time Started: Thu Jan 27 02:15:04 2022
Time Finished: Thu Jan 27 03:03:09 2022

Mean Cross Validation Score: 0.8066634423364952
Median Cross Validation Score: 0.8133549675486771
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [33]:
params_extra2 = {'ExtraTrees': [{
    'regressor__n_estimators':[100, 200],
    'regressor__criterion':['gini'],
    'regressor__max_depth':[None, 1, 3],
    'regressor__min_samples_split':[1, 3, 5],
    'regressor__min_samples_leaf':[1, 2, 3]
    }]}
run_gridsearch(params_et1, name='ExtraTrees2')

Running... ExtraTrees Grid Search:
Time Started: Thu Jan 27 03:07:00 2022
Time Finished: Thu Jan 27 03:52:14 2022

Mean Cross Validation Score: 0.8066634423364952
Median Cross Validation Score: 0.8133549675486771
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [None]:
et_best_params_cza = tuned_params['ExtraTrees2']
%store et_best_params_cza

#### Gradient Boost Grid Search

In [34]:
params_gb1 = {'GradientBoost': [{
    'classifier__loss': ['deviance', 'exponential'],
    'classifier__n_estimators':[10, 30, 100],
    'classifier__criterion':['friedman_mse', 'squared_error'],
    'classifier__max_depth':[2, 6, 10],
    'classifier__min_samples_split':[5, 10],
    'classifier__min_samples_leaf':[3, 6],
}]}
run_gridsearch(params_gb1, name='GradientBoost1')

Running... GradientBoost Grid Search:
Time Started: Thu Jan 27 03:52:14 2022
Time Finished: Thu Jan 27 06:12:28 2022

Mean Cross Validation Score: nan
Median Cross Validation Score: nan
Optimal parameters: {'classifier__criterion': 'friedman_mse', 'classifier__loss': 'deviance', 'classifier__max_depth': 6, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [None]:
params_gb2 = {'GradientBoost': [{
    'classifier__loss': ['exponential'],
    'classifier__n_estimators':[100, 200, 300],
    'classifier__criterion':['friedman_mse'],
    'classifier__max_depth':[5, 6, 8],
    'classifier__min_samples_split':[8, 10, 13],
    'classifier__min_samples_leaf':[1, 3, 4]
}]}
run_gridsearch(params_gb2, name='GradientBoost2')

Running... GradientBoost Grid Search:
Time Started: Thu Jan 27 06:12:28 2022


In [None]:
gb_best_params_cza = tuned_params['GradientBoost1']
%store gb_best_params_cza

# Contact <a class="anchor" id="Contact"></a>

Any inquiries, please contact the contributors of this analysis:   
>[Czarina Luna](https://www.linkedin.com/in/czarinagluna)  
[Justin Sohn](https://www.linkedin.com/in/justin-sohn-689901193/)  
[Weston Shuken](https://www.linkedin.com/in/westonshuken/)