# Grid Search 
##### Authors: Czarina Luna, Weston Shuken, Justin Sohn

In [1]:
import pandas as pd
# Labels are loaded first so they can be joined onto the feature table.
target = pd.read_csv('data/training_set_labels.csv')

# Read the features, attach the labels by respondent, then discard the join key
# and the seasonal-vaccine label so only the h1n1 label remains alongside the features.
data = (
    pd.read_csv('data/training_set_features.csv')
    .merge(target, on="respondent_id")
    .drop(columns=['respondent_id', 'seasonal_vaccine'])
)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score, accuracy_score, precision_score, f1_score, roc_auc_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [3]:
def grab_numeric(df):
    """Return a frame holding only the ``float64`` and ``int64`` columns of ``df``."""
    numeric_dtypes = ['float64', 'int64']
    return df.select_dtypes(include=numeric_dtypes)

# Wrap grab_numeric as a transformer so it can be used as a pipeline step.
GrabNumeric = FunctionTransformer(func=grab_numeric)

In [4]:
# Positional column indices handed to the ColumnTransformer:
# 0-20, 31 and 32 go through the numeric branch; 21-30, 33 and 34 through the categorical one.
num_features = [*range(21), 31, 32]
cat_features = [*range(21, 31), 33, 34]

In [5]:
# Numeric branch: keep float64/int64 columns, median-impute (appending
# missing-value indicator columns), then standardise.
numeric_transformer = Pipeline(steps=[
    ('grab_num', GrabNumeric),
    ('num_impute', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot encode
# to a dense array; levels unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Route each positional column group to its branch; any column not listed in
# either group is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ],
    remainder='passthrough',
)

In [6]:
def data_split(data, target, test_size=.25, random_state=10151997):
    """Split a labelled frame into train/test features and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the feature columns and the ``target`` column.
    target : str
        Name of the label column; it is removed from the features.
    test_size : float, default .25
        Passed to ``train_test_split``; the default keeps the notebook's
        original 75/25 split.
    random_state : int, default 10151997
        Seed passed to ``train_test_split``; the default reproduces the
        notebook's original split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = data.drop(columns=target)
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = data_split(data, 'h1n1_vaccine')

In [8]:
X_train_processed = preprocessor.fit_transform(X_train)

In [None]:
def run_gridsearch(models, params, data, target):
    """Run a precision-scored grid search for every model that has a parameter grid.

    Parameters
    ----------
    models : dict
        Maps a model name to a dict with keys ``'preprocessor'`` and
        ``'regressor'``; these become the ``'col_transformer'`` and
        ``'regressor'`` steps of the searched pipeline.
    params : dict
        Maps a model name to the ``param_grid`` given to ``GridSearchCV``.
        Names that are missing from ``models`` are skipped.
    data : pandas.DataFrame
        Frame holding the features and the ``target`` column.
    target : str
        Name of the label column.

    Returns
    -------
    dict
        Maps each searched model name to its fitted ``GridSearchCV`` so the
        results remain available after the summary is printed.
    """
    X_train, X_test, y_train, y_test = data_split(data, target)
    searches = {}
    for model, grid in params.items():
        if model not in models:
            continue
        print(model, 'Grid Search:')
        pipe = Pipeline(steps=[('col_transformer', models[model]['preprocessor']),
                               ('regressor', models[model]['regressor'])])
        gridsearch = GridSearchCV(estimator=pipe, param_grid=grid, scoring='precision', cv=5)
        gridsearch.fit(X_train, y_train)
        print(f'Cross validation mean: {gridsearch.cv_results_["mean_test_score"]}')
        # best_score_ is the mean cross-validated score of the best candidate,
        # not a held-out score, so it is labelled as such.
        print(f'Best cross validation precision: {gridsearch.best_score_ :.2%}')
        # score() applies the search's `scoring` metric to the refit best
        # estimator, giving the actual precision on the held-out split.
        print(f'Test precision: {gridsearch.score(X_test, y_test) :.2%}')
        print(f'Optimal parameters: {gridsearch.best_params_}')
        searches[model] = gridsearch
    return searches

#### Decision Tree Grid Search

In [9]:
# Decision tree with a fixed seed so the search is repeatable.
DT_clf = DecisionTreeClassifier(random_state=42)

# Wider grid, currently disabled:
# params = {'criterion':['gini', 'entropy'],
#           'splitter':['best', 'random'],
#           'max_depth':[2, 6, 10],
#           'min_samples_split':[5, 10],
#           'min_samples_leaf':[3, 6]}

# Active grid: only the split criterion is searched.
params = {'criterion': ['gini', 'entropy']}

# 10-fold cross-validated search scored by ROC AUC.
DT_GS = GridSearchCV(
    estimator=DT_clf,
    param_grid=params,
    scoring='roc_auc',
    cv=10,
)

In [10]:
# Fit the search on the preprocessed training matrix, then report the winning
# parameter set followed by its mean cross-validated score.
DT_GS.fit(X_train_processed, y_train)
for summary_item in (DT_GS.best_params_, DT_GS.best_score_):
    print(summary_item)

{'criterion': 'entropy'}
0.6791901845688555


In [13]:
# Keep the best decision-tree parameters found by the search under a dedicated name.
dt_params = DT_GS.best_params_
# IPython `%store` magic: persists dt_params so another kernel can restore it with `%store -r`.
%store dt_params

Stored 'dt_params' (dict)


#### Logistic Regression Grid Search

#### Random Forest Grid Search

In [None]:
# Grid keyed in `<step>__<param>` form.
# NOTE(review): presumably meant for a pipeline with steps named 'rfc' and 'sm';
# no such pipeline is built in the visible cells - confirm before use.
parameters = {
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__min_samples_leaf': [1, 5, 10],
    'sm__k_neighbors': [3, 5, 9],
}

In [None]:
# Random forest with a fixed seed so the search is repeatable.
RF_clf = RandomForestClassifier(random_state=42)

# 3 * 2 * 3 * 2 * 2 = 72 candidate settings.
params = {
    'n_estimators': [10, 30, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 6, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 6],
}

In [None]:
# 10-fold cross-validated random-forest search scored by ROC AUC.
gs = GridSearchCV(
    estimator=RF_clf,
    param_grid=params,
    scoring='roc_auc',
    cv=10,
)

In [None]:
# Fit on the preprocessed training matrix and labels that the notebook actually
# builds; `X_train_clean` / `y_train_clean` are never defined here, so the
# original call raised NameError on a fresh kernel.
gs.fit(X_train_processed, y_train)
print(gs.best_params_)
print(gs.best_score_)

#### Gradient Boost Grid Search

In [None]:
# Gradient boosting with a fixed seed so the search is repeatable.
GB_clf = GradientBoostingClassifier(random_state=42)

# 2 * 3 * 2 * 3 * 2 * 2 = 144 candidate settings.
# NOTE(review): newer scikit-learn releases rename the 'deviance' loss to
# 'log_loss' - confirm against the installed version before upgrading.
params = {
    'loss': ['deviance', 'exponential'],
    'n_estimators': [10, 30, 100],
    'criterion': ['friedman_mse', 'squared_error'],
    'max_depth': [2, 6, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 6],
}

# 10-fold cross-validated search scored by ROC AUC.
GB_GS = GridSearchCV(
    estimator=GB_clf,
    param_grid=params,
    scoring='roc_auc',
    cv=10,
)

In [None]:
# Fit on the preprocessed training matrix and labels that the notebook actually
# builds; `X_train_clean` / `y_train_clean` are never defined here, so the
# original call raised NameError on a fresh kernel.
GB_GS.fit(X_train_processed, y_train)
print(GB_GS.best_params_)
print(GB_GS.best_score_)

# Contact <a class="anchor" id="Contact"></a>

For any inquiries, please contact the contributors of this analysis:   
>[Czarina Luna](https://www.linkedin.com/in/czarinagluna)  
[Justin Sohn](https://www.linkedin.com/in/justin-sohn-689901193/)  
[Weston Shuken](https://www.linkedin.com/in/westonshuken/)