# Tuning Voting Models with Hyper-Parameters
- [very useful link that explains tuning the hyper-parameters](https://scikit-learn.org/stable/modules/grid_search.html#grid-search)
- [Parameter estimation using grid search with cross-validation](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html)

Since tuning hyper-parameters is time-consuming, this notebook tests each model used in the voting classifier separately to find its best parameters, so that those parameters can be reused later without re-running the search over every model and every parameter combination that might improve the test score.

In [62]:
import pandas as pd
import numpy as np
import pickle
import time
import json

# models 
from sklearn import ensemble
from sklearn import gaussian_process
from sklearn import linear_model 
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn import svm
from sklearn import tree
from sklearn import discriminant_analysis
from sklearn import model_selection


In [2]:
# loading variables from the other Notebook
# https://ipython.org/ipython-doc/rel-0.12/config/extensions/storemagic.html
# How the restored names are used further down in this notebook:
#   train_set_cpy         - indexed with train_x_bins / Target to get the features and labels
#   Target, train_x_bins  - column selectors applied to train_set_cpy
#   cross_validator_split - passed as `cv=` to GridSearchCV and cross_validate
# NOTE(review): these must have been saved with `%store <name>` in the other
# notebook first; otherwise every later cell fails on a fresh kernel.
%store -r train_set_cpy
%store -r Target
%store -r train_x_bins
%store -r cross_validator_split

In [51]:
# Shared candidate values reused by the per-estimator parameter grids below.
grid_n_estimator = [10, 50, 100, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.03, .05, .1, .25]
grid_max_depth = [2, 3, 4, 6, 8, None]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
min_samples_leaf = [1, 5, 10, .03, .05]
# 'auto' is intentionally not listed: for the tree-based classifiers tuned in
# this notebook it is an alias of 'sqrt', so searching both repeats identical
# fits, and scikit-learn deprecated it in 1.1 and removed it in 1.3.
max_features = [None, 'log2', 'sqrt']
# Single fixed seed so every search is reproducible.
grid_seed = [0]


# 1 |  Ensemble Methods

In [53]:
#http://scikit-learn.org/stable/modules/ensemble.html
# (short name, unfitted estimator) pairs for the ensemble family.
# The order must match param_grid_1: findBestParam pairs the two lists
# positionally with zip().
est_1 = [
    ('abc',ensemble.AdaBoostClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('etc', ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('rfc', ensemble.RandomForestClassifier())
]

In [54]:

# One parameter grid per estimator in est_1, in the same order
# (AdaBoost, Bagging, ExtraTrees, GradientBoosting, RandomForest).
# In the recorded run the ExtraTrees and GradientBoosting grids dominated the
# runtime (about 2184 s and 3892 s respectively).
param_grid_1 = [
            # AdaBoostClassifier
            # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html?highlight=adaboostclassifier#sklearn.ensemble.AdaBoostClassifier
            {
            'n_estimators': grid_n_estimator, # default = 50
            'learning_rate': grid_learn, # default = 1
            'algorithm': ['SAMME.R'], # default = 'SAMME.R'
            'random_state': grid_seed
            },
            # BaggingClassifier
            # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
            {
            'n_estimators' : grid_n_estimator, # default = 10
            'max_features' : grid_ratio,
            'random_state' : grid_seed
            },
            {
            # ExtraTreesClassifier
            # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
            'n_estimators' : [100, 300 , 500], # default = 100
            'criterion' : grid_criterion,
            'max_depth' : grid_max_depth, # default = None
            'min_samples_leaf' : min_samples_leaf,
            'max_features' : max_features,
            'random_state' : grid_seed
            },
            {
            # GradientBoostingClassifier
            # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
            'learning_rate' :grid_learn,
            'n_estimators' : [100, 300 , 500], # default = 100
            'min_samples_leaf' : min_samples_leaf,
            'max_depth' : grid_max_depth,
            'max_features' : max_features,
            'random_state' : grid_seed
            },
            {
            # RandomForestClassifier
            # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
            'n_estimators' : [100, 300 , 500], # default = 100
            'criterion' : grid_criterion,
            'max_depth' : grid_max_depth,
            'oob_score' : [True],
            'random_state' : grid_seed
            }
            ]

In [55]:
# Sanity check: show which grid zip() lines up with which estimator
# before starting the (slow) searches.
for (short_name, model), grid in zip(est_1, param_grid_1):
    header = "Name : {}\nAlgorithm: {}".format(short_name, model)
    print(header)
    print("Parameters grid : {}".format(grid))
    print('-'*30)

Name : abc
Algorithm: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
Parameters grid : {'n_estimators': [10, 50, 100, 300], 'learning_rate': [0.03, 0.05, 0.1, 0.25], 'algorithm': ['SAMME.R'], 'random_state': [0]}
------------------------------
Name : bc
Algorithm: BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)
Parameters grid : {'n_estimators': [10, 50, 100, 300], 'max_features': [0.1, 0.25, 0.5, 0.75, 1.0], 'random_state': [0]}
------------------------------
Name : etc
Algorithm: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,


In [59]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
# Best parameters per estimator class name. Filled in by findBestParam() and
# written to disk by save() in the "Save the results in a file" section.
# Re-running this cell resets the dict and discards earlier results.
save_best_params = {}


def findBestParam(est , param_grid):
    """Run a cross-validated grid search for every estimator in ``est``.

    ``est`` and ``param_grid`` are paired positionally: the i-th grid is
    searched for the i-th estimator. Every search uses ``scoring='roc_auc'``
    with the notebook-level ``cross_validator_split`` and is fitted on
    ``train_set_cpy[train_x_bins]`` against ``train_set_cpy[Target]``.

    Parameters
    ----------
    est : sequence of (name, estimator) pairs
        Only the estimator (second item) is used; results are keyed by the
        estimator's class name.
    param_grid : sequence of dict
        One GridSearchCV parameter grid per estimator, in the same order.

    Returns
    -------
    dict
        Best parameters found during this call, keyed by estimator class
        name. The same entries are also stored in the module-level
        ``save_best_params``.

    Raises
    ------
    ValueError
        If ``est`` and ``param_grid`` differ in length. zip() would otherwise
        silently skip the unmatched estimators or grids.
    """
    est = list(est)
    param_grid = list(param_grid)
    if len(est) != len(param_grid):
        raise ValueError(
            "est and param_grid must have the same length, got {} and {}".format(
                len(est), len(param_grid)))

    found = {}
    start_total = time.perf_counter()
    for clf, param in zip(est, param_grid):
        model = clf[1]
        start = time.perf_counter()
        tune_model = model_selection.GridSearchCV(model,
                                                  param_grid = param,
                                                  scoring = 'roc_auc',
                                                  cv = cross_validator_split,
                                                  return_train_score = True)

        # ravel() flattens the target to the 1-D shape scikit-learn expects
        tune_model.fit(train_set_cpy[train_x_bins],
                       train_set_cpy[Target].values.ravel())

        run = time.perf_counter() - start

        alg_name = model.__class__.__name__
        params = tune_model.best_params_

        print("The best parameters for {} are {} with runtime of {:.2f} seconds".format(
        alg_name, params , run))

        # record the result both for the caller and for the later save() call
        found[alg_name] = params
        save_best_params[alg_name] = params

    run_total = time.perf_counter() - start_total
    print("Total optimization time was {:.2f} minutes".format(run_total/60))
    print('-'*30)
    return found

In [60]:
# Tune the five ensemble estimators; the recorded run took about 105 minutes in total.
findBestParam(est_1,param_grid_1)

The best parameters for AdaBoostClassifier are {'algorithm': 'SAMME.R', 'learning_rate': 0.03, 'n_estimators': 300, 'random_state': 0} with runtime of 30.30 seconds
The best parameters for BaggingClassifier are {'max_features': 0.5, 'n_estimators': 300, 'random_state': 0} with runtime of 33.71 seconds
The best parameters for ExtraTreesClassifier are {'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 300, 'random_state': 0} with runtime of 2183.75 seconds
The best parameters for GradientBoostingClassifier are {'learning_rate': 0.05, 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 0.03, 'n_estimators': 100, 'random_state': 0} with runtime of 3892.33 seconds
The best parameters for RandomForestClassifier are {'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 300, 'oob_score': True, 'random_state': 0} with runtime of 188.59 seconds
Total optimization time was 105.48 minutes
------------------------------


# 2 |  Gaussian Processes

In [64]:
#http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
# (short name, unfitted estimator) pair for the Gaussian-process family;
# tuned against param_grid_2.
est_2 = [
    ('gpc', gaussian_process.GaussianProcessClassifier())
]

In [65]:
# GaussianProcessClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html?highlight=gaussianprocessclassifier#sklearn.gaussian_process.GaussianProcessClassifier
# Single grid, paired positionally with the single entry of est_2.
param_grid_2 = [
            
            {
            'max_iter_predict': grid_n_estimator, # default = 100
            'random_state': grid_seed
            }
            ]

In [66]:
# Tune the Gaussian process classifier (about 3 seconds in the recorded run).
findBestParam(est_2,param_grid_2)

The best parameters for GaussianProcessClassifier are {'max_iter_predict': 10, 'random_state': 0} with runtime of 3.00 seconds
Total optimization time was 0.05 minutes
------------------------------


# 3 |  GLM

In [75]:
#http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# (short name, unfitted estimator) pair for the linear-model family.
# NOTE(review): LogisticRegressionCV runs its own cross-validated search over C,
# so wrapping it in GridSearchCV nests one cross-validation inside another -
# confirm this is intended.
est_3 = [
    ('lr', linear_model.LogisticRegressionCV())
]

In [76]:
# LogisticRegressionCV
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
# Single grid, paired positionally with the single entry of est_3.
param_grid_3 = [
            {
            'fit_intercept': grid_bool, # default = True
            'solver': ['liblinear', 'sag', 'saga'], # default = 'lbfgs'
            'random_state' : grid_seed,
            }
            ]

In [77]:
# Tune the logistic regression model (about 8 seconds in the recorded run).
findBestParam(est_3,param_grid_3)

































The best parameters for LogisticRegressionCV are {'fit_intercept': True, 'random_state': 0, 'solver': 'saga'} with runtime of 7.85 seconds
Total optimization time was 0.13 minutes
------------------------------




# 4 |  Naive Bayes

In [None]:
#http://scikit-learn.org/stable/modules/naive_bayes.html
# (short name, unfitted estimator) pairs for the naive Bayes family.
# The order must match param_grid_4 (BernoulliNB first, GaussianNB second).
est_4 = [
    ('bnb', naive_bayes.BernoulliNB()),
    ('gnb', naive_bayes.GaussianNB())
]

In [61]:
# BernoulliNB
# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB

# GaussianNB
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

# First dict is the BernoulliNB grid. The empty dict is the GaussianNB entry:
# no parameters are varied, so it is evaluated with its defaults only.
param_grid_4 = [
            {
            'alpha': grid_ratio # default = 1.0
            },
            {}
            ]

In [None]:
# Tune the naive Bayes models (not executed in the saved notebook: no output recorded).
findBestParam(est_4,param_grid_4)

# 5 |  Nearest Neighbor

In [None]:
#http://scikit-learn.org/stable/modules/neighbors.html
# (short name, unfitted estimator) pair for the nearest-neighbour family;
# tuned against param_grid_5.
est_5 = [
    ('knc', neighbors.KNeighborsClassifier())
]

In [None]:
# KNeighborsClassifier
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier

# Single grid, paired positionally with the single entry of est_5.
# NOTE(review): 'algorithm' and 'leaf_size' look like neighbour-lookup settings
# rather than model settings - confirm they are worth searching over.
param_grid_5 = [
            {
            'n_neighbors': [1,2,3,4,5,6,7], # default = 5
            'weights' : ["uniform","distance"] ,
            'algorithm' : ['auto' , 'ball_tree' , 'kd_tree' , 'brute'] , # default = 'auto'
            'leaf_size' : [10,20,30,40,50] # default = 30
            }
            ]


In [None]:
# Tune the k-nearest-neighbours model (not executed in the saved notebook: no output recorded).
findBestParam(est_5,param_grid_5)

# 6 |  SVM

In [None]:
# SVC
#http://scikit-learn.org/stable/modules/svm.html
# (short name, unfitted estimator) pair for the SVM family.
# probability=True is set here and again in param_grid_6;
# presumably needed so the model can supply class probabilities - confirm.
est_6 = [
    ('svc', svm.SVC(probability=True))
]

In [None]:
# SVC
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

# Single grid, paired positionally with the single entry of est_6.
param_grid_6 = [
    {
        'C' : [1,2,3,4,5,6,7], # default = 1.0 (the comma after this entry was missing: SyntaxError)
        'gamma': ['scale' , 'auto'] ,
        'probability' : [True],  # same value as the constructor argument in est_6
        # NOTE(review): per the SVC docs this only matters for multi-class targets;
        # if Target is binary the two values merely double the grid - confirm.
        'decision_function_shape' : ['ovo','ovr'],
        'random_state' : grid_seed
    }
]

In [None]:
# Tune the support vector classifier (not executed in the saved notebook: no output recorded).
findBestParam(est_6,param_grid_6)

# 7 |  Decision Tree

In [None]:
# (short name, unfitted estimator) pair for the decision tree.
# param_grid_7 also searches 'max_depth' and 'min_samples_leaf', so the
# constructor values below are replaced by the grid's candidates during tuning.
est_7 = [
    ('dtc', tree.DecisionTreeClassifier(max_depth = 4,
                                       min_samples_leaf = 5))
]

In [None]:
# Decision Tree
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

# Single grid, paired positionally with the single entry of est_7.
param_grid_7 = [
    {
        'criterion' :  ['gini'],
        'splitter': ['best'],
        'max_depth': [2,4,6,None],
        'min_samples_split': [2,5,10],
        'min_samples_leaf': [1,5,10],
        # 'auto' is not listed: for DecisionTreeClassifier it is an alias of
        # 'sqrt' (duplicate fits), and scikit-learn removed it in version 1.3.
        'max_features': [None, 'log2', 'sqrt'],
        'random_state': grid_seed
    }
]

In [None]:
# Tune the decision tree (not executed in the saved notebook: no output recorded).
findBestParam(est_7,param_grid_7)

# Save the results in a file

In [74]:
# Show everything collected so far. The recorded output lists only the
# estimators whose searches had been run when this cell was executed.
print(save_best_params)

{'AdaBoostClassifier': {'algorithm': 'SAMME.R', 'learning_rate': 0.03, 'n_estimators': 300, 'random_state': 0}, 'BaggingClassifier': {'max_features': 0.5, 'n_estimators': 300, 'random_state': 0}, 'ExtraTreesClassifier': {'criterion': 'gini', 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 300, 'random_state': 0}, 'GradientBoostingClassifier': {'learning_rate': 0.05, 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 0.03, 'n_estimators': 100, 'random_state': 0}, 'RandomForestClassifier': {'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 300, 'oob_score': True, 'random_state': 0}, 'GaussianProcessClassifier': {'max_iter_predict': 10, 'random_state': 0}}


In [52]:
# https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/
#results_file = "BestParams" 
def save(dictionary, path='BestParams.txt'):
    """Write ``dictionary`` to ``path`` as JSON, overwriting any existing file.

    Parameters
    ----------
    dictionary : dict
        JSON-serializable mapping, e.g. ``save_best_params``.
    path : str, optional
        Destination file. Defaults to ``'BestParams.txt'`` in the current
        working directory, which is where the original version always wrote.
    """
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(dictionary, outfile)

# NOTE(review): this cell's recorded execution count (In [52]) is lower than
# those of the search cells, so the file on disk may predate the results.
# Re-run it after all searches have finished.
save(save_best_params)

In [None]:

#http://scikit-learn.org/stable/modules/ensemble.html
# Baseline: cross-validate an untuned AdaBoostClassifier with default parameters.
# NOTE(review): the printed labels say "DT" although the model is AdaBoost, and
# cross_validate is called without `scoring`, whereas the grid searches use
# 'roc_auc' - confirm what this baseline is meant to be compared against.
m1 = ensemble.AdaBoostClassifier()

# The target is flattened with .values.ravel() here as well, so cross_validate
# and fit receive the same 1-D labels.
model_results = model_selection.cross_validate(m1,
                                               train_set_cpy[train_x_bins],
                                               train_set_cpy[Target].values.ravel(),
                                               cv = cross_validator_split,
                                               return_train_score = True)


m1.fit(train_set_cpy[train_x_bins],train_set_cpy[Target].values.ravel())

print('BEFORE DT Parameters: ', m1.get_params())
print("BEFORE DT Training score mean: {:.2f}". format(model_results['train_score'].mean()*100)) 
print("BEFORE DT Test score mean: {:.2f}". format(model_results['test_score'].mean()*100))

print('-'*10)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
# Grid-search a DecisionTreeClassifier and report the scores of the best candidate.
# param_grid_7 is the decision-tree grid defined earlier; the bare name
# `param_grid` used before is not defined anywhere at notebook level.
tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(),
                                          param_grid = param_grid_7,
                                          scoring = 'roc_auc',
                                          cv = cross_validator_split,
                                          return_train_score = True)

# Flatten the target to 1-D, as the other fit calls in this notebook do.
tune_model.fit(train_set_cpy[train_x_bins],train_set_cpy[Target].values.ravel())

print('AFTER DT Parameters: ', tune_model.best_params_)
# mean train score of every candidate, then the one belonging to the best candidate
print(tune_model.cv_results_['mean_train_score'])
print('-'*10)
print("AFTER DT Training w/bin score mean: {:.2f}". format(tune_model.cv_results_['mean_train_score'][tune_model.best_index_]*100)) 
# same for the mean test (validation) score
print(tune_model.cv_results_['mean_test_score'])
print('-'*10)
print("AFTER DT Test w/bin score mean: {:.2f}". format(tune_model.cv_results_['mean_test_score'][tune_model.best_index_]*100))
print('-'*10)