<a href="https://colab.research.google.com/github/suryagokul/Data-Science-Portfolio/blob/master/Hyper_Parameter_Tuning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyper Parameter Tuning

          Techniques:
                     
                 1)RandomizedSearchCV
                  
                 2)GridSearchCV
                      
                 3)Automated Hyperparameter Tuning  Bayesian Optimization (hyperopt)

                 4)Genetic Algorithms (TPOT Classifier)

                 5)Optuna- Automate Hyperparameter Tuning






                    

In [3]:
from seaborn import load_dataset

import pandas as pd

from sklearn.datasets import load_breast_cancer

import numpy as np

In [4]:
bc = load_breast_cancer()


In [5]:
X = bc.data
y = bc.target

In [6]:
from sklearn.model_selection import train_test_split

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [86]:
y_test.shape

(188,)

In [87]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

**Before RandomizedSearchCV**

In [10]:
rf = RandomForestClassifier()

rf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)>

In [11]:
rf.fit(X_train,y_train)

r_pred = rf.predict(X_test)

rf.score(X_test,y_test)

0.9574468085106383

Our score is pretty good before hyperparameter tuning techniques like given above

# RandomizedSearchCV

In [12]:
params = {'criterion':['gini','entropy'],
          'n_estimators':[int(i) for i in np.linspace(150,2000,15)],
          'max_depth':[int(i) for i in [5,10,20,100,200]],
          'max_features':['auto','sqrt','log2'],
          'min_samples_leaf':[1,3,5,7,9,11,15],
          'min_samples_split':[5,7,8,10,12],
          }

In [13]:
rf_Cv = RandomizedSearchCV(rf,param_distributions=params,cv=5)

In [14]:
rf_Cv.fit(X_train,y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [15]:
rf_Cv.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 810}

In [16]:
rf_best = rf_Cv.best_estimator_

In [17]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [18]:
y_pred = rf_best.predict(X_test)

In [19]:
accuracy_score(y_test,y_pred)

0.9680851063829787

After applying RandomizedSearchCV  score get increased slightly

In [20]:
confusion_matrix(y_test,y_pred)

array([[ 63,   4],
       [  2, 119]])

In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        67
           1       0.97      0.98      0.98       121

    accuracy                           0.97       188
   macro avg       0.97      0.96      0.96       188
weighted avg       0.97      0.97      0.97       188



# GridSearchCV

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
params = {'criterion':['gini','entropy'],
          'n_estimators':[int(i) for i in np.linspace(50,80,2)],
          'max_depth':[int(i) for i in [10,20]],
          'max_features':['auto','sqrt','log2'],
          }

In [30]:
rf_grid = GridSearchCV(estimator=rf,param_grid=params,n_jobs=-1,cv=3,verbose=1)

In [31]:
rf_grid.fit(X_train,y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    7.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [32]:
rf_grid.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'n_estimators': 80}

In [99]:
gd_pred = rf_grid.predict(X_test)

In [100]:
accuracy_score(y_test,gd_pred)

0.9521276595744681

In [101]:
confusion_matrix(y_test,gd_pred)

array([[ 62,   5],
       [  4, 117]])

In [102]:
print(classification_report(y_test,gd_pred))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93        67
           1       0.96      0.97      0.96       121

    accuracy                           0.95       188
   macro avg       0.95      0.95      0.95       188
weighted avg       0.95      0.95      0.95       188



# Automated Hyperparameter Tuning
Automated Hyperparameter Tuning can be done by using techniques such as


1.  Bayesian Optimization
2.   Gradient Descent
3.  Evolutionary Algorithms

**Bayesian Optimization**

> Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time. In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.


1. Objective Function = defines the loss function to minimize.
2. Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
3. Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.
 




In [33]:
!pip install hyperopt 



In [88]:
from hyperopt import hp,tpe,fmin,Trials,STATUS_OK

In [92]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [93]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x7f7068d03c88>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x7f7068dc94e0>,
 'max_features': <hyperopt.pyll.base.Apply at 0x7f7068dc9668>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x7f7068dc9780>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x7f7068dc94a8>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x7f7068dc99b0>}

In [94]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [95]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [09:20<00:00,  7.01s/it, best loss: -0.9501025290498974]


{'criterion': 1,
 'max_depth': 700.0,
 'max_features': 2,
 'min_samples_leaf': 0.0110157452134266,
 'min_samples_split': 0.014308212012269774,
 'n_estimators': 4}

In [96]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
log2
1200


In [97]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[ 63   4]
 [  2 119]]
0.9680851063829787
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        67
           1       0.97      0.98      0.98       121

    accuracy                           0.97       188
   macro avg       0.97      0.96      0.96       188
weighted avg       0.97      0.97      0.97       188



# Genetic Algorithms

In [103]:
!pip install tpot

Collecting tpot
[?25l  Downloading https://files.pythonhosted.org/packages/14/5e/cb87b0257033a7a396e533a634079ee151a239d180efe2a8b1d2e3584d23/TPOT-0.11.5-py3-none-any.whl (82kB)
[K     |████████████████████████████████| 92kB 1.3MB/s 
Collecting deap>=1.2
[?25l  Downloading https://files.pythonhosted.org/packages/0a/eb/2bd0a32e3ce757fb26264765abbaedd6d4d3640d90219a513aeabd08ee2b/deap-1.3.1-cp36-cp36m-manylinux2010_x86_64.whl (157kB)
[K     |████████████████████████████████| 163kB 3.8MB/s 
[?25hCollecting stopit>=1.1.1
  Downloading https://files.pythonhosted.org/packages/35/58/e8bb0b0fb05baf07bbac1450c447d753da65f9701f551dca79823ce15d50/stopit-1.1.2.tar.gz
Collecting update-checker>=0.16
  Downloading https://files.pythonhosted.org/packages/0c/ba/8dd7fa5f0b1c6a8ac62f8f57f7e794160c1f86f31c6d0fb00f582372a3e4/update_checker-0.18.0-py3-none-any.whl
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25l[?25hdone
  Created wheel for stopit: file

In [104]:
from tpot import TPOTClassifier

In [105]:
param_tpot = {'criterion':['gini','entropy'],
          'n_estimators':[int(i) for i in np.linspace(150,2000,15)],
          'max_depth':[int(i) for i in [5,10,20,100,200]],
          'max_features':['auto','sqrt','log2'],
          'min_samples_leaf':[1,3,5,7,9,11,15],
          'min_samples_split':[5,7,8,10,12],
          }

In [110]:
tpotclf = TPOTClassifier(generations=5,population_size=15,offspring_size=5,verbosity=1,cv=3,scoring='accuracy',early_stop=10,config_dict={'sklearn.ensemble.RandomForestClassifier':param_tpot})

In [111]:
tpotclf.fit(X_train,y_train)

Best pipeline: RandomForestClassifier(CombineDFs(input_matrix, input_matrix), criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=12, n_estimators=150)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['gini',
                                                                                      'entropy'],
                                                                        'max_depth': [5,
                                                                                      10,
                                                                                      20,
                                                                                      100,
                                                                                      200],
                                                                        'max_features': ['auto',
                                                                                         'sqrt',
                                                                                         'log2'],
                                                                

In [118]:
tpotclf.evaluated_individuals_

{'RandomForestClassifier(CombineDFs(CombineDFs(input_matrix, input_matrix), input_matrix), RandomForestClassifier__criterion=entropy, RandomForestClassifier__max_depth=200, RandomForestClassifier__max_features=sqrt, RandomForestClassifier__min_samples_leaf=3, RandomForestClassifier__min_samples_split=8, RandomForestClassifier__n_estimators=942)': {'crossover_count': 0,
  'generation': 5,
  'internal_cv_score': 0.942257217847769,
  'mutation_count': 1,
  'operator_count': 1,
  'predecessor': ('RandomForestClassifier(CombineDFs(input_matrix, input_matrix), RandomForestClassifier__criterion=entropy, RandomForestClassifier__max_depth=200, RandomForestClassifier__max_features=sqrt, RandomForestClassifier__min_samples_leaf=3, RandomForestClassifier__min_samples_split=8, RandomForestClassifier__n_estimators=942)',)},
 'RandomForestClassifier(CombineDFs(input_matrix, input_matrix), RandomForestClassifier__criterion=entropy, RandomForestClassifier__max_depth=10, RandomForestClassifier__max_feat

In [117]:
accuracy = tpotclf.score(X_test, y_test)
print(accuracy)


0.9627659574468085


# Optuna

In [119]:
!pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/67/2b/78129c5580f1743d897214e2d76c20bea8d41c414be6ef1b92c0ce13856e/optuna-2.2.0.tar.gz (246kB)
[K     |████████████████████████████████| 256kB 1.4MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/71/06/03b1f92d46546a18eabf33ff7f37ef422c18c93d5a926bf590fee32ebe75/cliff-3.4.0-py3-none-any.whl (76kB)
[K     |████████████████████████████████| 81kB 3.4MB/s 
Collecting cmaes>=0.6.0
  Downloading https://files.pythonhosted.org/packages/8d/3c/06c76ec8b54b9b1fad7f35e903fd25010fe3e0d41bd94cea5e6f12e0d651/cmaes-0.7.0-py3-none-any.whl
Collecting colorlog
  Downloading https://files.pythonhosted.org/packages/8c/10/0b39be7ff1adb8888fe87c8628c071dec5ac282ac1c2312221f5feb09215/colorlog-4.4.0-py2.py3-none-any.whl
Collecting al

In [120]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [122]:
study = optuna.create_study(direction='maximize')     # maximizing accuracy,if it is minimize then we are minimizing the loss

study.optimize(objective,n_trials=100)

[32m[I 2020-10-24 10:56:03,776][0m A new study created in memory with name: no-name-dc0604c4-ffb8-42df-86ed-5d656ec20cd4[0m
[32m[I 2020-10-24 10:56:07,465][0m Trial 0 finished with value: 0.94750656167979 and parameters: {'classifier': 'RandomForest', 'n_estimators': 670, 'max_depth': 31.67872403632954}. Best is trial 0 with value: 0.94750656167979.[0m
[32m[I 2020-10-24 10:56:13,306][0m Trial 1 finished with value: 0.94750656167979 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1380, 'max_depth': 21.84830610968625}. Best is trial 0 with value: 0.94750656167979.[0m
[32m[I 2020-10-24 10:56:14,782][0m Trial 2 finished with value: 0.9448818897637795 and parameters: {'classifier': 'RandomForest', 'n_estimators': 350, 'max_depth': 93.53220326879874}. Best is trial 0 with value: 0.94750656167979.[0m
[32m[I 2020-10-24 10:56:14,811][0m Trial 3 finished with value: 0.6194225721784777 and parameters: {'classifier': 'SVC', 'svc_c': 6.190063758385438e-05}. Best is tria

In [123]:
study.best_value


0.9580052493438319

In [124]:
study.best_params

{'classifier': 'RandomForest',
 'max_depth': 48.056969652447286,
 'n_estimators': 360}

In [125]:
rf_opt = RandomForestClassifier(max_depth=48.056969652447286,n_estimators=360)

rf_opt.fit(X_train,y_train)

opt_pred=rf_opt.predict(X_test)
print(confusion_matrix(y_test,opt_pred))
print(accuracy_score(y_test,opt_pred))
print(classification_report(y_test,opt_pred))

[[ 62   5]
 [  4 117]]
0.9521276595744681
              precision    recall  f1-score   support

           0       0.94      0.93      0.93        67
           1       0.96      0.97      0.96       121

    accuracy                           0.95       188
   macro avg       0.95      0.95      0.95       188
weighted avg       0.95      0.95      0.95       188

