# Hyper parameter Tuning

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

iris = load_iris()

### Approach 1: Use train_test_split and manually tune parameters by trial and error

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [3]:
lr = LogisticRegression(C=1, solver='liblinear', multi_class='ovr')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9333333333333333

In [4]:
lr = LogisticRegression(C=10, solver='liblinear', multi_class='ovr')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9777777777777777

### Approach 2: Use K Fold Cross validation

In [5]:
from sklearn.model_selection import cross_val_score
cross_val_score(SVC(C=1,gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [6]:
from sklearn.model_selection import cross_val_score
cross_val_score(SVC(C=50,gamma='auto'), iris.data, iris.target, cv=5)

array([1.        , 0.96666667, 0.9       , 0.93333333, 1.        ])

### Approach 3: Use GridSearchCV

In [7]:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(SVC(gamma='auto'),{
    'C': [1,10,20,30,40,50],
    'kernel': ['rbf','linear'],
}, cv=5, return_train_score=False)

In [8]:
gs.fit(iris.data, iris.target)

In [9]:
gs.cv_results_

{'mean_fit_time': array([0.00100222, 0.0027688 , 0.00320158, 0.        , 0.00116367,
        0.00050273, 0.        , 0.00066195, 0.        , 0.0026854 ,
        0.        , 0.        ]),
 'std_fit_time': array([0.00200443, 0.00350174, 0.00392119, 0.        , 0.0016853 ,
        0.00100546, 0.        , 0.00132389, 0.        , 0.00492593,
        0.        , 0.        ]),
 'mean_score_time': array([0.00025134, 0.        , 0.        , 0.00038409, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.0005754 , 0.        ]),
 'std_score_time': array([0.00050268, 0.        , 0.        , 0.00076818, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.0011508 , 0.        ]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtype=obje

To better understand it lets convert it into Pandas dataframe

In [10]:
df = pd.DataFrame(gs.cv_results_)

df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001002,0.002004,0.000251,0.000503,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.002769,0.003502,0.0,0.0,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.003202,0.003921,0.0,0.0,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.0,0.0,0.000384,0.000768,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001164,0.001685,0.0,0.0,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000503,0.001005,0.0,0.0,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
6,0.0,0.0,0.0,0.0,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,8
7,0.000662,0.001324,0.0,0.0,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,8
8,0.0,0.0,0.0,0.0,40,rbf,"{'C': 40, 'kernel': 'rbf'}",1.0,0.966667,0.9,0.933333,1.0,0.96,0.038873,8
9,0.002685,0.004926,0.0,0.0,40,linear,"{'C': 40, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,8


In [11]:
df[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
0,"{'C': 1, 'kernel': 'rbf'}",0.98
1,"{'C': 1, 'kernel': 'linear'}",0.98
2,"{'C': 10, 'kernel': 'rbf'}",0.98
3,"{'C': 10, 'kernel': 'linear'}",0.973333
4,"{'C': 20, 'kernel': 'rbf'}",0.966667
5,"{'C': 20, 'kernel': 'linear'}",0.966667
6,"{'C': 30, 'kernel': 'rbf'}",0.96
7,"{'C': 30, 'kernel': 'linear'}",0.96
8,"{'C': 40, 'kernel': 'rbf'}",0.96
9,"{'C': 40, 'kernel': 'linear'}",0.96


In [12]:
gs.best_params_

{'C': 1, 'kernel': 'rbf'}

In [13]:
gs.best_score_

0.9800000000000001

### RandomizedSearchCV
Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation


In [14]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(SVC(gamma='auto'),{
    'C': [1,10,20,30,40,50],
    'kernel': ['rbf','linear'],
}, cv=5, n_iter=3, return_train_score=False)

In [15]:
rs.fit(iris.data, iris.target)

In [16]:
df2 = pd.DataFrame(rs.cv_results_)
df2[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
0,"{'kernel': 'rbf', 'C': 20}",0.966667
1,"{'kernel': 'linear', 'C': 30}",0.96
2,"{'kernel': 'rbf', 'C': 1}",0.98


### Different models with different hyperparameters

In [19]:
model_dict = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [20]:
scores = []

for model_name, model_params in model_dict.items():
    clf = GridSearchCV(model_params['model'],model_params['params'], cv=5, return_train_score=False)
    clf.fit(iris.data,iris.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'params': clf.best_params_
    })

df3 = pd.DataFrame(scores, columns=['model','params','best_score'])
df3

Unnamed: 0,model,params,best_score
0,svm,"{'C': 1, 'kernel': 'rbf'}",0.98
1,random_forest,{'n_estimators': 5},0.953333
2,logistic_regression,{'C': 5},0.966667


**Results**: Based on above, I can conclude that **SVM** with **C=1** and **kernel='rbf'** is the **best model** for solving my problem of iris flower classification