In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
data_iris = load_iris()

We will be learning Hyperparameter tuning here and we will be using the below techniques:
1. Manual test_train_split --> running the train_test_split again and changing the samples
2. Using the K-Fold cross validation method 
3. GridSearchCV
4. RandomSearchCV

## Approach 1:

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(data_iris.data,data_iris.target,test_size=0.2)

In [12]:
from sklearn.svm import SVC
model=SVC(C=3.0)
model.fit(X_train,y_train)



SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [7]:
model.score(X_test,y_test)

1.0

In [13]:
model.score(X_test,y_test)

0.9666666666666667

## Approach 2

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
model_svm = SVC()
model_lr=LogisticRegression()
model_nb=GaussianNB()

cross_val_score(model_svm,data_iris.data,data_iris.target,cv=5)



array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [20]:
cross_val_score(model_lr,data_iris.data,data_iris.target,cv=5)



array([1.        , 0.96666667, 0.93333333, 0.9       , 1.        ])

In [21]:
cross_val_score(model_nb,data_iris.data,data_iris.target,cv=5)

array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [25]:
cross_val_score(model_nb,data_iris.data,data_iris.target,cv=5).mean()

0.9533333333333334

In [29]:
#in the below code we have checked the model tuning, which model is performing better

model=[model_lr,model_nb,model_svm]

for model in model:
    best_score=cross_val_score(model,data_iris.data,data_iris.target,cv=5).mean()
    print('Model Name: {}\t Score: {}'.format(model,best_score))



Model Name: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)	 Score: 0.9600000000000002
Model Name: GaussianNB(priors=None, var_smoothing=1e-09)	 Score: 0.9533333333333334
Model Name: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)	 Score: 0.9800000000000001




## Approach 3: Avoiding the manual approach in the previous step

Parameter tuning using the same algo and tweaking the parameters
We have used the SVM algo with various parameters and could see which parameters will give the best ressult

In [45]:
import warnings
warnings.filterwarnings('ignore')

kernel=['linear', 'poly', 'rbf']
C=[1,3,5,10]

for kernel in kernel:
    for c in C:
        score=cross_val_score(SVC(C=c,kernel=kernel),data_iris.data,data_iris.target,cv=5)
        average_score=score.mean()
        print('Kernel Name: {}, C: {}, Score: {}'.format(kernel,c,average_score))
    print('*************************')

Kernel Name: linear, C: 1, Score: 0.9800000000000001
Kernel Name: linear, C: 3, Score: 0.9733333333333334
Kernel Name: linear, C: 5, Score: 0.9800000000000001
Kernel Name: linear, C: 10, Score: 0.9733333333333334
*************************
Kernel Name: poly, C: 1, Score: 0.9666666666666666
Kernel Name: poly, C: 3, Score: 0.9666666666666666
Kernel Name: poly, C: 5, Score: 0.9666666666666666
Kernel Name: poly, C: 10, Score: 0.9666666666666666
*************************
Kernel Name: rbf, C: 1, Score: 0.9800000000000001
Kernel Name: rbf, C: 3, Score: 0.9733333333333334
Kernel Name: rbf, C: 5, Score: 0.9800000000000001
Kernel Name: rbf, C: 10, Score: 0.9800000000000001
*************************


From the above output we can see that rbf kernel is giving a very good score. We can select the parameters depending on the output

## Approach 4: GridSearchCV

In [46]:
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 5, 10]}
svc=SVC()
clf=GridSearchCV(svc,parameters)

In [47]:
clf.fit(data_iris.data,data_iris.target)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [48]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'fit_params',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score',
 'scorer_',
 'scoring',
 'set_params',
 'transform

In [49]:
clf.best_score_

0.98

In [50]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

This below command gives the result of the GridSearch.
cv_results can be saved into a DF.

In [51]:
clf.cv_results_

{'mean_fit_time': array([0.00063912, 0.00114759, 0.00068927, 0.00150752, 0.00095916,
        0.00170239]),
 'std_fit_time': array([4.53437637e-04, 6.49621004e-04, 4.88047781e-04, 5.09122765e-06,
        6.03571125e-05, 4.94753895e-04]),
 'mean_score_time': array([0.00066853, 0.000494  , 0.00061599, 0.00049782, 0.00066773,
        0.00067592]),
 'std_score_time': array([4.72719213e-04, 4.09418550e-04, 2.09733770e-04, 5.40882731e-06,
        4.72157255e-04, 3.99863901e-04]),
 'param_C': masked_array(data=[1, 1, 5, 5, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 5, 'kernel': 'linear'},
  {'C': 5, 'kernel': 'rbf'},
  {'C': 10, 'kernel'

In [52]:
gridresults=pd.DataFrame(clf.cv_results_)

In [53]:
gridresults

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.000639,0.000453,0.000669,0.000473,1,linear,"{'C': 1, 'kernel': 'linear'}",1.0,0.960784,0.979167,0.98,0.016179,1,0.979798,1.0,0.990196,0.989998,0.008249
1,0.001148,0.00065,0.000494,0.000409,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.980392,0.960784,0.979167,0.973333,0.009021,4,0.969697,1.0,0.980392,0.983363,0.012548
2,0.000689,0.000488,0.000616,0.00021,5,linear,"{'C': 5, 'kernel': 'linear'}",1.0,0.901961,1.0,0.966667,0.046442,6,0.969697,1.0,0.980392,0.983363,0.012548
3,0.001508,5e-06,0.000498,5e-06,5,rbf,"{'C': 5, 'kernel': 'rbf'}",0.980392,0.960784,1.0,0.98,0.015925,1,0.969697,1.0,0.980392,0.983363,0.012548
4,0.000959,6e-05,0.000668,0.000472,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,0.921569,1.0,0.973333,0.037154,4,0.959596,1.0,0.980392,0.979996,0.016497
5,0.001702,0.000495,0.000676,0.0004,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.980392,0.960784,1.0,0.98,0.015925,1,0.959596,1.0,0.980392,0.979996,0.016497


In [54]:
gridresults[['mean_fit_time','mean_score_time','param_C','param_kernel','mean_test_score','mean_train_score']]

Unnamed: 0,mean_fit_time,mean_score_time,param_C,param_kernel,mean_test_score,mean_train_score
0,0.000639,0.000669,1,linear,0.98,0.989998
1,0.001148,0.000494,1,rbf,0.973333,0.983363
2,0.000689,0.000616,5,linear,0.966667,0.983363
3,0.001508,0.000498,5,rbf,0.98,0.983363
4,0.000959,0.000668,10,linear,0.973333,0.979996
5,0.001702,0.000676,10,rbf,0.98,0.979996


## Approach 5: RandomizedSearchCV

In [59]:
from sklearn.model_selection import RandomizedSearchCV
svm1=SVC()
parameters={'kernel': ('linear','rbf'),'C':[1,3,5,10]}

clf=RandomizedSearchCV(svm1,parameters,n_iter=5)
clf.fit(data_iris.data,data_iris.target)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'kernel': ('linear', 'rbf'), 'C': [1, 3, 5, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [60]:
clf.best_score_

0.98

In [64]:
randomData=pd.DataFrame(clf.cv_results_)

In [65]:
randomData

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001337,0.000405,0.0002,0.000217,linear,5,"{'kernel': 'linear', 'C': 5}",1.0,0.901961,1.0,0.966667,0.046442,4,0.969697,1.0,0.980392,0.983363,0.012548
1,0.000948,3.9e-05,0.000693,0.000221,rbf,3,"{'kernel': 'rbf', 'C': 3}",0.980392,0.980392,0.979167,0.98,0.000572,1,0.969697,1.0,0.970588,0.980095,0.01408
2,0.002693,0.002292,0.001998,0.00217,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.980392,0.960784,1.0,0.98,0.015925,1,0.959596,1.0,0.980392,0.979996,0.016497
3,0.001067,5.3e-05,0.001094,0.000288,rbf,5,"{'kernel': 'rbf', 'C': 5}",0.980392,0.960784,1.0,0.98,0.015925,1,0.969697,1.0,0.980392,0.983363,0.012548
4,0.006374,0.007529,0.001003,8.1e-05,linear,3,"{'kernel': 'linear', 'C': 3}",0.980392,0.921569,1.0,0.966667,0.033333,4,0.969697,1.0,0.980392,0.983363,0.012548


In [67]:
pip install demjson

Collecting demjson
  Downloading https://files.pythonhosted.org/packages/96/67/6db789e2533158963d4af689f961b644ddd9200615b8ce92d6cad695c65a/demjson-2.2.4.tar.gz (131kB)
Building wheels for collected packages: demjson
  Building wheel for demjson (setup.py): started
  Building wheel for demjson (setup.py): finished with status 'done'
  Stored in directory: C:\Users\vishal.kumar1\AppData\Local\pip\Cache\wheels\c5\d2\ab\a54fb5ea53ac3badba098160e8452fa126a51febda80440ded
Successfully built demjson
Installing collected packages: demjson
Successfully installed demjson-2.2.4
Note: you may need to restart the kernel to use updated packages.


## Model Tuning - To find the best model for a particular solution
In the below example we will be forming a JSON and will be using it for Model Tuning and to find out which model suits best

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [71]:
model_params={
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C':[1,5,10]
        }
    }
}

scores=[]

In [76]:
for model_name, mp in model_params.items():
    clf=cross_val_score(mp['model'],mp['params'],cv=5)
    clf.fit(data_iris.data,data_iris.target)
    scores.append({
        'model': model_name,
        'best_score' : clf.best_score_,
        'best_params': clf.best_params_
    })

ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=2.