### Finding the best model and Hyper parameter tuning using Grid SreachCV and Randomized Search CV

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_iris

In [3]:
data=load_iris()

In [4]:
dataframe=pd.DataFrame(data.data,columns=data.feature_names)
dataframe.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
dataframe['flower']=data.target
dataframe.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
dataframe['flower']=dataframe['flower'].apply(lambda x: data.target_names[x])
dataframe.head(14)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


### Approach 1: Use train_test_split & paramater tuning by trail and error method

In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(data.data,data.target,test_size=0.3)

In [8]:
from sklearn.svm import SVC

model=SVC(kernel='linear',C=10,gamma='auto')

In [9]:
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.9777777777777777

### Approach 2: Using the cross validation and Tuning

In [10]:
from sklearn.model_selection import cross_val_score
cross_1=cross_val_score(SVC(kernel='linear',C=10,gamma='auto'),data.data,data.target,cv=5)
cross_1

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [11]:
cross_2=cross_val_score(SVC(kernel='linear',C=1,gamma='scale'),data.data,data.target,cv=5)
cross_2

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [12]:
cross_3=cross_val_score(SVC(kernel='rbf',C=10,gamma='auto'),data.data,data.target,cv=5)
cross_3

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [13]:
cross_4=cross_val_score(SVC(kernel='rbf',C=20,gamma='auto'),data.data,data.target,cv=5)
cross_4

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [14]:
cross_5=cross_val_score(SVC(kernel='rbf',C=20,gamma='scale'),data.data,data.target,cv=5)
cross_5

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

### Approach 3: Using the Loop in Cross Validation

In [15]:
kernels=['linear','rbf']
c_val=[1 ,10 ,20]

results={}

for k in kernels:
    for c in c_val:
        cv_score=cross_val_score(SVC(kernel=k,C=c,gamma='auto'),data.data,data.target,cv=5)
        results[k+' when C is '+str(c)]=np.average(cv_score)

results        

{'linear when C is 1': 0.9800000000000001,
 'linear when C is 10': 0.9733333333333334,
 'linear when C is 20': 0.9666666666666666,
 'rbf when C is 1': 0.9800000000000001,
 'rbf when C is 10': 0.9800000000000001,
 'rbf when C is 20': 0.9666666666666668}

#####  C=1 ,Linear and C=1,10 Rbf provided the best result

### Approach 4: GridSearchCV and the optimization

In [16]:
from sklearn.model_selection import GridSearchCV
clf=GridSearchCV(SVC(gamma='auto'),{
    'C':[1,10,20],
    'kernel':['linear','rbf']
},cv=5,return_train_score=False)

In [17]:
clf.fit(data.data,data.target)
clf.cv_results_

{'mean_fit_time': array([0.00240111, 0.00160046, 0.00079932, 0.00080104, 0.00079985,
        0.00080037]),
 'std_fit_time': array([0.00196054, 0.00196016, 0.00159864, 0.00160208, 0.00159969,
        0.00160074]),
 'mean_score_time': array([0.00080109, 0.        , 0.0008009 , 0.00079956, 0.00080042,
        0.00080013]),
 'std_score_time': array([0.00160217, 0.        , 0.00160179, 0.00159912, 0.00160084,
        0.00160027]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'}],


In [18]:
data_result=pd.DataFrame(clf.cv_results_)
data_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002401,0.001961,0.000801,0.001602,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.0016,0.00196,0.0,0.0,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000799,0.001599,0.000801,0.001602,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
3,0.000801,0.001602,0.0008,0.001599,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.0008,0.0016,0.0008,0.001601,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
5,0.0008,0.001601,0.0008,0.0016,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [19]:
data_result[['param_kernel','param_C','mean_test_score']]

Unnamed: 0,param_kernel,param_C,mean_test_score
0,linear,1,0.98
1,rbf,1,0.98
2,linear,10,0.973333
3,rbf,10,0.98
4,linear,20,0.966667
5,rbf,20,0.966667


### GridSearchCV uses the possible combinations whereas the RandomizedSearchCV uses customized iteration

In [20]:
from sklearn.model_selection import RandomizedSearchCV

clf_rs=RandomizedSearchCV(SVC(gamma='auto'),{
    'C':[1,10,20],
    'kernel':['linear','rbf']
   
} ,n_iter=2,cv=5,return_train_score=False)

In [21]:
clf_rs.fit(data.data,data.target)
df_rs=pd.DataFrame(clf_rs.cv_results_)
df_rs

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000804,0.001608,0.0016,0.00196,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.0008,0.001599,0.0016,0.00196,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,2


In [22]:
df_rs[['param_kernel','param_C','mean_test_score']]

Unnamed: 0,param_kernel,param_C,mean_test_score
0,rbf,10,0.98
1,rbf,20,0.966667


In [71]:
clf.best_estimator_
#same for df_rs

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [72]:
clf.best_params_
#same for df_rs

{'C': 10}

In [73]:
clf.best_score_

0.9666666666666668

### Best classifier finding :

- Support Vector Classifier
- Logistic Regression
- Random Forest Classifier

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [76]:
model_params={
    'SVC':{
        'model': SVC(gamma='auto'),
        'params':{
        'C':[1,10,20],
        'kernel':['linear','rbf']
            }
    },
    'random_forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[1,10,20]
        }
        
    },
    'logistic_reg':{
        'model':LogisticRegression(solver='liblinear',multi_class='auto'),
        'params':{
            'C':[1,10,15]
        }
    }
}

In [81]:
scores=[]
for name,value in model_params.items():
    clf=GridSearchCV(value['model'],value['params'],cv=5,return_train_score=False)
    clf.fit(data.data,data.target)
    scores.append({
        'model':name,
        'Best_score':clf.best_score_,
        'Best_parameter':clf.best_params_
    })
scores

[{'model': 'SVC',
  'Best_score': 0.9800000000000001,
  'Best_parameter': {'C': 1, 'kernel': 'linear'}},
 {'model': 'random_forest',
  'Best_score': 0.96,
  'Best_parameter': {'n_estimators': 10}},
 {'model': 'logistic_reg',
  'Best_score': 0.9666666666666668,
  'Best_parameter': {'C': 10}}]

In [57]:
df_final=pd.DataFrame(scores,columns=['model','Best_score','Best_parameter'])
df_final

Unnamed: 0,model,Best_score,Best_parameter
0,SVC,0.98,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.966667,{'n_estimators': 40}
2,logistic_reg,0.966667,{'C': 10}
