In [1]:
from sklearn import svm, datasets

iris = datasets.load_iris()

In [15]:
import numpy as np
import pandas as pd
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df['flower'] = iris.target
df['flower'] = df['flower'].apply(lambda x: iris.target_names[x])
df[47:53]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor


Approach 1: Use train_test_split and manually tune parameters by trial and error

Here, the model accuracy is not stable since the split portion will change based on the execution. So, the score will change based on samples.

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [9]:
model = svm.SVC(kernel='rbf',C=30,gamma='auto')
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.9111111111111111

Approach 2: Use K Fold Cross validation...
Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation



In [11]:
from sklearn.model_selection import cross_val_score

cross_val_score(svm.SVC(kernel='linear',C=10,gamma='auto'),iris.data, iris.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [12]:
cross_val_score(svm.SVC(kernel='rbf',C=10,gamma='auto'),iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [13]:
cross_val_score(svm.SVC(kernel='rbf',C=20,gamma='auto'),iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

The above method is tiresome, since there are many values to change manually like kernal, C, gamma.. Therefore, we will use loops for such implementations.

In [16]:
kernels = ['rbf', 'linear']
C = [1,10,20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(svm.SVC(kernel=kval,C=cval,gamma='auto'),iris.data, iris.target, cv=5)
        avg_scores[kval + '_' + str(cval)] = np.average(cv_scores)

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

By using the above for loop we can tune the hyper parameterr for obtaining the accuracy. But, here the samples are comparitively less so the parameters are less, but if the folds are increasing the no.of iterations, . So, this consumes a lot of computational time.

SO for mitigating all these issues, we gonna use gridsearchCV for doing the same operation which was shown in the above loop.

In [19]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(svm.SVC(gamma='auto'), {
    'C': [1,10,20],
    'kernel': ['rbf','linear']  #parameter grid 
}, cv=5, return_train_score=False) #cv = cross validation; return_train_score??
clf.fit(iris.data, iris.target)
clf.cv_results_

{'mean_fit_time': array([0.0027442 , 0.00158019, 0.00319276, 0.00232182, 0.00230808,
        0.00295205]),
 'std_fit_time': array([0.00185604, 0.00316038, 0.00391085, 0.00415501, 0.00416606,
        0.00361875]),
 'mean_score_time': array([0.00093822, 0.        , 0.        , 0.00140224, 0.        ,
        0.00026298]),
 'std_score_time': array([0.00053367, 0.        , 0.        , 0.00280447, 0.        ,
        0.00052595]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [32]:
df = pd.DataFrame(clf.cv_results_)
df #for readability

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00634,0.009191,0.016638,0.033277,1,{'C': 1},1.0,0.966667,0.933333,0.9,1.0,0.96,0.038873,3
1,0.002891,0.0038,0.000254,0.00041,5,{'C': 5},1.0,1.0,0.933333,0.9,1.0,0.966667,0.042164,1
2,0.003203,0.003924,0.000297,0.000595,10,{'C': 10},1.0,1.0,0.933333,0.9,1.0,0.966667,0.042164,1


In [21]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [22]:
clf.best_score_


0.9800000000000001

In [23]:
clf.best_estimator_

SVC(C=1, gamma='auto')

In [24]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

GridsearchCV also has a lot of computation cost. Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation



In [29]:
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(svm.SVC(gamma='auto'), {
        'C': [1,10,20],
        'kernel': ['rbf','linear']
    }, 
    cv=5, 
    return_train_score=False, 
    n_iter=2
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,20,linear,0.966667


How about different models with different hyperparameters?

In [30]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [31]:

scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}


Based on above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification