In [1]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.datasets import load_digits
import numpy as np

# Load the bundled handwritten-digits dataset; the returned object exposes
# `data` (the feature matrix) and `target` (the class labels), among others.
digit = load_digits()

In [2]:
# List the attributes available on the loaded dataset object.
dir(digit)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [3]:
# Build the feature table with zero-padded positional labels ("00", "01", ...)
# as column headers, then attach the class label and preview the first rows.
new_columns = [f"{idx:02d}" for idx in range(len(digit.feature_names))]
df = pd.DataFrame(digit.data, columns=new_columns)
df['target'] = digit.target
df.head()

Unnamed: 0,00,01,02,03,04,05,06,07,08,09,...,55,56,57,58,59,60,61,62,63,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


# Approach 1: Use train_test_split and manually tune parameters by trial and error

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing. A fixed random_state makes the split
# (and therefore the score computed from it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    digit.data, digit.target, test_size=0.3, random_state=42
)

In [5]:
# Fit an RBF-kernel SVM with hand-picked hyperparameters and report its
# accuracy on the held-out split.
# NOTE(review): the low score is presumably caused by gamma='auto' on unscaled
# pixel values -- confirm, and consider gamma='scale' or feature scaling.
model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(x_train, y_train)
model.score(x_test, y_test)

0.4703703703703704

# Approach 2: Use K Fold Cross validation

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
# 5-fold cross-validated scores for a linear-kernel SVM with C=10.
linear_svc = svm.SVC(kernel='linear', C=10, gamma='auto')
cross_val_score(linear_svc, digit.data, digit.target, cv=5)

array([0.96388889, 0.91944444, 0.96657382, 0.9637883 , 0.92479109])

In [8]:
# 5-fold cross-validated scores for an RBF-kernel SVM with C=10.
rbf_svc = svm.SVC(kernel='rbf', C=10, gamma='auto')
cross_val_score(rbf_svc, digit.data, digit.target, cv=5)

array([0.45277778, 0.46944444, 0.47910864, 0.47910864, 0.50139276])

# The above approach is tedious and very manual. We can use a for loop as an alternative

In [9]:
# Try every (kernel, C) pair and record the mean of its 5-fold CV scores,
# keyed as "<kernel>_<C>" (e.g. "rbf_10").
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {
    f"{kval}_{cval}": np.average(
        cross_val_score(
            svm.SVC(kernel=kval, C=cval, gamma='auto'),
            digit.data,
            digit.target,
            cv=5,
        )
    )
    for kval in kernels
    for cval in C
}

avg_scores

{'rbf_1': 0.448545341999381,
 'rbf_10': 0.47636645001547506,
 'rbf_20': 0.47636645001547506,
 'linear_1': 0.9476973073351903,
 'linear_10': 0.9476973073351903,
 'linear_20': 0.9476973073351903}

# From the above results we can say that the linear kernel gives the best performance, with identical scores for C=1, 10 and 20

# Approach 3: Use GridSearchCV
#### GridSearchCV does exactly the same thing as the for loop above, but in a single line of code

In [10]:
from sklearn.model_selection import GridSearchCV

# Exhaustively evaluate every C/kernel combination with 5-fold CV and show
# the raw per-combination results.
param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
clf = GridSearchCV(
    svm.SVC(gamma='auto'),
    param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(digit.data, digit.target)
clf.cv_results_

{'mean_fit_time': array([0.7118012 , 0.07051802, 0.68494787, 0.06821103, 0.7040585 ,
        0.06883073]),
 'std_fit_time': array([0.02141518, 0.00334267, 0.00884014, 0.00319894, 0.02081766,
        0.0029006 ]),
 'mean_score_time': array([0.19651728, 0.01840215, 0.19083848, 0.02280755, 0.19484072,
        0.0180017 ]),
 'std_score_time': array([0.01128417, 0.00049062, 0.00495773, 0.00523807, 0.0183022 ,
        0.00063181]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [11]:
# Tabulate the grid-search results: one row per parameter combination.
# Note: this rebinds `df`, which previously held the digits feature table.
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.711801,0.021415,0.196517,0.011284,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.411111,0.45,0.454039,0.448468,0.479109,0.448545,0.021761,6
1,0.070518,0.003343,0.018402,0.000491,1,linear,"{'C': 1, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1
2,0.684948,0.00884,0.190838,0.004958,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.452778,0.469444,0.479109,0.479109,0.501393,0.476366,0.015784,4
3,0.068211,0.003199,0.022808,0.005238,10,linear,"{'C': 10, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1
4,0.704059,0.020818,0.194841,0.018302,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.452778,0.469444,0.479109,0.479109,0.501393,0.476366,0.015784,4
5,0.068831,0.002901,0.018002,0.000632,20,linear,"{'C': 20, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1


In [12]:
# Keep only the columns needed to compare parameter combinations.
summary_cols = ['param_C', 'param_kernel', 'mean_test_score']
df[summary_cols]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.448545
1,1,linear,0.947697
2,10,rbf,0.476366
3,10,linear,0.947697
4,20,rbf,0.476366
5,20,linear,0.947697


In [13]:
# Parameter combination the grid search selected as best.
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [14]:
# Mean cross-validated score achieved by the selected combination.
clf.best_score_

0.9476973073351903

# Use RandomizedSearchCV to reduce the number of iterations by trying only a random subset of parameter combinations. This is useful when you have too many parameters to try and training time is long; it helps reduce the cost of computation

In [15]:
from sklearn.model_selection import RandomizedSearchCV
# Evaluate only n_iter=2 randomly drawn C/kernel combinations instead of the
# full grid. random_state pins which combinations are drawn, so re-running the
# cell (or the whole notebook) produces the same table.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rs.fit(digit.data, digit.target)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.448545
1,10,linear,0.947697


# different models with different hyperparameters

In [16]:
# Candidate estimators and the hyperparameter grid to search for each one.
# Keys are the display names used in the comparison table; an empty 'params'
# dict means the estimator is evaluated with its constructor settings only.
# The tree-based models are seeded so their scores are reproducible on re-run.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20, 30],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
        }
    },
    'Naive Bayes GaussianNB': {
        'model': GaussianNB(),
        'params': {}
    },
    'Naive Bayes MultinomialNB': {
        'model': MultinomialNB(),
        'params': {}
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10]
        }
    },
    # NOTE(review): LinearRegression is a regressor, so with the default scorer
    # its reported score is R^2 rather than classification accuracy and is not
    # directly comparable with the other rows -- consider dropping this entry.
    'Linear_Regression': {
        'model': LinearRegression(),
        'params': {}
    }
}

In [17]:
# Grid-search every candidate in model_params with 5-fold CV and collect each
# one's best score and parameters into a comparison table.
# Note: this rebinds `clf` and `df`, both of which were assigned by earlier cells.
scores = []

for model_name, spec in model_params.items():
    clf = GridSearchCV(
        spec['model'], spec['params'], cv=5, return_train_score=False
    ).fit(digit.data, digit.target)
    scores.append(
        dict(model=model_name, best_score=clf.best_score_, best_params=clf.best_params_)
    )

df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.909301,{'n_estimators': 10}
2,DecisionTreeClassifier,0.808579,{'criterion': 'entropy'}
3,Naive Bayes GaussianNB,0.806928,{}
4,Naive Bayes MultinomialNB,0.87035,{}
5,logistic_regression,0.922114,{'C': 1}
6,Linear_Regression,0.506557,{}


# For me the winner is svm (C=1, kernel=linear) with a 94.77% score. It could be different for you, as I have limited the parameters to a few specific values only