In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import numpy as np
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

digits = load_digits()

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3, random_state=42)

### Logistic Regression

In [3]:
lr = LogisticRegression(solver='liblinear',multi_class='ovr')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9537037037037037

### Support Vector Machine (SVM)

In [7]:
svm = SVC(gamma='scale')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.987037037037037

### Random Forest Classifier

In [9]:
rfc = RandomForestClassifier(n_estimators=40)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9740740740740741

### KFold cross validation

In [10]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3) # how many folds you wanna take

In [12]:
# basic example on how it works
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [16]:
# get score function
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [25]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

score_logistic = [] # Logistic Regression
score_svc = [] # Support Vector Machine
score_rf = [] # Random Forest

for train_index, test_index in folds.split(digits.data, digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]
    score_logistic.append(get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_train, X_test, y_train, y_test))
    score_svc.append(get_score(SVC(gamma='scale'), X_train, X_test, y_train, y_test))
    score_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))

In [26]:
score_logistic

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]

In [27]:
score_svc

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [28]:
score_rf

[0.9232053422370617, 0.9616026711185309, 0.9181969949916527]

### Cross Validation Score
Cross validation do the same thing as KFold, more easily.

In [29]:
from sklearn.model_selection import cross_val_score

# function for cross validation score
def cross_func(model):
    return cross_val_score(model, digits.data, digits.target, cv=3)

### Logistic Regression

In [30]:
cross_func(LogisticRegression(solver='liblinear',multi_class='ovr'))

array([0.89482471, 0.95325543, 0.90984975])

### Support Vector Machine (SVM)

In [31]:
cross_func(SVC(gamma='scale'))

array([0.96494157, 0.97996661, 0.96494157])

### Random Forest Classifier

In [38]:
cross_func(RandomForestClassifier(n_estimators=10))

array([0.91819699, 0.93155259, 0.87813022])

In [41]:
cross_func(RandomForestClassifier(n_estimators=40))

array([0.93155259, 0.95325543, 0.93155259])

In [35]:
cross_func(RandomForestClassifier(n_estimators=60))

array([0.93489149, 0.96327212, 0.92654424])

In [36]:
cross_func(RandomForestClassifier(n_estimators=100))

array([0.92821369, 0.94991653, 0.92487479])

**n_estimators=40** has high accuracy score: **0.93155259, 0.95325543, 0.93155259**

### Parameter tunning for Random Forest

In [43]:
score1 = cross_val_score(RandomForestClassifier(n_estimators=5), digits.data, digits.target, cv=10)
np.average(score1)

0.8731346989447548

In [44]:
score2 = cross_val_score(RandomForestClassifier(n_estimators=10), digits.data, digits.target, cv=10)
np.average(score2)

0.9159931719428925

In [45]:
score3 = cross_val_score(RandomForestClassifier(n_estimators=30), digits.data, digits.target, cv=10)
np.average(score3)

0.938227808814401

In [46]:
score4 = cross_val_score(RandomForestClassifier(n_estimators=40), digits.data, digits.target, cv=10)
np.average(score4)

0.9476970825574178

In [47]:
score5 = cross_val_score(RandomForestClassifier(n_estimators=60), digits.data, digits.target, cv=10)
np.average(score5)

0.9449162011173184

Here, we can see **n_estimators=40** meaning 40 trees in random forest gives best result.