# KFold Cross Validation Python Tutorial
---

In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [63]:
# Load the digits dataset bundled with scikit-learn; the feature matrix is
# read from `digits.data` and the class labels from `digits.target` below.
digits = load_digits()

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the digits for testing. Fixing `random_state` makes the
# split (and every score computed from it) repeatable after a kernel restart;
# without it each run draws a different split and reports different scores.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.2, random_state=124
)

In [65]:
X_train.shape

(1437, 64)

In [66]:
y_train.shape

(1437,)

In [67]:
X_test.shape

(360, 64)

# Logistic Regression

In [68]:
# Baseline 1: one-vs-rest logistic regression, trained on the training split
# and scored on the hold-out split.
lr = LogisticRegression(multi_class='ovr', solver='liblinear').fit(X_train, y_train)
lr.score(X_test, y_test)

0.9694444444444444

# SVM

In [69]:
# Baseline 2: support vector classifier with gamma='auto', scored on the
# hold-out split.
svm = SVC(gamma='auto').fit(X_train, y_train)
svm.score(X_test, y_test)

0.375

# Random Forest

In [70]:
# Baseline 3: random forest with 40 trees. The forest is randomised, so a
# fixed `random_state` is needed for the reported score to be repeatable.
rf = RandomForestClassifier(n_estimators=40, random_state=124)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.975

# KFold cross validation

**Basic example**

In [71]:
from sklearn.model_selection import KFold

# Splitter producing three folds; shuffling is left at its default (off).
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [72]:
# Show which positions land in the train and test parts of each fold for a
# toy sequence of nine items.
for train_index, test_index in kf.split(list(range(1, 10))):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


# Use KFold for our digits example

In [73]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its test-split score.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place by this call.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : evaluation features and labels passed to ``score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [74]:
from sklearn.model_selection import StratifiedKFold

# Three stratified folds over the full digits dataset.
folds = StratifiedKFold(n_splits=3)

# One score per fold for each model.
scores_logistic = []
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data, digits.target):
    # Fold-local names: reusing X_train/X_test/y_train/y_test here would
    # silently overwrite the hold-out split created earlier in the notebook.
    X_fold_train, X_fold_test = digits.data[train_index], digits.data[test_index]
    y_fold_train, y_fold_test = digits.target[train_index], digits.target[test_index]
    fold = (X_fold_train, X_fold_test, y_fold_train, y_fold_test)

    scores_logistic.append(
        get_score(LogisticRegression(solver='liblinear', multi_class='ovr'), *fold)
    )
    # NOTE(review): C=10 differs from the SVC(gamma='auto') used in the other
    # SVM cells of this notebook, so these scores are not directly comparable
    # with the cross_val_score SVM results.
    scores_svm.append(get_score(SVC(C=10, gamma='auto'), *fold))
    # Seeded so the forest's per-fold scores are repeatable.
    scores_rf.append(
        get_score(RandomForestClassifier(n_estimators=40, random_state=124), *fold)
    )

In [76]:
scores_logistic

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]

In [77]:
scores_svm

[0.4040066777963272, 0.43906510851419034, 0.5375626043405676]

In [78]:
scores_rf

[0.9148580968280468, 0.9348914858096828, 0.9265442404006677]

# cross_val_score function

In [80]:
from sklearn.model_selection import cross_val_score

# LogisticRegression model performance using cross_val_score

In [81]:
# Same logistic-regression setup as in the manual fold loop, evaluated with
# 3-fold cross-validation in a single call.
cross_val_score(
    LogisticRegression(multi_class='ovr', solver='liblinear'),
    digits.data,
    digits.target,
    cv=3,
)

array([0.89482471, 0.95325543, 0.90984975])

# SVM model performance using cross_val_score


In [82]:
# 3-fold cross-validation of an SVC with gamma='auto' on the full digits data.
cross_val_score(
    SVC(gamma='auto'),
    digits.data,
    digits.target,
    cv=3,
)

array([0.38063439, 0.41068447, 0.51252087])

# Random forest performance using cross_val_score

In [83]:
# 3-fold cross-validation of a 40-tree random forest. `random_state` is fixed
# because the forest is randomised and would otherwise score differently on
# every run.
cross_val_score(
    RandomForestClassifier(n_estimators=40, random_state=124),
    digits.data,
    digits.target,
    cv=3,
)

array([0.92487479, 0.94657763, 0.92988314])

# Parameter tuning using K fold cross validation

In [84]:
# 10-fold cross-validation with 5 trees. The seed is fixed so that differences
# between the n_estimators settings reflect the parameter, not run-to-run noise.
scores1 = cross_val_score(
    RandomForestClassifier(n_estimators=5, random_state=124),
    digits.data,
    digits.target,
    cv=10,
)
np.average(scores1)

0.8575481067659838

In [86]:
# 10-fold cross-validation with 20 trees. The seed is fixed so that differences
# between the n_estimators settings reflect the parameter, not run-to-run noise.
scores2 = cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=124),
    digits.data,
    digits.target,
    cv=10,
)
np.average(scores2)

0.9371073867163252

In [87]:
# 10-fold cross-validation with 30 trees. The seed is fixed so that differences
# between the n_estimators settings reflect the parameter, not run-to-run noise.
scores3 = cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=124),
    digits.data,
    digits.target,
    cv=10,
)
np.average(scores3)

0.9437926753569211

In [88]:
# 10-fold cross-validation with 40 trees. The seed is fixed so that differences
# between the n_estimators settings reflect the parameter, not run-to-run noise.
scores4 = cross_val_score(
    RandomForestClassifier(n_estimators=40, random_state=124),
    digits.data,
    digits.target,
    cv=10,
)
np.average(scores4)

0.9488144009931719

Here we used cross_val_score to tune our random forest classifier and found that, among the values tried (5, 20, 30 and 40 trees), about 40 trees gives the best average score.

# Exercise 

Use the iris flower dataset from the sklearn library and apply cross_val_score to the following models to measure the performance of each. In the end, figure out which model performs best.

1. Logistic Regression
2. SVM
3. Decision Tree
4. Random Forest

In [89]:
from sklearn.datasets import load_iris

# Load the iris dataset used for the exercise; `iris.data`,
# `iris.feature_names` and `iris.target` are read in the cells below.
iris = load_iris()

In [90]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [91]:
# Tabular view of the iris measurements, one column per feature name.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [92]:
# Attach the class labels as a 'target' column next to the features.
df = df.assign(target=iris.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [93]:
# Feature matrix: every column except the label.
X = df.drop(columns='target')
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [95]:
# Label vector: the 'target' column on its own.
y = df.loc[:, 'target']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [96]:
# Seeded 80/20 split of the iris data; the models below are cross-validated
# on the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=124,
    test_size=0.2,
)

In [98]:
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
131,7.9,3.8,6.4,2.0
143,6.8,3.2,5.9,2.3
75,6.6,3.0,4.4,1.4
14,5.8,4.0,1.2,0.2
29,4.7,3.2,1.6,0.2


In [99]:
y_train.head()

131    2
143    2
75     1
14     0
29     0
Name: target, dtype: int64

In [101]:
# 10-fold cross-validation of logistic regression on the iris training split.
# Stored under its own name so that `lr` keeps referring to the fitted
# LogisticRegression estimator from the digits section instead of being
# rebound to an array of scores.
iris_lr_scores = cross_val_score(
    LogisticRegression(solver='liblinear', multi_class='ovr'),
    X_train,
    y_train,
    cv=10,
)
np.average(iris_lr_scores)

0.9666666666666666

In [102]:
# 10-fold cross-validation of an SVC on the iris training split.
# Stored under its own name so that `svm` keeps referring to the fitted SVC
# estimator from the digits section instead of being rebound to an array of
# scores.
iris_svm_scores = cross_val_score(SVC(gamma='auto'), X_train, y_train, cv=10)
np.average(iris_svm_scores)

0.9916666666666666

In [104]:
# 10-fold cross-validation of a decision tree on the iris training split.
# The tree builder is randomised, so `random_state` is fixed to make the
# average repeatable; the result gets a descriptive name rather than `tree`.
iris_tree_scores = cross_val_score(
    DecisionTreeClassifier(random_state=124),
    X_train,
    y_train,
    cv=10,
)
np.average(iris_tree_scores)

0.9916666666666666

In [114]:
# 10-fold cross-validation of an 80-tree random forest on the iris training
# split. The result is not called `random`, which would collide with the name
# of the standard-library module, and the forest is seeded so the average is
# repeatable.
iris_rf_scores = cross_val_score(
    RandomForestClassifier(n_estimators=80, random_state=124),
    X_train,
    y_train,
    cv=10,
)
np.average(iris_rf_scores)

0.9833333333333332