# Cross Validation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
from sklearn.datasets import load_digits

In [3]:
# Load scikit-learn's digits dataset; the cells that follow read its
# `.data` (feature matrix) and `.target` (labels) attributes.
digits = load_digits()

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. No random_state is passed, so every
# execution of this cell draws a different split.
x_train, x_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
)

In [5]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression with the liblinear solver, trained on the
# training split; the final expression is its accuracy on the held-out split.
lr = LogisticRegression(solver='liblinear', multi_class='ovr').fit(x_train, y_train)
lr.score(x_test, y_test)

0.9592592592592593

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 40 trees, trained on the training split; the final
# expression is its accuracy on the held-out split.
rc = RandomForestClassifier(n_estimators=40).fit(x_train, y_train)
rc.score(x_test, y_test)

0.9833333333333333

In [7]:
from sklearn.svm import SVC

# Support vector classifier with gamma='auto', trained on the training split;
# the final expression is its accuracy on the held-out split.
# NOTE(review): the recorded score is far below the other two models, possibly
# because gamma='auto' is applied to unscaled pixel features (confirm).
svc = SVC(gamma='auto').fit(x_train, y_train)
svc.score(x_test, y_test)

0.4148148148148148

From the results above we can see that the accuracy changes every time the train_test_split cell is re-executed, because each run produces a different random split of the data.

# KFold cross validation


Basic example

In [8]:
from sklearn.model_selection import KFold
# Splitter that divides the samples into 3 folds; each fold serves once as the
# test set while the remaining folds form the training set.
kf  = KFold(n_splits=3)

In [9]:
# Toy input: split() yields arrays of positions, not the values themselves,
# so only the length of this list (11) influences what is printed.
toy_samples = [1,2,3,4,5,66,7,7,7,8,9]
for train_index, test_index in kf.split(toy_samples):
    print(train_index, test_index)

[ 4  5  6  7  8  9 10] [0 1 2 3]
[ 0  1  2  3  8  9 10] [4 5 6 7]
[0 1 2 3 4 5 6 7] [ 8  9 10]


From our examples

In [10]:
def getscore(model,x_train,x_test,y_train,y_test):
    """Fit ``model`` on the training split and return its score on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        The estimator to train; it is fitted in place.
    x_train, y_train
        Features and labels passed to ``model.fit``.
    x_test, y_test
        Features and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(x_test, y_test)`` returns.
    """
    model.fit(x_train, y_train)
    held_out_score = model.score(x_test, y_test)
    return held_out_score

In [11]:
from sklearn.model_selection import StratifiedKFold

# Stratified 3-fold splitter over the digits data.
folds = StratifiedKFold(n_splits=3)

# One list of per-fold test accuracies for each model family.
scores_logistic, scores_svm, scores_rf = [], [], []

# Pair every score list with a factory so each fold trains a brand-new,
# unfitted estimator. The tuple order (logistic regression, SVM, random forest)
# is the order in which the models are fitted within a fold.
candidates = (
    (scores_logistic, lambda: LogisticRegression(solver='liblinear', multi_class='ovr')),
    (scores_svm, lambda: SVC(gamma='auto')),
    (scores_rf, lambda: RandomForestClassifier(n_estimators=40)),
)

for train_index, test_index in folds.split(digits.data, digits.target):
    # Slice features and labels for this fold.
    x_train, y_train = digits.data[train_index], digits.target[train_index]
    x_test, y_test = digits.data[test_index], digits.target[test_index]
    for bucket, make_model in candidates:
        bucket.append(getscore(make_model(), x_train, x_test, y_train, y_test))

In [12]:
# Per-fold test accuracy of logistic regression over the 3 stratified folds.
scores_logistic

[0.8953488372093024, 0.9499165275459098, 0.9093959731543624]

In [13]:
# Per-fold test accuracy of SVC(gamma='auto') over the 3 stratified folds.
# NOTE(review): far lower than the other two models, possibly because
# gamma='auto' is applied to unscaled pixel features (confirm).
scores_svm

[0.39368770764119604, 0.41068447412353926, 0.4597315436241611]

In [14]:
# Per-fold test accuracy of the 40-tree random forest over the 3 stratified folds.
scores_rf

[0.9235880398671097, 0.9365609348914858, 0.9278523489932886]

Alternatively, we can simply call the `cross_val_score` function from scikit-learn, which runs the split / fit / score loop for us.

In [60]:
from sklearn.model_selection import cross_val_score

# cross_val_score runs the split / fit / score loop internally. Passing the
# StratifiedKFold object `folds` as `cv` makes it use that splitter.
cv = cross_val_score(
    LogisticRegression(solver='liblinear', multi_class='ovr'),
    digits.data,
    digits.target,
    cv=folds,
)
cv

array([0.89534884, 0.94991653, 0.90939597])

In [53]:
# Cross-validated accuracy of SVC(gamma='auto'); cv=5 requests five folds.
cv = cross_val_score(
    SVC(gamma='auto'),
    digits.data,
    digits.target,
    cv=5,
)
cv

array([0.43406593, 0.40883978, 0.41504178, 0.48739496, 0.49859155])

In [56]:
# Cross-validated accuracy of a 40-tree random forest; cv=5 requests five folds.
cv = cross_val_score(
    RandomForestClassifier(n_estimators=40),
    digits.data,
    digits.target,
    cv=5,
)
cv

array([0.91483516, 0.90055249, 0.95264624, 0.95238095, 0.92112676])

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Stratified 3-fold cross validation written out step by step, scoring each
# fold with accuracy_score on explicit predictions.
x, y = digits.data, digits.target
model = LogisticRegression(multi_class='ovr', solver='liblinear')
folds = StratifiedKFold(n_splits=3)

accuracy = []  # one accuracy value per fold
# folds.split yields a (train indices, test indices) pair for every fold.
for train_index, test_index in folds.split(x, y):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]
    # The same estimator object is re-fitted on each fold's training data.
    predictions = model.fit(x_train, y_train).predict(x_test)
    score = accuracy_score(y_test, predictions)
    accuracy.append(score)

In [22]:
# Per-fold accuracies collected by the manual stratified 3-fold loop.
accuracy

[0.8953488372093024, 0.9499165275459098, 0.9093959731543624]

In [23]:
import numpy as np

# Average the per-fold accuracies into a single summary figure.
acc = np.mean(accuracy)

In [24]:
# Mean of the per-fold accuracies.
acc

0.9182204459698582

Hence the mean accuracy for StratifiedKFold cross validation is about 91.8%.

In [38]:
# Leave-one-out cross validation (LOOCV) of a random forest on the sonar dataset.
from numpy import mean, std
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Download the CSV. header=None tells pandas not to treat the first row as
# column names, so every row is read as data.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)
# Underlying array of the table; it is sliced into inputs and outputs afterwards.
data = dataframe.values
# split into inputs and outputs


In [42]:
# Every column except the last is an input feature; the last column is the label.
X = data[:, :-1]
# Note: this rebinds `y`, which earlier held digits.target.
y = data[:, -1]
print(X.shape, y.shape)

(208, 60) (208,)


In [51]:
# create loocv procedure
cv = LeaveOneOut()
# create model
model = RandomForestClassifier()
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.798 (0.401)
