Cross-validation
---

In [1]:
from sklearn import datasets

# Fetch the iris data set through sklearn's datasets module
iris = datasets.load_iris()

# Pull the X/y arrays out of the loaded data set
X, y = iris['data'], iris['target']

In [2]:
from sklearn.model_selection import KFold

# Create k-fold object with 3 splits; no shuffle or random_state is passed
# here, so the splitter runs with its defaults for those options
kfold = KFold(n_splits=3)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardize the features, then fit a logistic regression, chained together
# as a single pipeline estimator.
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version before upgrading.
logreg = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear', multi_class='ovr', C=1000),
)

In [4]:
from sklearn.model_selection import cross_validate

# Cross-validate the pipeline with the k-fold splitter, asking for the
# train scores as well
scores = cross_validate(
    logreg,
    X,
    y,
    cv=kfold,
    return_train_score=True,
)

# The result stored in "scores" is a dictionary; list its entries
scores.keys()

dict_keys(['fit_time', 'score_time', 'test_score', 'train_score'])

In [5]:
# Print the per-fold scores: train first, then test
for caption, key in (('Train scores:', 'train_score'),
                     ('Test scores:', 'test_score')):
    print(caption, scores[key])

Train scores: [0.98 1.   1.  ]
Test scores: [0. 0. 0.]


In [6]:
# Target variable: display the whole label array to inspect the order in
# which the class labels are stored
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
# Rebuild the k-fold splitter, this time shuffling with a fixed seed
kfold = KFold(
    n_splits=3,
    shuffle=True,
    random_state=0,
)

In [8]:
# Cross-validate again using the current k-fold splitter
scores = cross_validate(
    logreg,
    X,
    y,
    cv=kfold,
    return_train_score=True,
)

# Print the per-fold scores: train first, then test
for caption, key in (('Train scores:', 'train_score'),
                     ('Test scores:', 'test_score')):
    print(caption, scores[key])

Train scores: [0.98 0.96 0.98]
Test scores: [0.98 0.96 0.96]


In [9]:
# Cross-validate once more, passing the number of folds directly as cv
# instead of a splitter object
scores = cross_validate(
    logreg,
    X,
    y,
    cv=3,
    return_train_score=True,
)

# Print the per-fold scores: train first, then test
for caption, key in (('Train scores:', 'train_score'),
                     ('Test scores:', 'test_score')):
    print(caption, scores[key])

Train scores: [0.94949495 1.         0.97058824]
Test scores: [0.98039216 0.92156863 0.97916667]


In [10]:
import numpy as np

# Cross-validate with cv=10
scores = cross_validate(
    logreg,
    X,
    y,
    cv=10,
    return_train_score=True,
)

# Pull out the per-fold train/test scores
train_scores, test_scores = scores['train_score'], scores['test_score']

# Summarize each set of scores by its mean and standard deviation
train_mean, train_std = np.mean(train_scores), np.std(train_scores)
print('Train mean: {:.3f} std: {:.3f}'.format(train_mean, train_std))

test_mean, test_std = np.mean(test_scores), np.std(test_scores)
print('Test mean: {:.3f} std: {:.3f}'.format(test_mean, test_std))

Train mean: 0.975 std: 0.008
Test mean: 0.960 std: 0.061


In [11]:
from sklearn.model_selection import ShuffleSplit

# "Shuffle split" strategy: 10 splits, test_size of 20, fixed seed
cv_type = ShuffleSplit(
    n_splits=10,
    test_size=20,
    random_state=0,
)

# Cross-validate with that splitter
scores = cross_validate(
    logreg,
    X,
    y,
    cv=cv_type,
    return_train_score=True,
)

# Summarize the test scores by their mean and standard deviation
test_scores = scores['test_score']
test_mean, test_std = np.mean(test_scores), np.std(test_scores)
print('Test mean: {:.3f} std: {:.3f}'.format(test_mean, test_std))

Test mean: 0.955 std: 0.052
