# Cross-validation

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np



In [2]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [4]:
cross_val_score(KNeighborsClassifier(), X_train, y_train, cv=5)

array([ 0.97435897,  0.99632353,  0.98888889,  0.98876404,  0.98113208])

In [5]:
cross_val_score(KNeighborsClassifier(), X_train, y_train, cv=10)

array([ 0.99285714,  0.95714286,  1.        ,  0.99270073,  0.98518519,
        0.9924812 ,  0.99242424,  0.99236641,  0.98461538,  0.97692308])

Grid Searches
=================

Grid-Search with build-in cross validation

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

Define parameter grid:

In [7]:
import numpy as np

param_grid = {'C': 10. ** np.arange(-3, 3),
              'gamma' : 10. ** np.arange(-5, 0)}
              

np.set_printoptions(suppress=True)
print(param_grid)

{'C': array([   0.001,    0.01 ,    0.1  ,    1.   ,   10.   ,  100.   ]), 'gamma': array([ 0.00001,  0.0001 ,  0.001  ,  0.01   ,  0.1    ])}


In [8]:
grid_search = GridSearchCV(SVC(), param_grid, verbose=3, cv=5, n_jobs=-1)

A GridSearchCV object behaves just like a normal classifier.

In [9]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, gamma=1e-05 ............................................
[CV] C=0.001, gamma=1e-05 ............................................
[CV] C=0.001, gamma=1e-05 ............................................
[CV] C=0.001, gamma=1e-05 ............................................
[CV] C=0.001, gamma=1e-05 ............................................
[CV] C=0.001, gamma=0.0001 ...........................................
[CV] C=0.001, gamma=0.0001 ...........................................
[CV] C=0.001, gamma=0.0001 ...........................................
[CV] ............. C=0.001, gamma=1e-05, score=0.106227, total=   0.3s
[CV] ............. C=0.001, gamma=1e-05, score=0.106618, total=   0.3s
[CV] C=0.001, gamma=0.0001 ...........................................
[CV] ............. C=0.001, gamma=1e-05, score=0.107407, total=   0.3s
[CV] C=0.001, gamma=0.0001 ...........................................
[CV] C=0.001, g

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s


[CV] .............. C=0.001, gamma=0.01, score=0.107407, total=   0.3s
[CV] C=0.001, gamma=0.1 ..............................................
[CV] .............. C=0.001, gamma=0.01, score=0.106618, total=   0.4s
[CV] .............. C=0.001, gamma=0.01, score=0.108614, total=   0.3s
[CV] C=0.01, gamma=1e-05 .............................................
[CV] C=0.01, gamma=1e-05 .............................................
[CV] .............. C=0.001, gamma=0.01, score=0.105660, total=   0.3s
[CV] C=0.01, gamma=1e-05 .............................................
[CV] ............... C=0.001, gamma=0.1, score=0.106227, total=   0.3s
[CV] C=0.01, gamma=1e-05 .............................................
[CV] ............... C=0.001, gamma=0.1, score=0.106618, total=   0.3s
[CV] C=0.01, gamma=1e-05 .............................................
[CV] ............... C=0.001, gamma=0.1, score=0.107407, total=   0.3s
[CV] ............... C=0.001, gamma=0.1, score=0.108614, total=   0.3s
[CV] C

[CV] C=1.0, gamma=0.0001 .............................................
[CV] ............... C=1.0, gamma=1e-05, score=0.896296, total=   0.3s
[CV] C=1.0, gamma=0.0001 .............................................
[CV] C=1.0, gamma=0.0001 .............................................
[CV] ............... C=1.0, gamma=1e-05, score=0.876404, total=   0.2s
[CV] ............... C=1.0, gamma=1e-05, score=0.867925, total=   0.2s
[CV] ................. C=0.1, gamma=0.1, score=0.108614, total=   0.4s
[CV] C=1.0, gamma=0.001 ..............................................
[CV] C=1.0, gamma=0.0001 .............................................
[CV] C=1.0, gamma=0.001 ..............................................
[CV] ................. C=0.1, gamma=0.1, score=0.105660, total=   0.4s
[CV] C=1.0, gamma=0.001 ..............................................
[CV] .............. C=1.0, gamma=0.0001, score=0.970696, total=   0.1s
[CV] C=1.0, gamma=0.001 ..............................................
[CV] .

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    6.4s


[CV] ............... C=10.0, gamma=0.01, score=0.787546, total=   0.4s
[CV] C=10.0, gamma=0.1 ...............................................
[CV] ............... C=10.0, gamma=0.01, score=0.783088, total=   0.4s
[CV] C=10.0, gamma=0.1 ...............................................
[CV] ............... C=10.0, gamma=0.01, score=0.874074, total=   0.4s
[CV] ............... C=10.0, gamma=0.01, score=0.820225, total=   0.4s
[CV] C=100.0, gamma=1e-05 ............................................
[CV] C=100.0, gamma=1e-05 ............................................
[CV] ............... C=10.0, gamma=0.01, score=0.841509, total=   0.3s
[CV] C=100.0, gamma=1e-05 ............................................
[CV] ................ C=10.0, gamma=0.1, score=0.106227, total=   0.4s
[CV] C=100.0, gamma=1e-05 ............................................
[CV] ............. C=100.0, gamma=1e-05, score=0.974359, total=   0.1s
[CV] ................ C=10.0, gamma=0.1, score=0.106618, total=   0.4s
[CV] .

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    8.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([   0.001,    0.01 ,    0.1  ,    1.   ,   10.   ,  100.   ]), 'gamma': array([ 0.00001,  0.0001 ,  0.001  ,  0.01   ,  0.1    ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [10]:
grid_search.predict(X_test)

array([8, 5, 9, 0, 5, 2, 9, 6, 3, 5, 3, 7, 1, 3, 8, 4, 8, 7, 1, 8, 3, 7, 6,
       7, 7, 1, 4, 3, 9, 2, 9, 6, 4, 0, 8, 0, 0, 3, 3, 2, 6, 5, 7, 0, 0, 4,
       1, 6, 0, 6, 7, 6, 9, 3, 1, 0, 2, 5, 4, 5, 6, 9, 1, 7, 4, 2, 8, 8, 8,
       0, 5, 7, 1, 9, 5, 0, 5, 1, 9, 8, 2, 9, 0, 2, 2, 7, 0, 0, 6, 3, 6, 4,
       5, 7, 0, 4, 1, 5, 8, 9, 6, 7, 7, 7, 1, 0, 6, 6, 6, 5, 8, 7, 0, 6, 4,
       3, 0, 0, 1, 8, 2, 6, 3, 0, 6, 5, 2, 6, 0, 3, 8, 8, 7, 3, 7, 1, 0, 2,
       8, 8, 5, 3, 3, 3, 1, 9, 1, 7, 0, 7, 4, 5, 4, 4, 7, 7, 8, 7, 3, 6, 4,
       9, 2, 2, 3, 9, 2, 4, 5, 4, 5, 3, 7, 9, 4, 1, 8, 6, 8, 7, 9, 7, 5, 1,
       4, 2, 6, 6, 5, 6, 9, 6, 1, 9, 6, 7, 7, 1, 6, 9, 7, 7, 7, 3, 5, 2, 2,
       1, 5, 4, 9, 8, 7, 4, 2, 2, 2, 1, 6, 1, 9, 1, 4, 7, 2, 5, 8, 8, 0, 9,
       8, 3, 9, 4, 8, 6, 9, 3, 5, 0, 4, 3, 6, 4, 0, 2, 0, 5, 2, 2, 2, 5, 7,
       2, 6, 6, 3, 3, 2, 3, 6, 4, 0, 4, 0, 5, 5, 3, 2, 5, 3, 0, 6, 7, 7, 2,
       1, 1, 1, 3, 4, 9, 9, 6, 4, 3, 5, 7, 0, 2, 3, 8, 4, 5, 8, 5, 6, 4, 1,
       9, 1,

In [11]:
grid_search.score(X_test, y_test)

0.98666666666666669

In [12]:
grid_search.best_params_

{'C': 1.0, 'gamma': 0.001}

In [13]:
# We extract just the scores

scores = grid_search.cv_results_['mean_test_score']
scores = np.array(scores).reshape(6, 5)

plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(5), param_grid['gamma'])
plt.yticks(np.arange(6), param_grid['C']);

<IPython.core.display.Javascript object>

# Exercises
Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier.

In [28]:
param_grid = {'n_neighbors': np.arange(1, 10),
             'weights': ['uniform', 'distance']}

from sklearn.neighbors import KNeighborsClassifier
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=3, cv=5, n_jobs=23)
grid_search.fit(X_train, y=y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] n_neighbors=1, weights=uniform ..................................
[CV] n_neighbors=1, weights=uniform ..................................
[CV] n_neighbors=1, weights=uniform ..................................
[CV] n_neighbors=1, weights=uniform ..................................
[CV] n_neighbors=1, weights=distance .................................
[CV] n_neighbors=1, weights=uniform ..................................
[CV] n_neighbors=1, weights=distance .................................
[CV] n_neighbors=1, weights=distance .................................
[CV] ... n_neighbors=1, weights=uniform, score=0.974359, total=   0.0s
[CV] n_neighbors=1, weights=distance .................................
[CV] ... n_neighbors=1, weights=uniform, score=0.996324, total=   0.0s
[CV] ... n_neighbors=1, weights=uniform, score=0.996296, total=   0.0s
[CV] n_neighbors=1, weights=distance .................................
[CV] ... n_neigh

[CV] n_neighbors=9, weights=distance .................................
[CV] ... n_neighbors=6, weights=uniform, score=0.974359, total=   0.1s
[CV] .. n_neighbors=5, weights=distance, score=0.988764, total=   0.2s
[CV] .. n_neighbors=6, weights=distance, score=0.996324, total=   0.1s
[CV] ... n_neighbors=6, weights=uniform, score=0.973585, total=   0.2s
[CV] ... n_neighbors=6, weights=uniform, score=0.988889, total=   0.1s
[CV] n_neighbors=6, weights=uniform ..................................
[CV] n_neighbors=6, weights=distance .................................
[CV] n_neighbors=5, weights=distance .................................
[CV] n_neighbors=6, weights=uniform ..................................
[CV] n_neighbors=6, weights=distance .................................
[CV] .. n_neighbors=6, weights=distance, score=0.988764, total=   0.1s
[CV] n_neighbors=6, weights=distance .................................


[Parallel(n_jobs=23)]: Done  45 out of  90 | elapsed:    1.7s remaining:    1.7s


[CV] ... n_neighbors=7, weights=uniform, score=0.988889, total=   0.2s
[CV] ... n_neighbors=7, weights=uniform, score=0.978022, total=   0.2s
[CV] n_neighbors=7, weights=uniform ..................................
[CV] n_neighbors=7, weights=uniform ..................................
[CV] ... n_neighbors=7, weights=uniform, score=0.977358, total=   0.1s
[CV] n_neighbors=7, weights=distance .................................
[CV] ... n_neighbors=8, weights=uniform, score=0.988889, total=   0.1s
[CV] .. n_neighbors=7, weights=distance, score=0.992647, total=   0.2s
[CV] n_neighbors=7, weights=distance .................................
[CV] ... n_neighbors=8, weights=uniform, score=0.978022, total=   0.1s
[CV] n_neighbors=8, weights=uniform ..................................
[CV] .. n_neighbors=7, weights=distance, score=0.988764, total=   0.1s
[CV] n_neighbors=8, weights=uniform ..................................
[CV] n_neighbors=7, weights=distance .................................
[CV] .

[Parallel(n_jobs=23)]: Done  76 out of  90 | elapsed:    2.9s remaining:    0.5s
[Parallel(n_jobs=23)]: Done  90 out of  90 | elapsed:    2.9s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=23,
       param_grid={'n_neighbors': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [22]:
grid_search.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

In [27]:
import sys
print(sys.version_info)
import multiprocessing
print(multiprocessing.get_start_method())

sys.version_info(major=3, minor=6, micro=0, releaselevel='final', serial=0)
fork


In [16]:
grid_search.best_score_

0.98663697104677062

In [17]:
grid_search.score(X_train, y_train)

1.0

In [18]:
grid_search.score(X_test, y_test)

0.97999999999999998

In [19]:
# For a single parameter (as in n_neighbors)
#scores = grid_search.cv_results_['mean_test_score']
#plt.figure()
#plt.plot(scores, 'p')

scores = grid_search.cv_results_['mean_test_score']
scores = np.array(scores).reshape(len(param_grid['n_neighbors']), len(param_grid['weights']))
#plt.figure()
plt.matshow(scores)
plt.ylabel('n_neighbors')
plt.xlabel('weights')
plt.colorbar()
plt.yticks(np.arange(len(param_grid['n_neighbors'])), param_grid['n_neighbors'])
plt.xticks(np.arange(len(param_grid['weights'])), param_grid['weights']);

<IPython.core.display.Javascript object>

In [20]:
# %load solutions/grid_search_k_neighbors.py