# Applying Machine Learning Process: Cross-Validation

* Parameter tuning
* Feature selection
* Model selection

In [None]:
from sklearn.model_selection import KFold

# Simulate splitting a dataset of 25 observations into 5 consecutive folds.
# In the model_selection API the splitter only stores the configuration;
# the data (here just 25 placeholder items) is passed to split().
kf = KFold(n_splits=5, shuffle=False)

# Print the train/test index arrays produced by each of the 5 iterations.
print('{} {:^65} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, (train_idx, test_idx) in enumerate(kf.split(range(25)), start=1):
    print('{:^10} {} {:^30}'.format(iteration, str(train_idx), str(test_idx)))

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Load the iris data here so this cell does not depend on names that are
# only created further down the notebook.
iris = load_iris()
X = iris.data
y = iris.target

# 5-fold cross-validation of KNN with K=20; cross_val_score returns one
# accuracy value per fold.
knn = KNeighborsClassifier(n_neighbors=20)
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(scores)

In [None]:
# Average the per-fold accuracies into a single cross-validated estimate.
print(scores.mean())

Let's try varying the value for K.

In [None]:
# Evaluate KNN for every K from 1 to 30 with 10-fold cross-validation and
# keep the mean accuracy for each K.  Building the list in one expression
# avoids rebinding the notebook-level `knn` and `scores` names, so earlier
# cells that print `scores` still report their own result when re-run.
k_range = range(1, 31)
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, cv=10, scoring='accuracy'
    ).mean()
    for k in k_range
]

print(k_scores)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Plot the mean cross-validated accuracy (y-axis) for each K tried (x-axis).
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')

### Searching for Optimal Model Parameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate K values 1..30, materialised as a list so it can be printed
# and reused as a parameter grid entry.
k_range = [*range(1, 31)]
print(k_range)

In [None]:
# Map the estimator parameter name to the list of values to search over.
param_grid = {'n_neighbors': k_range}
print(param_grid)

In [None]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset in this cell so `iris` is guaranteed to exist before
# its features and labels are extracted.
iris = load_iris()
X = iris.data
y = iris.target

# Run 10-fold cross-validation for every value in param_grid, scoring by
# accuracy; return_train_score=True also records training-fold scores.
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=True)
grid.fit(X, y)
# Last expression of the cell: displays the full results dictionary.
grid.cv_results_

In [None]:
# Collect the mean test score of each parameter setting into a plain list,
# written as an explicit loop.
grid_mean_scores = []
for result in grid.cv_results_['mean_test_score']:
    grid_mean_scores.append(result)

In [None]:
# Copy the mean test scores (one per parameter setting) into a list in a
# single expression, then show them.
grid_mean_scores = list(grid.cv_results_['mean_test_score'])
print(grid_mean_scores)

In [None]:
# Plot the grid search's mean test score (y-axis) against each K (x-axis).
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')

In [None]:
# Report the winning configuration, one item per line: its score, the
# parameter values that produced it, and the corresponding estimator.
print(
    grid.best_score_,
    grid.best_params_,
    grid.best_estimator_,
    sep='\n',
)