In [0]:
%matplotlib inline


Nearest neighbor classification of the MNIST digits dataset
=============================================================

Plot a random sample of images from the digits dataset, then classify the digits with k-nearest neighbors.



In [0]:
import numpy as np
from sklearn.datasets import fetch_openml

# `fetch_mldata` has been removed from scikit-learn (and the mldata.org service
# it relied on is offline), so the MNIST data is fetched from OpenML instead.
# as_frame=False keeps `data` and `target` as NumPy arrays instead of pandas objects.
digits = fetch_openml('mnist_784', version=1, as_frame=False)

# Each row of `data` is one flattened image; also store it in 28x28 form for plotting.
digits.images = np.reshape(digits.data, (digits.data.shape[0], 28, 28))
# OpenML delivers the class labels as strings, so convert them to integers.
digits.target = np.asarray(digits.target).astype(np.int64)
print(digits.data.shape)

Plot the data: images of digits
-------------------------------


In [0]:
from matplotlib import pyplot as plt

# Square canvas (size given in inches) with the subplots packed edge to edge.
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
print("%d images" % len(digits.target))

# Pick 64 random sample indices (drawn with replacement) and lay the
# corresponding images out on an 8x8 grid without axis ticks.
p = np.random.randint(0, len(digits.data), 64)
for i, idx in enumerate(p):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[idx], cmap=plt.cm.gray, interpolation='nearest')
    # overlay the true label in the top-left corner of the image
    ax.text(0, 7, str(digits.target[idx]), color='white')

Classify with k-nearest neighbors
----------------------------------



In [0]:
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split

# Fixed seed so that the train/test split -- and therefore every score printed
# from `predicted` / `expected` -- is the same on each fresh run of the notebook.
SEED = 0

# split the data into training and test sets (200 samples are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=200, random_state=SEED)

# train the model
k = 7   # try k = 1, 3, 7, 15, 31
clf = neighbors.KNeighborsClassifier(n_neighbors=k)
clf.fit(X_train, y_train)

# use the model to predict the labels of the test data
predicted = clf.predict(X_test)
expected = y_test

# Plot the prediction
fig = plt.figure(figsize=(6, 6))  # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

# Turn the flat test vectors back into images once, rather than on every
# loop iteration.
test_images = X_test.reshape(-1, digits.images.shape[1], digits.images.shape[2])

# plot the digits: at most 64 of them, fewer if the test set is smaller
for i in range(min(64, len(test_images))):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(test_images[i], cmap=plt.cm.gray, interpolation='nearest')

    # true label on the left (white); predicted label on the right,
    # light green when it matches the true label and red when it does not
    ax.text(0, 7, str(expected[i]), color='white')
    is_correct = predicted[i] == expected[i]
    ax.text(21, 7, str(predicted[i]), color='#a0ffa0' if is_correct else 'red')

# the number of correct matches / the total number of data points
matches = (predicted == expected)
print("%d / %d = %2.1f %%" % (matches.sum(), len(matches), 100*matches.sum()/float(len(matches))))

Quantify the performance in detail
----------------------------------



Print the classification report



In [0]:
from sklearn import metrics

# Text summary of the classification metrics for the test-set predictions,
# comparing the predicted labels against the true ones.
print(metrics.classification_report(y_true=expected, y_pred=predicted))

Print the confusion matrix



In [0]:
# Confusion matrix of the test-set predictions against the true labels.
print(metrics.confusion_matrix(y_true=expected, y_pred=predicted))

# Display any figures that have not been rendered yet.
plt.show()