# KNN for MNIST Classification

## Load the dataset

In [1]:
#Set up
import sklearn

# Common imports
import numpy as np
import os

# Seed NumPy's global random number generator so that anything drawing from
# np.random gives the same values on every run of this notebook. The
# train/validation split further down passes its own random_state, so it does
# not depend on this seed.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the 'mnist_784' dataset from OpenML. as_frame=False asks
# for array output instead of pandas objects.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)

# Show which fields the returned container exposes.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
# (number of samples, number of features per sample)
mnist["data"].shape

(70000, 784)

In [6]:
# Feature matrix: one row per sample.
X = mnist["data"]
# Label for each row of X.
y = mnist["target"]

## Split the Data into Train and Test Sets

The MNIST dataset is actually already split into a training set (the first 60,000 images) and a test set (the last 10,000 images):

In [7]:
# First 60,000 samples -> training set.
X_train, y_train = X[:60000], y[:60000]
# Everything after that -> test set.
X_test, y_test = X[60000:], y[60000:]

## Split the Train Data into Train and Validation Sets

The two hyperparameters of the KNN classifier are the number of neighbors, k, and the distance measure. Here we assume Euclidean distance as the distance measure, so the only tunable parameter is k. To find the best value of k, we will use the validation set.

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training samples as a validation set for tuning k.
# The fixed random_state makes the split repeatable.
trainData, valData, trainLabel, valLabel = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=84,
)

In [10]:
# Sanity check: report how many samples ended up in each split.
split_sizes = (
    ("training data points: {}", len(trainLabel)),
    ("validation data points: {}", len(valLabel)),
    ("testing data points: {}", len(y_test)),
)
for template, count in split_sizes:
    print(template.format(count))

training data points: 54000
validation data points: 6000
testing data points: 10000


## Select the best K with Train and Validation Sets

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
kVals = np.arange(1, 20, 2)

# Validation accuracy of each candidate, stored in the same order as kVals so
# the best k can be looked up programmatically instead of read off the output.
accuracies = []

for k in kVals:
    # Fit on the training split only; the validation split is used purely for
    # scoring.
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(trainData, trainLabel)

    # evaluate the model and update the accuracies list
    score = model.score(valData, valLabel)
    accuracies.append(score)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))

# k with the highest validation accuracy. np.argmax returns the first maximum,
# so a tie is resolved in favour of the smaller k.
best_k = int(kVals[int(np.argmax(accuracies))])


k=1, accuracy=97.48%
k=3, accuracy=97.62%
k=5, accuracy=97.28%
k=7, accuracy=97.10%
k=9, accuracy=97.03%
k=11, accuracy=97.03%
k=13, accuracy=96.77%
k=15, accuracy=96.60%
k=17, accuracy=96.43%
k=19, accuracy=96.20%


## Make Prediction on Test Set

In [16]:
# k = 3 had the highest validation accuracy in the sweep above, so use it for
# the final model and refit on the full 60,000-sample training set (the
# training and validation splits combined).
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X=X_train, y=y_train)

KNeighborsClassifier(n_neighbors=3)

In [19]:
# Predict a label for every sample in the held-out test set.
predictions = model.predict(X=X_test)

In [20]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support, comparing the test labels with
# the model's predictions.
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.96      1.00      0.98      1135
           2       0.98      0.97      0.97      1032
           3       0.96      0.97      0.96      1010
           4       0.98      0.97      0.97       982
           5       0.97      0.96      0.96       892
           6       0.98      0.99      0.98       958
           7       0.96      0.96      0.96      1028
           8       0.99      0.94      0.96       974
           9       0.96      0.96      0.96      1009

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

