In [13]:
# K-Nearest Neighbor Classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from time import time
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load scikit-learn's bundled handwritten-digits dataset via load_digits().
# NOTE: despite the variable name, this is the small digits set shipped with
# scikit-learn, not the full MNIST database.
mnist = datasets.load_digits()
# Wrap the feature matrix in a DataFrame and report (n_samples, n_features).
X = pd.DataFrame(data=mnist.data)
print(X.shape)

(1797, 64)


In [14]:
# Hold out 25% of all samples as the final test set (seeded for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(
    np.array(mnist.data),
    mnist.target,
    test_size=0.25,
    random_state=42,
)

# Carve 25% of the remaining training data off as a validation set,
# used only for choosing k.
X_train, valData, Y_train, valLabels = train_test_split(
    X_train,
    Y_train,
    test_size=0.25,
    random_state=84,
)

In [15]:
# Candidate neighbourhood sizes: odd values of k from 1 through 29.
kVals = range(1, 30, 2)
accuracies = []
# Iterate over kVals itself (instead of re-spelling the range) so that
# accuracies[j] always lines up with kVals[j] in the lookup below.
for k in kVals:
    # train the classifier with the current value of `k`
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, Y_train)

    # score on the validation split, report it, and record it
    score = model.score(valData, valLabels)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)

# largest accuracy
# np.argmax returns the index of the first maximum, so ties resolve to the
# smallest k among the best-scoring candidates
i = np.argmax(accuracies)
best_k = kVals[i]
print("k=%d achieved highest accuracy of %.2f%% on validation data" % (best_k,
    accuracies[i] * 100))


# Now that the best value of k is known, re-train the classifier with it
model = KNeighborsClassifier(n_neighbors=best_k)
t0 = time()
model.fit(X_train, Y_train)
print("training time:", round(time()-t0, 5), "s")
# Predict labels for the test set (timed separately from fitting)
t1 = time()
predictions = model.predict(X_test)
print("predicting time:", round(time()-t1, 5), "s")
# Evaluate performance of the model for each of the digits
print("Classification Report")
print(classification_report(Y_test, predictions))

print("Overall Accuracy")
print(accuracy_score(Y_test, predictions)*100,'%')

print("Confusion Matrix")
print(confusion_matrix(Y_test, predictions))

k=1, accuracy=98.81%
k=3, accuracy=98.22%
k=5, accuracy=97.63%
k=7, accuracy=97.63%
k=9, accuracy=97.03%
k=11, accuracy=97.63%
k=13, accuracy=97.03%
k=15, accuracy=97.03%
k=17, accuracy=96.44%
k=19, accuracy=96.14%
k=21, accuracy=95.85%
k=23, accuracy=95.55%
k=25, accuracy=95.55%
k=27, accuracy=94.96%
k=29, accuracy=95.25%
k=1 achieved highest accuracy of 98.81% on validation data
training time: 0.00298 s
predicting time: 0.04687 s
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.97      1.00      0.99        37
           2       1.00      1.00      1.00        38
           3       0.98      0.98      0.98        46
           4       0.98      0.98      0.98        55
           5       0.98      0.98      0.98        59
           6       1.00      1.00      1.00        45
           7       1.00      0.98      0.99        41
           8       0.97      0.97      0.97        38
 

In [8]:
# load the iris dataset
load_iris_dataset = datasets.load_iris()
# hold out 25% of the samples for testing (fixed seed for repeatability)
X_train, X_test, Y_train, Y_test = train_test_split(load_iris_dataset.data, load_iris_dataset.target, test_size=0.25
                                                    , random_state=0)
# fit a k-nearest neighbor model (default hyper-parameters) to the training data
model = KNeighborsClassifier()
t0 = time()
model.fit(X_train,Y_train)
print("training time:", round(time()-t0, 5), "s")
print(model)
# make predictions; measure from t1 (not t0) so the reported figure covers
# only predict() and excludes the fit and the print above
t1 = time()
predicted = model.predict(X_test)
print("predicting time:", round(time()-t1, 5), "s")
# summarize the fit of the model
print(classification_report(Y_test, predicted))
print(accuracy_score(Y_test, predicted)*100,'%')
print("Confusion Matrix")
print(confusion_matrix(Y_test, predicted))

training time: 0.001 s
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
predicting time: 0.00299 s
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.94      0.97        16
           2       0.90      1.00      0.95         9

   micro avg       0.97      0.97      0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38

97.36842105263158 %
Confusion Matrix
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]


In [10]:
# load the wine dataset
load_wine_dataset = datasets.load_wine()
# hold out 25% of the samples for testing (fixed seed for repeatability)
X_train, X_test, Y_train, Y_test = train_test_split(load_wine_dataset.data, load_wine_dataset.target, test_size=0.25
                                                    , random_state=0)
# fit a k-nearest neighbor model (default hyper-parameters) to the training data
# NOTE(review): the raw features are passed in unscaled; since KNN is
# distance-based this may explain the comparatively low accuracy here --
# consider standardising the features and confirm.
model = KNeighborsClassifier()
t0 = time()
model.fit(X_train,Y_train)
print("training time:", round(time()-t0, 5), "s")
print(model)
# make predictions; measure from t1 (not t0) so the reported figure covers
# only predict() and excludes the fit and the print above
t1 = time()
predicted = model.predict(X_test)
print("predicting time:", round(time()-t1, 5), "s")
# summarize the fit of the model
print(classification_report(Y_test, predicted))
print(accuracy_score(Y_test, predicted)*100,'%')
print(confusion_matrix(Y_test, predicted))

training time: 0.001 s
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
predicting time: 0.00299 s
              precision    recall  f1-score   support

           0       0.82      0.88      0.85        16
           1       0.81      0.81      0.81        21
           2       0.29      0.25      0.27         8

   micro avg       0.73      0.73      0.73        45
   macro avg       0.64      0.64      0.64        45
weighted avg       0.72      0.73      0.73        45

73.33333333333333 %
[[14  1  1]
 [ 0 17  4]
 [ 3  3  2]]


In [11]:
# load the breast cancer dataset
dataset_breast_cancer = datasets.load_breast_cancer()
# hold out 25% of the samples for testing (fixed seed for repeatability)
X_train, X_test, Y_train, Y_test = train_test_split(dataset_breast_cancer.data, dataset_breast_cancer.target, test_size=0.25
                                                    , random_state=0)
# fit a k-nearest neighbor model (default hyper-parameters) to the training data
model = KNeighborsClassifier()
t0 = time()
model.fit(X_train,Y_train)
print("training time:", round(time()-t0, 5), "s")
print(model)
# make predictions; measure from t1 (not t0) so the reported figure covers
# only predict() and excludes the fit and the print above
t1 = time()
predicted = model.predict(X_test)
print("predicting time:", round(time()-t1, 5), "s")
# summarize the fit of the model
print(classification_report(Y_test, predicted))
print(accuracy_score(Y_test, predicted)*100,'%')
print(confusion_matrix(Y_test, predicted))

training time: 0.001 s
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
predicting time: 0.00399 s
              precision    recall  f1-score   support

           0       0.91      0.92      0.92        53
           1       0.96      0.94      0.95        90

   micro avg       0.94      0.94      0.94       143
   macro avg       0.93      0.93      0.93       143
weighted avg       0.94      0.94      0.94       143

93.7062937062937 %
[[49  4]
 [ 5 85]]
