In [42]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

In [43]:
# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# A classifier expects a 2-D (samples, features) matrix, so every image
# is flattened into a single row of pixel values.
print('before reshaping:', digits.images.shape)
n_samples = digits.images.shape[0]

# Keep the sample axis and let NumPy infer the feature axis (-1) from the
# remaining dimensions of each image.
X = digits.images.reshape(n_samples, -1)
y = digits.target

print('after reshaping:', X.shape)
print(X)
print(y)


before reshaping: (1797, 8, 8)
after reshaping: (1797, 64)
[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
[0 1 2 ... 8 9 8]


In [44]:
# Hold out 25% of the samples as a test set; the remaining 75% is used for
# training. A fixed random_state makes the shuffle (and so the split)
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Report the size of each partition, one per line.
print(len(X_train), len(X_test), sep='\n')

1347
450


In [45]:
# Baseline: k-nearest-neighbours with a hand-picked k, trained on the
# training split. fit() returns the estimator itself, so it can be chained.
clf = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# Predict the held-out test split and keep the ground truth alongside.
y_true = y_test
y_pred = clf.predict(X_test)

# Evaluation on the test split: confusion matrix, overall accuracy,
# macro-averaged F1, and the per-class report.
print(confusion_matrix(y_true, y_pred))
print('accuracy =', accuracy_score(y_true, y_pred))
print('f1_score =', metrics.f1_score(y_true, y_pred, average='macro'))
print(classification_report(y_true, y_pred))

[[37  0  0  0  0  0  0  0  0  0]
 [ 0 43  0  0  0  0  0  0  0  0]
 [ 0  0 43  0  0  0  0  1  0  0]
 [ 0  0  0 45  0  0  0  0  0  0]
 [ 0  0  0  0 37  0  0  1  0  0]
 [ 0  0  0  0  0 47  0  0  0  1]
 [ 0  0  0  0  0  0 52  0  0  0]
 [ 0  0  0  0  0  0  0 48  0  0]
 [ 0  4  0  2  0  0  0  1 41  0]
 [ 0  0  0  0  0  1  0  0  0 46]]
accuracy = 0.9755555555555555
f1_score = 0.9757924194139573
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.91      1.00      0.96        43
           2       1.00      0.98      0.99        44
           3       0.96      1.00      0.98        45
           4       1.00      0.97      0.99        38
           5       0.98      0.98      0.98        48
           6       1.00      1.00      1.00        52
           7       0.94      1.00      0.97        48
           8       1.00      0.85      0.92        48
           9       0.98      0.98      0.98        47

    accuracy 

In [46]:
# Tune k and the distance metric with a 3-fold cross-validated grid search
# on the training split, selecting by macro-averaged F1.
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=[{
        'n_neighbors': range(1, 10, 2),  # odd k values: 1, 3, 5, 7, 9
        'metric': ['euclidean', 'manhattan'],
    }],
    cv=3,
    scoring='f1_macro',
)
clf.fit(X_train, y_train)

# Predictions on the held-out test split come from the search object.
y_true = y_test
y_pred = clf.predict(X_test)

print("Best parameters set found on development set:", end="\n\n")
print(clf.best_params_, end="\n\n")

# Mean cross-validation score for every parameter combination tried.
print("Grid scores on development set:", end="\n\n")
means = clf.cv_results_['mean_test_score']
for mean, params in zip(means, clf.cv_results_['params']):
    print(f"{mean:0.3f} for {params!r}")
print()

# Per-class report on the test split.
print(classification_report(y_true, y_pred))

Best parameters set found on development set:

{'metric': 'euclidean', 'n_neighbors': 1}

Grid scores on development set:

0.983 for {'metric': 'euclidean', 'n_neighbors': 1}
0.981 for {'metric': 'euclidean', 'n_neighbors': 3}
0.980 for {'metric': 'euclidean', 'n_neighbors': 5}
0.980 for {'metric': 'euclidean', 'n_neighbors': 7}
0.977 for {'metric': 'euclidean', 'n_neighbors': 9}
0.975 for {'metric': 'manhattan', 'n_neighbors': 1}
0.976 for {'metric': 'manhattan', 'n_neighbors': 3}
0.975 for {'metric': 'manhattan', 'n_neighbors': 5}
0.970 for {'metric': 'manhattan', 'n_neighbors': 7}
0.964 for {'metric': 'manhattan', 'n_neighbors': 9}

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       1.00      1.00      1.00        43
           2       1.00      0.98      0.99        44
           3       0.96      1.00      0.98        45
           4       1.00      1.00      1.00        38
           5       0.98      0.