In [18]:
import pandas as pd
import numpy as np
import plotly.express as px
px.defaults.width = 600
import plotly.graph_objs as go

from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from scipy.ndimage import shift

In [5]:
# Fetch MNIST from OpenML as plain NumPy arrays (as_frame=False), not a DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist['data']
y = mnist['target']

# The first 60,000 rows form the training set; everything after is the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Standardise the training features. The scaler is fitted on the training
# split only; X_test is not transformed in this cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

1

In [6]:
# Base estimator handed to the hyper-parameter search. It is intentionally left
# unfitted here: the search clones the estimator for every candidate (and for
# the final refit), so fitting this instance up front would be discarded work.
knn_clf = KNeighborsClassifier()

# Search space: Minkowski power `p`, number of neighbours, and the tree
# structure used for neighbour lookup (2 * 3 * 2 = 12 candidates).
parameters = [{'p':[1, 2], 'n_neighbors':[ 5, 7, 9], 'algorithm':['ball_tree', 'kd_tree']}]

In [None]:
# Successive-halving grid search over `parameters`, scored by accuracy with
# 3-fold cross-validation. The resource grown between iterations is the number
# of training samples.
# NOTE(review): no random_state is passed, so the sample subsets drawn per
# iteration may differ between runs - confirm whether that is acceptable.
knn_search = HalvingGridSearchCV(
    estimator=knn_clf,
    param_grid=parameters,
    resource='n_samples',
    min_resources='exhaust',
    cv=3,
    scoring='accuracy',
    verbose=2,
)
knn_search.fit(X_train_scaled, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 6666
max_resources_: 60000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 6666
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=1; total time=  13.0s
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=1; total time=  13.3s
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=1; total time=  13.4s
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=2; total time=  12.8s
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=2; total time=  12.6s
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=2; total time=  12.8s
[CV] END ............algorithm=ball_tree, n_neighbors=7, p=1; total time=  13.0s
[CV] END ............algorithm=ball_tree, n_neighbors=7, p=1; total time=  13.0s
[CV] END ............algorithm=ball_tree, n_neighbors=7, p=1; total time=  13.0s
[CV] EN

HalvingGridSearchCV(cv=3, estimator=KNeighborsClassifier(),
                    param_grid=[{'algorithm': ['ball_tree', 'kd_tree'],
                                 'n_neighbors': [5, 7, 9], 'p': [1, 2]}],
                    scoring='accuracy', verbose=2)

In [None]:
# Display the score of the best candidate found by the search (scoring='accuracy').
knn_search.best_score_

0.9545121178784545

In [None]:
# Display the hyper-parameter combination that achieved the best score.
knn_search.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1}

In [7]:
# Fresh classifier configured with the hyper-parameters the search reported as
# best. They are hard-coded so this cell does not require the search to be re-run.
knn_clf = KNeighborsClassifier(
    n_neighbors=5,
    p=1,
    algorithm='ball_tree',
)

In [12]:
def shift_image(image, dx, dy):
    """Translate a flattened 28x28 image and return it flattened again.

    Parameters
    ----------
    image : numpy.ndarray
        Array that can be reshaped to (28, 28), e.g. a 784-element vector.
    dx : int or float
        Offset applied along the second (column) axis.
    dy : int or float
        Offset applied along the first (row) axis.

    Returns
    -------
    numpy.ndarray
        1-D array holding the shifted image; pixels shifted in from outside
        the original frame are filled with 0.
    """
    # scipy.ndimage.shift expects one offset per axis, row axis first.
    offsets = [dy, dx]
    moved = shift(image.reshape((28, 28)), offsets, mode="constant", cval=0)
    return moved.reshape([-1])

In [13]:
# Start from the untouched training images and labels...
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

# ...then append four shifted copies of every image: one pixel each way along
# the column axis (dx) and along the row axis (dy). Labels are unchanged by a
# shift, so each pass appends the labels in their original order.
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image in X_train:
        X_train_augmented.append(shift_image(image, dx, dy))
    y_train_augmented.extend(y_train)

# Convert the accumulated Python lists back into NumPy arrays.
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

In [14]:
# Shuffle images and labels with the same permutation so the pairs stay
# aligned. The generator is seeded so that a fresh "Restart & Run All"
# produces the same ordering every time.
shuffle_idx = np.random.default_rng(42).permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [15]:
# Train the tuned classifier on the augmented training set.
# NOTE(review): these features are built from the raw X_train, whereas the
# hyper-parameter search was fitted on X_train_scaled - confirm this is intended.
knn_clf.fit(X_train_augmented, y_train_augmented)

KNeighborsClassifier(algorithm='ball_tree', p=1)

In [17]:
# Predict labels for the held-out test images (passed as-is, without scaling).
y_pred = knn_clf.predict(X_test)


In [19]:
# Fraction of test images whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9687