### Imports


In [None]:
from __future__ import division
import numpy as np
import pandas as pd
import struct, gzip
import matplotlib.pyplot as plt
import os

# Sets KMP_DUPLICATE_LIB_OK for this process and any child processes.
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime
# library" abort seen when several packages bundle their own copy of the
# runtime — confirm it is still needed in this environment.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"


In [None]:
def read_idx(filename):
    """Read a gzip-compressed IDX file into a NumPy array.

    The header is two zero bytes, one byte holding the element type code,
    one byte holding the number of dimensions, followed by one big-endian
    unsigned 32-bit size per dimension. The remainder of the file is the
    array payload.

    Args:
        filename: Path to the ``.gz`` file.

    Returns:
        A read-only ``numpy.ndarray`` (it is a view on the bytes that were
        read) with the shape given in the header and the dtype selected by
        the header's type code.

    Raises:
        ValueError: If the first two bytes are not zero or the type code is
            not one of the known IDX codes.
    """
    # IDX type code -> NumPy dtype; multi-byte types are stored big-endian.
    idx_dtypes = {
        0x08: np.dtype(np.uint8),
        0x09: np.dtype(np.int8),
        0x0B: np.dtype(">i2"),
        0x0C: np.dtype(">i4"),
        0x0D: np.dtype(">f4"),
        0x0E: np.dtype(">f8"),
    }

    with gzip.open(filename, "rb") as f:
        zero, data_type, dims = struct.unpack(">HBB", f.read(4))
        if zero != 0 or data_type not in idx_dtypes:
            raise ValueError(
                f"{filename!r} is not a valid IDX file "
                f"(magic prefix {zero:#06x}, type code {data_type:#04x})"
            )
        # All dimension sizes are consecutive, so read them in one call.
        shape = struct.unpack(f">{dims}I", f.read(4 * dims))
        return np.frombuffer(f.read(), dtype=idx_dtypes[data_type]).reshape(shape)


### Analyzing the data


In [None]:
# Training set: read images and labels, then flatten every 28x28 image
# into a single row of 784 values.
raw_train_images = read_idx(r"train-images-idx3-ubyte.gz")
train_data = raw_train_images.reshape((60000, 28 * 28))
train_labels = read_idx(r"train-labels-idx1-ubyte.gz")

# Test set: same treatment.
raw_test_images = read_idx(r"t10k-images-idx3-ubyte.gz")
test_data = raw_test_images.reshape((10000, 28 * 28))
test_labels = read_idx(r"t10k-labels-idx1-ubyte.gz")

# Sample counts used by the later cells.
no_test = 10000
split = 60000  # Train/Test split at location 60000

X_train = train_data
y_train = train_labels
X_test = test_data
y_test = test_labels

# Dataframes for train and test data (one row per image)
df_train = pd.DataFrame(data=X_train)
df_test = pd.DataFrame(data=X_test)


In [None]:
def distance(x, y):
    """Return the Euclidean distance between two equal-length vectors.

    Both inputs are converted to float64 before subtracting. Without the
    conversion, unsigned 8-bit inputs would be subtracted and squared in
    uint8 arithmetic, which wraps modulo 256 and gives wrong distances
    (e.g. a difference of 200 would come out as 8).

    Args:
        x: 1-D array-like (NumPy array, pandas Series, list, ...).
        y: 1-D array-like with the same length as ``x``.

    Returns:
        The distance as a NumPy float64 scalar. Elements are paired by
        position.
    """
    diff = np.asarray(x, dtype=np.float64) - np.asarray(y, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))


# Neighbourhood sizes (k) to evaluate; one list of predictions and one
# accuracy figure is produced for each entry, in this order.
k_values = [1, 3, 5, 10, 20, 30, 40, 50, 60]


def set_predictions(df_t, test, train_predictions):
    """Predict a label for every row of ``df_t`` with k-nearest-neighbour voting.

    For each query row, the squared Euclidean distance to every row of the
    module-level ``df_train`` is computed, the nearest rows are ranked, and
    for each entry of the module-level ``k_values`` the most frequent label
    among the k nearest rows is taken (ties go to the label that appears
    first in nearest-first order).

    Args:
        df_t: Query samples, one per row (DataFrame or 2-D array-like) with
            the same number of columns as ``df_train``.
        test: Selects the label source for the votes. When false, labels
            come from the module-level ``y_train``. When true, labels come
            from ``train_predictions[K]`` for the K-th entry of ``k_values``.
        train_predictions: One sequence of per-training-row labels for each
            entry of ``k_values``. Only read when ``test`` is true; if it is
            empty, ``y_train`` is used instead.

    Returns:
        A list with one inner list per entry of ``k_values``; inner list K
        holds one predicted label (plain Python value) per row of ``df_t``.

    Raises:
        ValueError: If ``df_train`` has no rows.
    """
    # float64 avoids the modulo-256 wrap-around that uint8 pixel data would
    # suffer when differences are squared.
    train = np.asarray(df_train, dtype=np.float64)
    queries = np.asarray(df_t, dtype=np.float64)
    if train.shape[0] == 0:
        raise ValueError("df_train is empty; no neighbours to vote with")

    # NOTE(review): in test mode the votes use the training-set predictions
    # for the same K rather than the true labels, as the original code did —
    # confirm this is intended and not meant to be y_train.
    if test and train_predictions:
        labels_per_k = [
            np.asarray(train_predictions[K]) for K in range(len(k_values))
        ]
    else:
        labels_per_k = [np.asarray(y_train)] * len(k_values)

    predictions = [[] for _ in k_values]
    max_k = max(k_values, default=0)

    # |t - q|^2 = |t|^2 - 2 t.q + |q|^2 ; |t|^2 is the same for every query.
    train_sq = np.einsum("ij,ij->i", train, train)

    for query in queries:
        sq_dist = train_sq - 2.0 * (train @ query) + query @ query
        # The square root is monotonic, so ranking squared distances gives
        # the same neighbours; a stable sort keeps ties in row order.
        nearest = np.argsort(sq_dist, kind="stable")[:max_k]

        for K, k_value in enumerate(k_values):
            votes = labels_per_k[K][nearest[:k_value]].tolist()
            predictions[K].append(max(votes, key=votes.count))

    return predictions


In [None]:
# Predict on the training set first, then feed those predictions into the
# test-set run.
train_predictions = set_predictions(df_train, False, [])
pred_lists = set_predictions(df_test, True, train_predictions)

prediction_result = []
expected_labels = y_test.tolist()

# One accuracy figure per K: the number of matching labels over no_test.
for K, k_value in enumerate(k_values):
    prediction = sum(
        1 for got, want in zip(pred_lists[K], expected_labels) if got == want
    )

    accuracy = prediction / no_test
    prediction_result.append(round(accuracy * 100, 2))
    print(f"The accuracy is {str(accuracy * 100)}% for K={str(k_value)}")

# Leave the hit counter cleared, as it is after each pass.
prediction = 0

### Result plots


In [None]:
# Tabulate accuracy (in percent) against K: one row per entry of k_values.
df_result = pd.DataFrame({"K value": k_values, "pred": prediction_result})

# Plot the per-K accuracies. The scalar `accuracy` left over from the
# evaluation loop only holds the last K's value, so it cannot be plotted
# against all of k_values; `prediction_result` has one value per K.
plt.plot(df_result["K value"], df_result["pred"], "X-", color="g", label="pred")
plt.xlabel("K value")
plt.ylabel("Accuracy (%)")
plt.xticks(k_values)
plt.legend()
plt.show()
