In [1]:
import gzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from collections import Counter

# Reading the dataset :

In [2]:
def read_images(filename):    
    with gzip.open(filename, 'r') as f:
        magic_number = int.from_bytes(f.read(4), 'big')
        image_count = int.from_bytes(f.read(4), 'big')
        row_count = int.from_bytes(f.read(4), 'big')
        column_count = int.from_bytes(f.read(4), 'big')
        image_data = f.read()
        images = np.frombuffer(image_data, dtype=np.uint8).reshape((image_count, row_count, column_count))
        
    return images


def read_labels(filename):
    with gzip.open(filename, 'r') as f:
        magic_number = int.from_bytes(f.read(4), 'big')
        label_count = int.from_bytes(f.read(4), 'big')
        label_data = f.read()
        labels = np.frombuffer(label_data, dtype=np.uint8)
        
    return labels


dataset_path = "D:/RKMVERI MSc BDA/SEM 2/ML/"

train_image_filename = ''.join([dataset_path, 'train-images-idx3-ubyte.gz'])
train_label_filename = ''.join([dataset_path, 'train-labels-idx1-ubyte.gz'])

test_image_filename = ''.join([dataset_path, 't10k-images-idx3-ubyte.gz'])
test_label_filename = ''.join([dataset_path, 't10k-labels-idx1-ubyte.gz'])

train_images = read_images(train_image_filename)
train_labels = read_labels(train_label_filename)

test_images = read_images(test_image_filename)
test_labels = read_labels(test_label_filename)

# Extracting 20% data from the training data as a validation set :

In [3]:
index = np.random.permutation(range(60000))
validation_index = index[0:12000]
training_index = index[12000:]
validation_images = train_images[validation_index]
validation_labels = train_labels[validation_index]
training_images = train_images[training_index]
training_labels = train_labels[training_index]

# Implement k-NN classification rule/algorithm

In [4]:
training_data = [i.flatten() for i in training_images]
validation_set = [i.flatten() for i in validation_images]

In [5]:
## Making a dataframe using the data in training set
training_df = pd.DataFrame(training_data)
training_df.columns = ["pixel"+str(i) for i in list(range(1,785))]
validation_df = pd.DataFrame(validation_set)
validation_df.columns = ["pixel"+str(i) for i in list(range(1,785))]

In [6]:
## Showing the training dataset with actual label column
df1 = training_df.copy()
training_df["labels"] = training_labels
display(df1)

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
##Instance---

def train(x_train, y_train):
    return

In [8]:
#Defining two kind of distances -  MANHATTAN & EUCLIDIAN & CHEBYSHEV'S

def manhattan_distance(x_train, x_validation):
    distances = []
    for i in range(len(x_train)):
        distance = np.sum(np.abs(x_validation- x_train[i]))
        distances.append([distance,i])
    distances = sorted(distances)
    return distances

def euclidian_distance(x_train, x_validation):
    distances = []
    for i in range(len(x_train)):
        distance = np.sqrt(np.sum(np.square(x_validation - x_train[i])))
        distances.append([distance,i])
    distances = sorted(distances)
    return distances

def chebyshev_distance(x_train, x_validation):
    distances = []
    for i in range(len(x_train)):
        distance = np.max(np.sum(np.abs(x_validation - x_train[i])))
        distances.append([distance,i])
    distances = sorted(distances)
    return distances

In [9]:
#Prediction functions using different distance functions

def predict_Manhattan(x_train, y_train, x_validation, k):
    distances = manhattan_distance(x_train, x_validation)
    label = []
    for i in range(k):
        index = distances[i][1]
        label.append(y_train[index])
    return Counter(label).most_common(1)[0][0]


def predict_Euclidian(x_train, y_train, x_validation, k):
    distances = euclidian_distance(x_train, x_validation)
    label = []
    for i in range(k):
        index = distances[i][1]
        label.append(y_train[index])
    return Counter(label).most_common(1)[0][0]

def predict_Chebyshev(x_train, y_train, x_validation, k):
    distances = manhattan_distance(x_train, x_validation)
    label = []
    for i in range(k):
        index = distances[i][1]
        label.append(y_train[index])
    return Counter(label).most_common(1)[0][0]

In [10]:
"""def output(distances,y_train,k):
    label = []
    for i in range(len(k)):
        index = distances[i][1]
        label.append(y_train[index])
    return Counter(label).most_common(1)[0][0]"""

'def output(distances,y_train,k):\n    label = []\n    for i in range(len(k)):\n        index = distances[i][1]\n        label.append(y_train[index])\n    return Counter(label).most_common(1)[0][0]'

In [11]:
## The main k-NN algorithms--- 

def kNN_Manhattan(x_train,y_train,x_validation, predictions, k):
        train(x_train, y_train)
        for i in range(len(x_validation)):
            predictions.append(predict_Manhattan(x_train, y_train, x_validation[i], k))
            
def kNN_Euclidian(x_train,y_train,x_validation, predictions, k):
        train(x_train, y_train)
        for i in range(len(x_validation)):
            predictions.append(predict_Manhattan(x_train, y_train, x_validation[i], k))
            
def kNN_Chebyshev(x_train,y_train,x_validation, predictions, k):
        train(x_train, y_train)
        for i in range(len(x_validation)):
            predictions.append(predict_Manhattan(x_train, y_train, x_validation[i], k))

In [12]:
'''## Prediction on the validation set---
validation_pred = []
kNN_Manhattan(training_data, training_labels, validation_set, validation_pred, 10)
predictions = np.asarray(validation_pred)'''

'## Prediction on the validation set---\nvalidation_pred = []\nkNN_Manhattan(training_data, training_labels, validation_set, validation_pred, 10)\npredictions = np.asarray(validation_pred)'

In [13]:
l = []
def accuracy(x,y):
    x1 = np.array(x)
    y1 = np.array(y)
    accuracy = (sum(x1==y1)/len(x))*100
    return accuracy       

In [15]:
k = range(1,26,2)
print(list(k))

[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25]


In [16]:
validation_pred = []
for i in range(len(k)):
    pred[i] = kNN_Manhattan(training_data,training_labels,validation_set, validation_pred, k[i])
    accuracy[i] = accuracy(training_labels,pred[i])

KeyboardInterrupt: 