## Import and Preprocess Data

In [None]:
import numpy as np
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
import statistics 

# Print the wall-clock time at which this cell starts running.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Current Time =", current_time)

# Method for reading in the "pickled" object images
def unpickle(file):
    """Load and return the object stored in the pickle file at path ``file``.

    The file is opened in binary mode and decoded with ``encoding='bytes'``.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    import pickle

    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Preprocessing- convert to greyscale
def rgb2gray(im):
    """Convert a batch of flattened RGB images to greyscale.

    Each row of ``im`` is one image laid out as 1024 red values, followed by
    1024 green values, followed by 1024 blue values. The three channels are
    combined with the weights 0.2989 (red), 0.5870 (green) and 0.1140 (blue).

    Parameters
    ----------
    im : 2-D array-like with at least 3072 columns
        One image per row. Columns past the first 3072 are ignored.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(number_of_rows, 1024)`` holding the
        greyscale intensity of every pixel.

    Raises
    ------
    ValueError
        If ``im`` is not 2-D or has fewer than 3072 columns.
    """
    im = np.asarray(im)
    if im.ndim != 2 or im.shape[1] < 3072:
        raise ValueError(
            "rgb2gray expects a 2-D array with at least 3072 columns, got shape "
            + str(im.shape)
        )

    red = im[:, 0:1024]
    green = im[:, 1024:2048]
    blue = im[:, 2048:3072]

    # One vectorized expression replaces the per-pixel Python loop. Multiplying
    # by a float promotes integer pixels to floating point before the sums, so
    # uint8 input cannot overflow.
    gray = 0.2989 * red + 0.5870 * green + 0.1140 * blue

    # Always hand back float64, regardless of the input dtype.
    return gray.astype(np.float64, copy=False)

# Each data_batch is a dictionary with the following items:
# b'batch_label' --> specifies which batch it is
# b'labels' --> array of 10,000 labels 0-9 corresponding to the correct classification
# b'data' --> 10,000 x 3072 array of uint8 pixels; each row is a 32x32 image with the first 1024 entries being the red,
#             the second 1024 entries being the green, and the last 1024 entries being the blue

# Split every batch dictionary into its label list and its raw pixel matrix.
# NOTE(review): data_batch_1..data_batch_5 and test_batch are not assigned in
# the visible part of this file; presumably they are loaded with unpickle()
# elsewhere -- confirm.
db1_labels, db1_data = data_batch_1[b'labels'], data_batch_1[b'data']
db2_labels, db2_data = data_batch_2[b'labels'], data_batch_2[b'data']
db3_labels, db3_data = data_batch_3[b'labels'], data_batch_3[b'data']
db4_labels, db4_data = data_batch_4[b'labels'], data_batch_4[b'data']
db5_labels, db5_data = data_batch_5[b'labels'], data_batch_5[b'data']
# The test batch is exposed under two names: db6_* (converted to greyscale
# below) and tb_* (left as the raw RGB matrix).
db6_labels, db6_data = test_batch[b'labels'], test_batch[b'data']
tb_labels, tb_data = test_batch[b'labels'], test_batch[b'data']


# Replace each RGB pixel matrix with its greyscale version.
db1_data = rgb2gray(db1_data)
db2_data = rgb2gray(db2_data)
db3_data = rgb2gray(db3_data)
db4_data = rgb2gray(db4_data)
db5_data = rgb2gray(db5_data)
db6_data = rgb2gray(db6_data)

# Print the wall-clock time once preprocessing has finished.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Current Time =", current_time)

## K-Nearest Neighbors

In [None]:
def k_nearest_neighbors(A, d, T, y, k):
    """Classify each row of ``T`` by majority vote among its ``k`` nearest rows of ``A``.

    Distances are Euclidean. Neighbours at equal distance are ranked by their
    row index in ``A`` (lower index first). The predicted label is
    ``statistics.mode`` of the neighbours' labels.

    Parameters
    ----------
    A : 2-D array-like
        Training samples, one per row.
    d : sequence
        Known label of each training row; ``d[j]`` belongs to ``A[j]``.
    T : 2-D array-like
        Test samples, one per row, with the same number of columns as ``A``.
    y : sequence
        True label of each test row.
    k : int
        Number of neighbours that vote.

    Returns
    -------
    list
        ``[error_rate, squared_error]`` where ``error_rate`` is the fraction of
        misclassified test rows and ``squared_error`` is the squared Euclidean
        norm of ``predicted_labels - y``.
    """
    # Work in float64 so that unsigned-integer inputs cannot wrap around when
    # the rows are subtracted.
    A = np.asarray(A, dtype=np.float64)
    T = np.asarray(T, dtype=np.float64)
    d = np.asarray(d).ravel()
    y = np.asarray(y).ravel()

    test_size = T.shape[0]
    labels = np.zeros(test_size)
    error_count = 0

    for i in range(test_size):
        # Distances are recomputed from scratch for every test row. Carrying
        # them over between rows would rank this row against distances that
        # belong to earlier test rows.
        distances = np.linalg.norm(A - T[i], axis=1)

        # A stable sort keeps the lower training index first among ties.
        nearest = np.argsort(distances, kind="stable")[:k]

        labels[i] = statistics.mode(d[nearest].tolist())

        if labels[i] != y[i]:
            error_count += 1

    # Calculate the errors and return them
    error_rate = error_count / len(y)
    squared_error = np.linalg.norm(labels - y) ** 2

    return [error_rate, squared_error]

## Testing

In [None]:
# Print the wall-clock time before the hand-written KNN run starts.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Current Time =", current_time)

# Train on all of batch 2 and evaluate on the first 5000 rows of batch 1,
# letting a single neighbour decide each label.
A, d = db2_data, db2_labels
T, y = db1_data[:5000, :], db1_labels[:5000]
k = 1

print("KNN Iteration 1")
error_rate1, squared_error1 = k_nearest_neighbors(A, d, T, y, k)

# The error rate is scaled by 100 before printing, i.e. shown as a percentage.
print(f"Error Rate: {round(error_rate1*100,3)!s}, Sqaured Error: {round(squared_error1,3)!s}")
print()

# Print the wall-clock time after the run has finished.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Current Time =", current_time)
print()
print()

## SKLearn KNN 

In [None]:
# Print the wall-clock time before the scikit-learn KNN run starts.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# Train on batches 2-5 plus the test batch; evaluate on batch 1.
A = np.vstack((db2_data, db3_data, db4_data, db5_data, db6_data))  # Training matrix
# The label lists are joined end-to-end, in the same order as the rows stacked
# into A, so that d[i] is the label of A[i]. Stacking them as columns and
# reshaping would interleave the batches and mislabel the training rows.
d = np.concatenate((db2_labels, db3_labels, db4_labels, db5_labels, db6_labels))  # Known classifiers
T = db1_data
y = db1_labels

train_size = A.shape[0]
test_size = T.shape[0]

# algorithm='kd_tree' is an alternative tree-based neighbour search here.
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', p=2)
knn.fit(A, d)

# Predict every test row in one call instead of one predict() per row.
# Labels are stored as floats so the printed values keep their "x.0" form.
labels = knn.predict(T).astype(float)
error_count = int(np.count_nonzero(labels != np.asarray(y)))

error_rate = error_count / test_size
squared_error = np.linalg.norm(labels - y)**2

# Show the first 20 predicted labels as a quick sanity check.
for label in labels[:20]:
    print(label)


# The error rate is scaled by 100 before printing, i.e. shown as a percentage.
print("Error Rate: " + str(round(error_rate*100,3)) + ", Sqaured Error: " + str(round(squared_error,3)))
print()

# Print the wall-clock time after the run has finished.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)
print()
print()