## Import and Preprocess Data

In [None]:
import numpy as np
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
import statistics 

# Print the current wall-clock time (HH:MM:SS) before any data is loaded.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

# Method for reading in the "pickled" object images
def unpickle(file):
    """Load and return the pickled object stored in the file at path *file*.

    The file is opened in binary mode and decoded with ``encoding='bytes'``,
    which is why the batch dictionaries are indexed with bytes keys such as
    ``b'labels'`` and ``b'data'``.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    import pickle
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

def preprocess(x):
    """Min-max scale *x* so its values span the range [0, 1].

    Parameters:
        x: array-like of numbers (for example one image's pixel values).

    Returns:
        A floating-point NumPy array with the same shape as *x* where the
        smallest input value maps to 0 and the largest to 1.  When every
        value in *x* is identical there is no range to scale by, so an
        all-zero array is returned instead of dividing by zero.

    Raises:
        ValueError: if *x* is empty (raised by ``np.min``).
    """
    values = np.asarray(x)
    # Promote integer input to float before subtracting: the difference of
    # two narrow integers (e.g. int8) can overflow and wrap around.
    if not np.issubdtype(values.dtype, np.inexact):
        values = values.astype(np.float64)

    min_val = np.min(values)
    max_val = np.max(values)
    span = max_val - min_val
    if span == 0:
        # Constant input: 0/0 would yield NaN for every element.
        return np.zeros_like(values)
    return (values - min_val) / span

# Read in the datasets: 5 training batches and 1 test batch, each with 10,000 images
# Load the six pickled batch files from the working directory, in order;
# the sixth name (data_batch_6) holds the contents of 'test_batch'.
(
    data_batch_1,
    data_batch_2,
    data_batch_3,
    data_batch_4,
    data_batch_5,
    data_batch_6,
) = (
    unpickle(batch_file)
    for batch_file in (
        'data_batch_1',
        'data_batch_2',
        'data_batch_3',
        'data_batch_4',
        'data_batch_5',
        'test_batch',
    )
)

# Each data_batch is a dictionary with the following items
# b'batch_label' --> specifies which batch it is
# b'labels' --> array of 10,000 labels 0-9 corresponding to the correct classification
# b'data' --> 10,000 x 3072 array of uint8 pixels; each row is a 32x32 image with the first 1024 entries being the red,
#            the second 1024 entries being the green, and the last 1024 entries being the blue

#Read in the batch data and perform pre-processing
def _images_from_batch(batch):
    """Return a batch's b'data' rows as an (N, 32, 32, 3) image array.

    Each flat row is first split into three 32x32 channel planes
    (N, 3, 32, 32); the channel axis is then moved last.
    """
    flat_rows = batch[b'data']
    planes = flat_rows.reshape((len(flat_rows), 3, 32, 32))
    return planes.transpose(0, 2, 3, 1)


# Labels are taken as stored; images are reshaped to channel-last layout.
db1_labels = data_batch_1[b'labels']
db1_data = _images_from_batch(data_batch_1)
db2_labels = data_batch_2[b'labels']
db2_data = _images_from_batch(data_batch_2)
db3_labels = data_batch_3[b'labels']
db3_data = _images_from_batch(data_batch_3)
db4_labels = data_batch_4[b'labels']
db4_data = _images_from_batch(data_batch_4)
db5_labels = data_batch_5[b'labels']
db5_data = _images_from_batch(data_batch_5)
db6_labels = data_batch_6[b'labels']
db6_data = _images_from_batch(data_batch_6)


# Each new image has the form width x height x num_channels (RGB) = 32 x 32 x 3
# 10,000 images in each data segment so 10000 x 32 x 32 x 3

# Print the length of each of the four axes of the first batch, one per line:
# number of images, then the two 32-pixel image axes, then the channel count.
print(*db1_data.shape, sep="\n")

# print(len(db1_labels)) # col size

def _normalize_batch(images):
    """Return a float32 copy of *images* with preprocess() applied per image.

    A new array is allocated because the incoming batches hold integer pixel
    values (uint8 according to the dataset notes in this file); assigning the
    scaled floats back into such an array in place would truncate every
    pixel to 0 or 1.  float32 is used to keep the six batches' memory
    footprint down.
    """
    scaled = np.empty(images.shape, dtype=np.float32)
    # Iterate over however many images the batch actually contains rather
    # than assuming exactly 10,000.
    for idx, image in enumerate(images):
        scaled[idx] = preprocess(image)
    return scaled


# Rebind every batch to its normalised floating-point version.
db1_data = _normalize_batch(db1_data)
db2_data = _normalize_batch(db2_data)
db3_data = _normalize_batch(db3_data)
db4_data = _normalize_batch(db4_data)
db5_data = _normalize_batch(db5_data)
db6_data = _normalize_batch(db6_data)


# Print the current wall-clock time (HH:MM:SS) again once preprocessing is done.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)