In [None]:
"This demo shows how to build a Nearest Neighbor classifier"
"on the CIFAR-10 dataset."

# command to let jupyter plot things inline
%matplotlib inline

# Numpy is a python library for scientific computing
import numpy as np
# library allowing us to handle serialization in python
import cPickle as pickle
# library for interacting with the operating system (used here to build file paths)
import os
# library to generate plots; works much like MATLAB plotting, and is used through the "plt" alias
import matplotlib.pyplot as plt

In [None]:

# Directory that holds the extracted CIFAR-10 python batch files.
data_dir = 'cifar-10-batches-py'


def _load_batch(path):
    """Read one pickled CIFAR-10 batch file.

    Each file unpickles to a dictionary holding more entries than we
    need; only the 'data' and 'labels' entries are used.

    Parameters
    ----------
    path : str
        Path of the batch file to read.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The 'data' and 'labels' entries. Both are passed through
        np.array because nothing guarantees that the unpickled values
        are numpy arrays already.
    """
    # NOTE: unpickling can execute arbitrary code. Only load batch files
    # that come from a trusted source.
    with open(path, 'rb') as f:
        datadict = pickle.load(f)
    return np.array(datadict['data']), np.array(datadict['labels'])


xs = []
ys = []

# Load the training data: iterate over the five training batches
# (data_batch_1 ... data_batch_5) and collect them.
for batch in range(1, 6):
    filename = os.path.join(data_dir, 'data_batch_%d' % batch)
    X, Y = _load_batch(filename)
    xs.append(X)
    ys.append(Y)

# Concatenate once, after every batch has been read. Doing this inside
# the loop would rebuild (and re-cast) the growing training set on each
# iteration only to throw the intermediate results away.
# astype(np.float64) replaces np.float_, which newer NumPy no longer has.
Xtr = np.concatenate(xs).astype(np.float64)
Ytr = np.concatenate(ys)

# now load the test data
filename = os.path.join(data_dir, 'test_batch')
X, Y = _load_batch(filename)
Xte = X.astype(np.float64)
Yte = Y
    

In [None]:
# Implementation by Andrej Karpathy et al.
# We will now implement cross validation: the training set is split into
# num_folds folds; each fold is held out once as validation data while the
# remaining folds are used for training, and every k in k_choices is scored
# on the held-out fold.
import KNN

# Initialize the algorithm
algo = KNN.KNN()

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# split the training data into folds.
X_train_folds = np.array_split(Xtr, num_folds)
y_train_folds = np.array_split(Ytr, num_folds)

# dictionary holding all the accuracies for the different values of k
# that we will try out: k -> list with one accuracy per fold, in fold order.
k_to_accuracies = {k: [] for k in k_choices}

# The fold loop is the outer loop on purpose: assembling the training
# split, training, and computing the distance matrix do not depend on k,
# so they are done once per fold instead of once per (k, fold) pair.
for fold in range(num_folds):
    print('evaluating fold No. %d' % (fold + 1))

    # every fold except the held-out one forms the training split
    x_train_cv = np.vstack(X_train_folds[0:fold] + X_train_folds[fold + 1:])
    y_train_cv = np.hstack(y_train_folds[0:fold] + y_train_folds[fold + 1:])

    # the held-out fold is the validation split
    x_test_cv = X_train_folds[fold]
    y_test_cv = y_train_folds[fold]
    num_test = x_test_cv.shape[0]

    # train the algorithm
    algo.train(x_train_cv, y_train_cv)

    # compute the distance matrix (L=1 presumably selects the L1
    # distance -- confirm against KNN.py)
    dists = algo.compute_distances(x_test_cv, L=1)

    # NOTE(review): reusing dists for every k assumes predict_labels does
    # not modify the matrix it is given -- confirm against KNN.py.
    for k in k_choices:
        print('evaluating k=%d' % k)

        # find the predicted labels
        y_pred = algo.predict_labels(dists, k)

        num_correct = np.sum(y_pred == y_test_cv)
        accuracy = float(num_correct) / num_test
        print('accuracy: %f' % accuracy)
        k_to_accuracies[k].append(accuracy)