In [1]:
"""
Do not change the input and output format.
If our script cannot run your code or the format is improper, your code will not be graded.

The only functions you need to implement in this template is compute_distances, predict_labels, compute_accuracy
and find_best_k.
"""

import numpy as np
import json

###### Q5.1 ######
def compute_distances(Xtrain, X):
    """"
    Compute the distance between each test point in X and each training point
    in Xtrain.
    Inputs:
    - Xtrain: A numpy array of shape (num_train, D) containing training data
    - X: A numpy array of shape (num_test, D) containing test data.
    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    #####################################################
    #
    # initialize array
    dists = np.zeros([X.shape[0], Xtrain.shape[0]])
    
    # calculate distance as l2 norm
    for i in range(X.shape[0]):
        for j in range(Xtrain.shape[0]):
            dists[i,j] = np.linalg.norm(X[i,:] - Xtrain[j,:],2)
    ##################################################### 
    return dists

###### Q5.2 ######
def predict_labels(k, ytrain, dists):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.
    Inputs:
    - k: The number of nearest neighbors used for prediction.
    - ytrain: A numpy array of shape (num_train,) where ytrain[i] is the label
      of the ith training point.
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance betwen the ith test point and the jth training point.
    Returns:
    - ypred: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i]. 
    """
    #####################################################
    #
    # number of class
    c = 10
    
    # initialize array
    vc = np.zeros([dists.shape[0], c], dtype=int) # voting result for each data
    ypred = -np.ones([dists.shape[0]], dtype=int)
    
    # index of distance from the smallest to kth smallest for each data
    # array(num_train, k)
    knnind = np.argpartition(dists, k, axis = 1)[:,:k]
    
    # check for all num_train 
    for i in range(dists.shape[0]):
        # extract k-nearest neighbor in ith train data
        nny = ytrain[knnind[i,:]]
        
        # for all number of class
        for j in range(c):
            vc[i,j] = np.sum(nny == j)
        
        # argmax return the label with the smallest index
        ypred[i] = vc[i,:].argmax()
    #####################################################
    return ypred

###### Q5.3 ######
def compute_accuracy(y, ypred):
    """
    Compute the accuracy of prediction based on the true labels.
    Inputs:
    - y: A numpy array with of shape (num_test,) where y[i] is the true label
      of the ith test point.
    - ypred: A numpy array with of shape (num_test,) where ypred[i] is the 
      prediction of the ith test point.
    Returns:
    - acc: The accuracy of prediction (scalar).
    """
    #####################################################
    acc = sum(y == ypred)/len(y)
    #####################################################
    return acc

###### Q5.4 ######
def find_best_k(K, ytrain, dists, yval):
    """
    Find best k according to validation accuracy.
    Inputs:
    - K: A list of ks.
    - ytrain: A numpy array of shape (num_train,) where ytrain[i] is the label
      of the ith training point.
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    - yval: A numpy array with of shape (num_val,) where y[i] is the true label
      of the ith validation point.
    Returns:
    - best_k: The k with the highest validation accuracy.
    - validation_accuracy: A list of accuracies of different ks in K.
    """
    
    #####################################################
    #
    # initialize array
    all_acc = np.zeros([len(K)])
    all_ypred = -np.ones([len(K), len(yval)])
    i = 0
    
    for k in K:
        all_ypred[i,:] = predict_labels(k, ytrain, dists)
        all_acc[i] = compute_accuracy(yval, all_ypred[i,:])
        i += 1
    best_k = K[np.argmax(all_acc)]
    validation_accuracy = all_acc.tolist()    
    #####################################################
    return best_k, validation_accuracy


"""
NO MODIFICATIONS below this line.
You should only write your code in the above functions.
"""

def data_processing(data):
    train_set, valid_set, test_set = data['train'], data['valid'], data['test']
    Xtrain = train_set[0]
    ytrain = train_set[1]
    Xval = valid_set[0]
    yval = valid_set[1]
    Xtest = test_set[0]
    ytest = test_set[1]
    
    Xtrain = np.array(Xtrain)
    Xval = np.array(Xval)
    Xtest = np.array(Xtest)
    
    ytrain = np.array(ytrain)
    yval = np.array(yval)
    ytest = np.array(ytest)
    
    return Xtrain, ytrain, Xval, yval, Xtest, ytest
    
def main():
    input_file = 'mnist_subset.json'
    output_file = 'knn_output.txt'

    with open(input_file) as json_data:
        data = json.load(json_data)
    
    #==================Compute distance matrix=======================
    K=[1, 3, 5, 7, 9]    
    
    Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing(data)
    
    dists = compute_distances(Xtrain, Xval)
    
    #===============Compute validation accuracy when k=5=============
    k = 5
    ypred = predict_labels(k, ytrain, dists)
    acc = compute_accuracy(yval, ypred)
    print("The validation accuracy is", acc, "when k =", k)
    
    #==========select the best k by using validation set==============
    best_k,validation_accuracy = find_best_k(K, ytrain, dists, yval)

    
    #===============test the performance with your best k=============
    dists = compute_distances(Xtrain, Xtest)
    ypred = predict_labels(best_k, ytrain, dists)
    test_accuracy = compute_accuracy(ytest, ypred)
    
    #====================write your results to file===================
    f=open(output_file, 'w')
    for i in range(len(K)):
        f.write('%d %.3f' % (K[i], validation_accuracy[i])+'\n')
    f.write('%s %.3f' % ('test', test_accuracy))
    f.close()
    
if __name__ == "__main__":
    main()


The validation accuracy is 0.939 when k = 5


In [2]:
	input_file = 'mnist_subset.json'
	output_file = 'knn_output.txt'

	with open(input_file) as json_data:
		data = json.load(json_data)
	
	#==================Compute distance matrix=======================
	K=[1, 3, 5, 7, 9]	
	
	Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing(data)

In [3]:
Xtrain.max

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
Xtrain.shape[0]

5000

In [12]:
Xtrain[0,:]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [13]:
np.linalg.norm([3,4],2)

5.0

In [14]:
np.linalg.norm(Xtrain[0,:] - Xtest[0,:],2)

9.912333933724785

In [56]:
X = Xtest.copy()
dists = np.zeros([X.shape[0], Xtrain.shape[0]])

for i in range(X.shape[0]):
    for j in range(Xtrain.shape[0]):
        dists[i,j] = np.linalg.norm(X[i,:] - Xtrain[j,:],2)


In [61]:
dists

array([[ 9.91233393, 10.29929019, 10.68818172, ..., 12.80331941,
         8.72943314, 10.37099002],
       [10.49087055, 11.80837537, 11.65408461, ..., 10.06020011,
        10.96881608, 10.47685889],
       [10.05031592, 11.73988849, 11.77566323, ..., 13.34624377,
        10.89469162, 11.0938449 ],
       ...,
       [11.22512985, 11.17216728, 11.78412041, ..., 12.77755811,
        11.10235142,  8.38955017],
       [ 9.64048651, 10.26215442, 12.267522  , ..., 12.95168953,
        10.67060025, 10.17638254],
       [ 9.10985071,  8.91970099, 10.70961468, ..., 13.41502253,
        10.28604116, 10.39319312]])

In [59]:
dists.shape

(1000, 5000)

In [52]:
Xtest.shape

(1000, 784)

In [76]:
k = 3
a = np.arange(10)
np.random.shuffle(a)
print(a)
ind = np.argpartition(a, k)[:k]
ind = ind[np.argsort(a[ind])]
print(ind)

[9 5 4 7 6 3 0 8 2 1]
[6 9 8]


In [108]:
k = 3
a = np.arange(10)
np.random.shuffle(a)
b = a.reshape(2,5)
print(b)
ind = np.argpartition(b, k, axis = 1)[:,:k]
print(ind[0])


[[4 0 9 8 2]
 [1 3 6 7 5]]
[1 4 0]


In [104]:
ytrain

array([7, 6, 4, ..., 0, 9, 8])

In [106]:
min(ytrain)

0

In [107]:
dists.shape[0]

1000

In [119]:
    c = 10 # number of class
    #vc = np.zeros(dists.shape[0],c)
    
    # index of distance from smallest to kth 
    knnind = np.argpartition(dists, k, axis = 1)[:,:k]
    i =0
    #for i in range(dists.shape[0]):
    nny = ytrain[knnind[i,:]]
    vc = np.sum(nny == 8)

In [118]:
nny

array([8, 9, 8])

In [121]:
dists

array([[ 9.91233393, 10.29929019, 10.68818172, ..., 12.80331941,
         8.72943314, 10.37099002],
       [10.49087055, 11.80837537, 11.65408461, ..., 10.06020011,
        10.96881608, 10.47685889],
       [10.05031592, 11.73988849, 11.77566323, ..., 13.34624377,
        10.89469162, 11.0938449 ],
       ...,
       [11.22512985, 11.17216728, 11.78412041, ..., 12.77755811,
        11.10235142,  8.38955017],
       [ 9.64048651, 10.26215442, 12.267522  , ..., 12.95168953,
        10.67060025, 10.17638254],
       [ 9.10985071,  8.91970099, 10.70961468, ..., 13.41502253,
        10.28604116, 10.39319312]])

In [135]:
    c = 10 # number of class
    vc = np.zeros([dists.shape[0], c], dtype=int)
    ypred = -np.ones([dists.shape[0]], dtype=int)
    
    # index of distance from smallest to kth 
    knnind = np.argpartition(dists, k, axis = 1)[:,:k]
    for i in range(dists.shape[0]):
        nny = ytrain[knnind[i,:]]
        for j in range(c):
            vc[i,j] = np.sum(nny ==j)
        ypred[i] = vc[i,:].argmax()
        
        

In [136]:
ypred

array([8, 0, 3, 9, 3, 3, 3, 6, 9, 3, 0, 0, 3, 9, 4, 5, 0, 3, 8, 8, 1, 3,
       8, 1, 0, 4, 9, 0, 8, 0, 1, 4, 8, 7, 8, 2, 1, 3, 9, 5, 9, 4, 3, 6,
       0, 6, 7, 4, 3, 3, 3, 6, 3, 5, 3, 6, 3, 3, 0, 8, 7, 9, 6, 0, 6, 1,
       8, 9, 1, 1, 5, 6, 2, 1, 9, 5, 7, 4, 7, 7, 0, 2, 1, 4, 1, 4, 4, 9,
       0, 1, 8, 3, 9, 8, 7, 1, 1, 8, 1, 9, 8, 6, 2, 5, 8, 1, 3, 9, 8, 1,
       1, 5, 6, 3, 6, 4, 8, 3, 1, 6, 8, 7, 5, 5, 2, 1, 0, 8, 6, 7, 4, 3,
       9, 3, 1, 3, 2, 6, 8, 3, 4, 9, 3, 0, 2, 9, 9, 5, 6, 6, 7, 4, 4, 6,
       8, 1, 1, 7, 0, 1, 1, 0, 7, 2, 0, 0, 6, 5, 2, 3, 4, 9, 4, 6, 7, 4,
       4, 5, 6, 7, 0, 2, 1, 7, 4, 4, 6, 7, 4, 4, 7, 0, 4, 9, 5, 7, 1, 9,
       3, 6, 4, 6, 1, 5, 5, 6, 1, 7, 9, 6, 5, 5, 1, 1, 2, 8, 5, 6, 2, 1,
       0, 6, 0, 7, 6, 0, 6, 6, 3, 0, 7, 5, 9, 9, 2, 9, 1, 5, 5, 2, 1, 4,
       9, 9, 6, 3, 6, 3, 0, 7, 0, 7, 5, 5, 7, 7, 9, 5, 6, 8, 9, 0, 5, 0,
       4, 1, 5, 7, 9, 5, 1, 6, 3, 1, 8, 1, 9, 2, 9, 1, 8, 7, 1, 0, 4, 9,
       9, 1, 3, 4, 1, 6, 3, 2, 3, 8, 3, 5, 2, 7, 7,

In [139]:
a = np.array([1,1,1,0])
b = np.array([0,1,0,1])
sum(a == b)/len(a)

0.25

In [158]:
    all_acc = np.zeros([len(K)])
    all_ypred = -np.ones([len(K), len(yval)])
    i = 0
    for k in K:
        all_ypred[i,:] = predict_labels(k, ytrain, dists)
        all_acc[i] = compute_accuracy(yval, all_ypred[i,:])
        i += 1
    best_k = K[np.argmin(all_acc)]
    validation_accuracy = all_acc.tolist()

In [159]:
all_acc

array([0.265, 0.262, 0.26 , 0.261, 0.262])