In [1]:
import numpy as np
import json

###### Q5.1 ######
def compute_distances(Xtrain, X):
    """
    Compute the Euclidean distance between each test point in X and each
    training point in Xtrain.
    Inputs:
    - Xtrain: An array-like of shape (num_train, D) containing training data.
    - X: An array-like of shape (num_test, D) containing test data.
    Returns:
    - dists: A numpy float array of shape (num_test, num_train) where
      dists[i, j] is the Euclidean distance between the ith test point and
      the jth training point.
    Raises:
    - ValueError: if either input is not 2-D or the feature dimensions differ.
    """
    # Work in float64 so that unsigned-integer inputs (e.g. uint8 pixels)
    # cannot wrap around when two points are subtracted.
    Xtrain = np.asarray(Xtrain, dtype=np.float64)
    X = np.asarray(X, dtype=np.float64)
    if Xtrain.ndim != 2 or X.ndim != 2:
        raise ValueError("Xtrain and X must both be 2-D arrays")
    if Xtrain.shape[1] != X.shape[1]:
        raise ValueError(
            "feature dimension mismatch: Xtrain has %d columns, X has %d"
            % (Xtrain.shape[1], X.shape[1])
        )

    dists = np.zeros((X.shape[0], Xtrain.shape[0]))
    # One vectorised pass over all training points per test point; this keeps
    # the temporary at (num_train, D) instead of (num_test, num_train, D).
    for i in range(X.shape[0]):
        diff = Xtrain - X[i]
        dists[i, :] = np.sqrt(np.sum(diff * diff, axis=1))
    return dists

###### Q5.2 ######
def predict_labels(k, ytrain, dists):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point by majority vote among its k nearest
    training points.
    Inputs:
    - k: The number of nearest neighbors used for prediction
      (1 <= k <= num_train).
    - ytrain: A numpy array of shape (num_train,) where ytrain[i] is the label
      of the ith training point.
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance between the ith test point and the jth training point.
    Returns:
    - ypred: A numpy array of shape (num_test,) containing predicted labels for
      the test data, where ypred[i] is the predicted label for the ith test
      point. Vote ties are resolved in favour of the smallest label.
    Raises:
    - ValueError: if dists is not 2-D, if ytrain does not have one label per
      training point, or if k is outside [1, num_train].
    """
    ytrain = np.asarray(ytrain)
    dists = np.asarray(dists)
    if dists.ndim != 2:
        raise ValueError("dists must be a 2-D array of shape (num_test, num_train)")
    num_test, num_train = dists.shape
    if ytrain.shape != (num_train,):
        raise ValueError("ytrain must hold exactly one label per column of dists")
    if not 1 <= k <= num_train:
        raise ValueError("k must satisfy 1 <= k <= num_train (got k=%r)" % (k,))

    # Derive the label set from the data instead of assuming ten classes;
    # `codes` maps every training label to its index in the sorted `classes`.
    classes, codes = np.unique(ytrain, return_inverse=True)

    # Partitioning around position k-1 places the k smallest distances in the
    # first k columns and, unlike kth=k, stays in bounds when k == num_train.
    knn_idx = np.argpartition(dists, k - 1, axis=1)[:, :k]

    ypred = np.empty(num_test, dtype=classes.dtype)
    for i, neighbor_codes in enumerate(codes[knn_idx]):
        votes = np.bincount(neighbor_codes, minlength=classes.size)
        # argmax returns the first maximum, i.e. the smallest label on a tie.
        ypred[i] = classes[votes.argmax()]
    return ypred

###### Q5.3 ######
def compute_accuracy(y, ypred):
    """
    Compute the accuracy of prediction based on the true labels.
    Inputs:
    - y: An array-like of shape (num_test,) where y[i] is the true label
      of the ith test point.
    - ypred: An array-like of shape (num_test,) where ypred[i] is the
      prediction of the ith test point.
    Returns:
    - acc: The fraction of positions where y and ypred agree (float scalar).
    Raises:
    - ValueError: if the inputs differ in shape or are empty.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    y = np.asarray(y)
    ypred = np.asarray(ypred)
    if y.shape != ypred.shape:
        raise ValueError(
            "y and ypred must have the same shape, got %s and %s"
            % (y.shape, ypred.shape)
        )
    if y.size == 0:
        raise ValueError("cannot compute accuracy of an empty label array")
    acc = float(np.mean(y == ypred))
    return acc

###### Q5.4 ######
def find_best_k(K, ytrain, dists, yval):
    """
    Find best k according to validation accuracy.
    Inputs:
    - K: A non-empty list of candidate ks.
    - ytrain: A numpy array of shape (num_train,) where ytrain[i] is the label
      of the ith training point.
    - dists: A numpy array of shape (num_val, num_train) where dists[i, j]
      is the Euclidean distance between the ith validation point and the jth
      training point.
    - yval: A numpy array of shape (num_val,) where yval[i] is the true label
      of the ith validation point.
    Returns:
    - best_k: The k with the highest validation accuracy (the earliest such k
      in K when several tie).
    - validation_accuracy: A list of accuracies, one per k in K, in K's order.
    Raises:
    - ValueError: if K is empty.
    """
    if len(K) == 0:
        raise ValueError("K must contain at least one candidate k")

    # Score every candidate against the validation labels passed in as
    # `yval`; the function must not depend on any notebook-level global.
    validation_accuracy = [
        float(compute_accuracy(yval, predict_labels(k, ytrain, dists)))
        for k in K
    ]
    # np.argmax returns the first maximum, so ties keep the earliest k.
    best_k = K[int(np.argmax(validation_accuracy))]
    return best_k, validation_accuracy


"""
NO MODIFICATIONS below this line.
You should only write your code in the above functions.
"""

def data_processing(data):
    """
    Split the loaded dataset into feature and label arrays.
    Inputs:
    - data: A mapping with keys 'train', 'valid' and 'test'; each value is a
      sequence whose element 0 holds the features and element 1 the labels.
    Returns:
    - Xtrain, ytrain, Xval, yval, Xtest, ytest: numpy arrays built from the
      corresponding entries, in that order.
    """
    # Look up all three splits first, then pull out features and labels.
    splits = [data[name] for name in ('train', 'valid', 'test')]
    raw_pairs = [(split[0], split[1]) for split in splits]

    # Convert the features of every split, then the labels of every split.
    features = [np.array(x) for x, _ in raw_pairs]
    labels = [np.array(y) for _, y in raw_pairs]

    (Xtrain, Xval, Xtest), (ytrain, yval, ytest) = features, labels
    return Xtrain, ytrain, Xval, yval, Xtest, ytest
    
def main():
    """
    Run the k-NN experiment end to end.

    Loads 'mnist_subset.json', reports the validation accuracy for k = 5,
    selects the best k among K on the validation split, evaluates that k on
    the test split, and writes one '<k> <accuracy>' line per candidate k plus
    a final 'test <accuracy>' line to 'knn_output.txt'.
    """
    input_file = 'mnist_subset.json'
    output_file = 'knn_output.txt'

    with open(input_file) as json_data:
        data = json.load(json_data)
    
    #==================Compute distance matrix=======================
    K=[1, 3, 5, 7, 9]    
    
    Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing(data)
    
    dists = compute_distances(Xtrain, Xval)
    
    #===============Compute validation accuracy when k=5=============
    k = 5
    ypred = predict_labels(k, ytrain, dists)
    acc = compute_accuracy(yval, ypred)
    print("The validation accuracy is", acc, "when k =", k)
    
    #==========select the best k by using validation set==============
    best_k,validation_accuracy = find_best_k(K, ytrain, dists, yval)

    
    #===============test the performance with your best k=============
    dists = compute_distances(Xtrain, Xtest)
    ypred = predict_labels(best_k, ytrain, dists)
    test_accuracy = compute_accuracy(ytest, ypred)
    
    #====================write your results to file===================
    # The context manager closes the file even if a write raises.
    with open(output_file, 'w') as f:
        for i in range(len(K)):
            f.write('%d %.3f' % (K[i], validation_accuracy[i])+'\n')
        f.write('%s %.3f' % ('test', test_accuracy))
    
#if __name__ == "__main__":
#    main()


In [2]:
from sklearn.neighbors import KNeighborsClassifier

# Notebook-level configuration: dataset path, results path, candidate ks.
input_file = 'mnist_subset.json'
output_file = 'knn_output.txt'
K = [1, 3, 5, 7, 9]

# Load the dataset once so the comparison cells below can share it.
with open(input_file) as json_data:
    data = json.load(json_data)

# Unpack the train / validation / test splits into numpy arrays.
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing(data)

In [5]:
# Reference check against scikit-learn: k = 1, Euclidean metric.
skchek1 = KNeighborsClassifier(metric='euclidean', n_neighbors=1).fit(Xtrain, ytrain)
print(skchek1.score(Xval, yval))    # score on the validation split
print(skchek1.score(Xtest, ytest))  # score on the test split

0.943
0.932


In [6]:
# Reference check against scikit-learn: k = 3, Euclidean metric.
skchek3 = KNeighborsClassifier(metric='euclidean', n_neighbors=3).fit(Xtrain, ytrain)
print(skchek3.score(Xval, yval))    # score on the validation split
print(skchek3.score(Xtest, ytest))  # score on the test split

0.938
0.932


In [4]:
# Reference check against scikit-learn: k = 5, Euclidean metric.
skchek5 = KNeighborsClassifier(metric='euclidean', n_neighbors=5).fit(Xtrain, ytrain)
print(skchek5.score(Xval, yval))    # score on the validation split
print(skchek5.score(Xtest, ytest))  # score on the test split

0.939
0.927


In [7]:
# Reference check against scikit-learn: k = 7, Euclidean metric.
skchek7 = KNeighborsClassifier(metric='euclidean', n_neighbors=7).fit(Xtrain, ytrain)
print(skchek7.score(Xval, yval))    # score on the validation split
print(skchek7.score(Xtest, ytest))  # score on the test split

0.928
0.927


In [8]:
# Reference check against scikit-learn: k = 9, Euclidean metric.
skchek9 = KNeighborsClassifier(metric='euclidean', n_neighbors=9).fit(Xtrain, ytrain)
print(skchek9.score(Xval, yval))    # score on the validation split
print(skchek9.score(Xtest, ytest))  # score on the test split

0.927
0.928
