# Confirm provided solution

In [1]:
import numpy as np
import json
from itertools import groupby
#===========================Q4.1=====================================
def compute_distances(Xtrain,X):
    """Return the Euclidean distance from every row of X to every row of Xtrain.

    Parameters
    ----------
    Xtrain : numpy array of shape (num_train, D)
        One training sample per row.
    X : numpy array of shape (num_test, D)
        One query sample per row.

    Returns
    -------
    dists : numpy array of shape (num_test, num_train)
        dists[i, j] is the L2 distance between X[i] and Xtrain[j].
    """
    # Vectorised expansion: |a - b|^2 = |a|^2 + |b|^2 - 2 * (a . b)
    test_sq = np.sum(X**2, axis=1)[:, np.newaxis]            # column, (num_test, 1)
    train_sq = np.sum(Xtrain**2, axis=1)[np.newaxis, :]      # row, (1, num_train)
    cross = np.dot(X, np.transpose(Xtrain))                  # (num_test, num_train)
    sq_dists = test_sq + train_sq - 2*cross

    # Floating-point cancellation can leave a tiny negative value where the
    # true squared distance is zero (identical or near-identical points);
    # clamp so np.sqrt never produces NaN.
    return np.sqrt(np.maximum(sq_dists, 0))

#==========================Q4.2======================================
def predict_labels(k,ytrain,dists):
    """Predict a label for each query row by majority vote of its k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote.
    ytrain : sequence of labels, indexed like the columns of ``dists``.
    dists : numpy array of shape (num_test, num_train)
        dists[i, j] is the distance from query i to training sample j.

    Returns
    -------
    ypred : numpy float array of shape (num_test,)
        The most frequent label among the k nearest neighbours of each
        query; when several labels tie, the smallest label wins.
    """
    n_queries = dists.shape[0]
    ypred = np.zeros(n_queries)
    for row in range(n_queries):
        # Training indices ordered from nearest to farthest.
        order = np.argsort(dists[row])
        neighbour_labels = [ytrain[order[j]] for j in range(k)]

        # np.unique returns the distinct labels in ascending order together
        # with their vote counts; argmax picks the first maximum, so a tie
        # resolves to the smallest label.
        labels, votes = np.unique(neighbour_labels, return_counts=True)
        ypred[row] = labels[np.argmax(votes)]

    return ypred

#===========================Q4.3=====================================
def compute_accuracy(y,ypred):
    """Return the fraction of predictions in ``ypred`` that equal the true labels ``y``.

    Entries are compared position by position over the length of ``ypred``.
    The result is a Python float in [0, 1].
    """
    total = len(ypred)
    # Index into y explicitly so only the first len(ypred) labels are used.
    hits = sum(1 for idx in range(total) if y[idx] == ypred[idx])
    return float(hits)/total

#==========================Q4.4=====================================
def find_best_k(K,ytrain,dists,yval):
    """Evaluate each candidate k on the validation set and pick the most accurate.

    Parameters
    ----------
    K : list of candidate neighbour counts.
    ytrain : training labels, passed through to ``predict_labels``.
    dists : distance matrix between validation and training samples.
    yval : true validation labels.

    Returns
    -------
    (best_k, validation_accuracy)
        ``validation_accuracy[i]`` is the accuracy obtained with ``K[i]``;
        ``best_k`` is the first entry of ``K`` that reaches the highest
        accuracy. Each accuracy and the chosen k are also printed.
    """
    validation_accuracy = []
    for k in K:
        acc = compute_accuracy(yval, predict_labels(k, ytrain, dists))
        print("The validation accuracy is",acc,"when k =",k)
        validation_accuracy.append(acc)

    # max() over the positions returns the earliest index holding the top score.
    best_pos = max(range(len(validation_accuracy)),
                   key=validation_accuracy.__getitem__)
    best_k = K[best_pos]
    print(best_k)
    return best_k,validation_accuracy



#============================END=====================================

'''
Please DO NOT CHANGE ANY CODE below this line.
You should only write your code in the above functions.
'''

def data_processing(data):
    """Split the loaded dataset into numpy arrays for train/validation/test.

    ``data`` is indexed with the keys 'train', 'valid' and 'test'; each entry
    is indexed with [0] for the samples and [1] for the labels.

    Returns the tuple (Xtrain, ytrain, Xval, yval, Xtest, ytest), every
    element converted with ``np.array``.
    """
    train_set, valid_set,test_set = data['train'],data['valid'],data['test']
    # Position 0 holds the samples, position 1 the matching labels.
    Xtrain = train_set[0]
    ytrain = train_set[1]
    Xval = valid_set[0]
    yval = valid_set[1]
    Xtest = test_set[0]
    ytest = test_set[1]
    
    # Convert the samples to numpy arrays.
    # presumably each becomes 2-D (num_samples, num_features) -- TODO confirm against the JSON file
    Xtrain = np.array(Xtrain)
    Xval = np.array(Xval)
    Xtest = np.array(Xtest)
    
    # Convert the labels to numpy arrays.
    ytrain = np.array(ytrain)
    yval = np.array(yval)
    ytest = np.array(ytest)
    
    return Xtrain,ytrain,Xval,yval,Xtest,ytest
    
def main():
    """Run the full kNN experiment and write the results to a text file.

    Reads 'mnist_subset.json', evaluates every k in K on the validation
    split, evaluates the best k on the test split, and writes one
    '<k> <accuracy>' line per candidate followed by a final
    'test <accuracy>' line (no trailing newline) to 'knn_output.txt'.
    """
    input_file = 'mnist_subset.json'
    output_file = 'knn_output.txt'

    with open(input_file) as json_data:
        data = json.load(json_data)
    
    #==================Compute distance matrix=======================
    # Candidate neighbour counts evaluated on the validation set.
    K=[1,3,5,7,9]    
    
    Xtrain,ytrain,Xval,yval,Xtest,ytest = data_processing(data)
    
    # Validation-to-training distances, reused for every k below.
    dists = compute_distances(Xtrain,Xval)
    
    #===============Compute validation accuracy when k=5=============
    # NOTE(review): `acc` is computed here but never read again in this function.
    ypred = predict_labels(5,ytrain,dists)
    acc = compute_accuracy(yval,ypred)
    
    
    #==========select the best k by using validation set==============
    best_k,validation_accuracy = find_best_k(K,ytrain,dists,yval)

    
    #===============test the performance with your best k=============
    # `dists` is rebound to the test-to-training distances from here on.
    dists = compute_distances(Xtrain,Xtest)
    ypred = predict_labels(best_k,ytrain,dists)
    test_accuracy = compute_accuracy(ytest,ypred)
    
    #====================write your results to file===================
    # NOTE(review): the handle is closed manually; an exception raised by any
    # write below would skip f.close().
    f=open(output_file,'w')
    for i in range(len(K)):
        f.write('%d %.3f' % (K[i], validation_accuracy[i])+'\n')
    f.write('%s %.3f' % ('test', test_accuracy))
    f.close()
    
#if __name__ == "__main__":
#    main()

In [2]:
    input_file = 'mnist_subset.json'
    output_file = 'knn_output.txt'

    with open(input_file) as json_data:
        data = json.load(json_data)
    
    #==================Compute distance matrix=======================
    K=[1,3,5,7,9]    
    
    Xtrain,ytrain,Xval,yval,Xtest,ytest = data_processing(data)

In [3]:
a = np.arange(20).reshape(10,2)
print(a)

[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]


In [25]:
b = np.arange(6)
np.random.shuffle(b)
b = b.reshape(3,2)
print(b)

[[1 3]
 [5 0]
 [4 2]]


In [26]:
A = np.sum(a**2, axis=1)
print("A is")
print(A)
B = np.sum(b**2, axis=1)
print("B is")
print(B)
AB = np.dot(a,np.transpose(b))
print("AB is")
print(AB)

A is
[  1  13  41  85 145 221 313 421 545 685]
B is
[10 25 20]
AB is
[[  3   0   2]
 [ 11  10  14]
 [ 19  20  26]
 [ 27  30  38]
 [ 35  40  50]
 [ 43  50  62]
 [ 51  60  74]
 [ 59  70  86]
 [ 67  80  98]
 [ 75  90 110]]


In [37]:
-2*AB + A.reshape(10,1) + B

array([[  5,  26,  17],
       [  1,  18,   5],
       [ 13,  26,   9],
       [ 41,  50,  29],
       [ 85,  90,  65],
       [145, 146, 117],
       [221, 218, 185],
       [313, 306, 269],
       [421, 410, 369],
       [545, 530, 485]])

In [39]:
-2*AB + A.reshape(10,1) + np.transpose(B.reshape(3,1))

array([[  5,  26,  17],
       [  1,  18,   5],
       [ 13,  26,   9],
       [ 41,  50,  29],
       [ 85,  90,  65],
       [145, 146, 117],
       [221, 218, 185],
       [313, 306, 269],
       [421, 410, 369],
       [545, 530, 485]])

In [32]:
np.transpose(A)

array([  1,  13,  41,  85, 145, 221, 313, 421, 545, 685])

In [33]:
A

array([  1,  13,  41,  85, 145, 221, 313, 421, 545, 685])

In [34]:
A.reshape(10,1)

array([[  1],
       [ 13],
       [ 41],
       [ 85],
       [145],
       [221],
       [313],
       [421],
       [545],
       [685]])

In [35]:
B

array([10, 25, 20])

In [36]:
B.reshape(3,1)

array([[10],
       [25],
       [20]])

In [38]:
np.transpose(B.reshape(3,1))

array([[10, 25, 20]])

In [3]:
dists = compute_distances(Xtrain,Xval)

In [4]:
dists.shape[0]

1000

In [5]:
np.argsort(dists[1]) 

array([3372, 3502, 4817, ..., 2794, 1863, 2772])

In [8]:
np.argsort(dists[1])[0:5]

array([3372, 3502, 4817, 4988, 1409])

In [9]:
ytrain[np.argsort(dists[1])[0:5]]

array([0, 0, 0, 0, 0])

In [10]:
sort_list = np.argsort(dists[1])
closest_y = []
for j in range(5):
    closest_y.append(ytrain[sort_list[j]])
closest_y

[0, 0, 0, 0, 0]

In [12]:
count = [len(list(group)) for key, group in groupby(closest_y)]
key = [key for key, group in groupby(closest_y)]
print(count)
print(key)

[5]
[0]


In [14]:
aa = np.array([0,0,0,1,2,3])
count = [len(list(group)) for key, group in groupby(aa)]
key = [key for key, group in groupby(aa)]
print(count)
print(key)

[3, 1, 1, 1]
[0, 1, 2, 3]


In [23]:
bb = np.array([5,5,0,1,1,2,3,0])
for key, group in groupby(bb):
    print(key)
    print(list(group))

5
[5, 5]
0
[0]
1
[1, 1]
2
[2]
3
[3]
0
[0]


In [None]:


# Scratch draft of the per-query voting loop used by predict_labels.
# NOTE(review): `k` is not assigned in any cell shown here -- presumably it
# must be set in the notebook session before this cell is run.
num_test = dists.shape[0]
ypred = np.zeros(num_test)
for i in range(num_test):
    # Training indices ordered from nearest to farthest for query i.
    sort_list = np.argsort(dists[i])    
    closest_y = []
    for j in range(k):
        closest_y.append(ytrain[sort_list[j]])
    # groupby only merges adjacent equal items, so sort the labels first.
    closest_y.sort()    
    count = [len(list(group)) for key, group in groupby(closest_y)]
    key = [key for key, group in groupby(closest_y)]
        
    # Take the first label whose vote count equals the maximum.
    max_val = max(count)
    for temp in range(len(count)):
        if count[temp] == max_val:
            ypred[i] = key[temp]
            break