In [1]:
#
# 6/2/2021
# University of Oregon
# CIS 472: Machine Learning
# Instructor: Thien Huu Nguyen
#
# Final Project
# Part 2: image classification on [mnist_fashion] dataset,
# using [KNN]
# Submit: Warren Liu
# 

In [2]:
from tensorflow import keras
import numpy as np
from tqdm import tqdm
import time

In [3]:
# Values of k (number of nearest neighbours that vote) that main() loops over.
k_num = [5, 7, 9, 11]

In [4]:
def load_data():
    """Fetch the Fashion-MNIST splits through Keras and rescale the images.

    Returns:
        ((train_x, train_y), (test_x, test_y)) where both image arrays have
        been divided by 255.0 and the label arrays are passed through exactly
        as Keras returned them.
    """
    train_split, test_split = keras.datasets.fashion_mnist.load_data()
    train_x, train_y = train_split
    test_x, test_y = test_split

    # Rescale the images by 1/255 (float division, so the results are floats).
    train_x = train_x / 255.0
    test_x = test_x / 255.0
    return (train_x, train_y), (test_x, test_y)

In [5]:
# Euclidean distance = sqrt((x1 - y1)^2 + (x2 - y2)^2 + ... + (xn - yn)^2)
def euclidean_dis(image1, image2):
    """Return the Euclidean (L2) distance between two images.

    Args:
        image1: array-like of pixel values.
        image2: array-like of pixel values, broadcast-compatible with image1.

    Returns:
        The square root of the summed squared element-wise differences,
        as a NumPy float64 scalar.
    """
    # Do the arithmetic in float64. Subtracting and squaring in an unsigned
    # integer dtype (e.g. raw uint8 pixels) wraps around modulo 256 and
    # silently yields a wrong distance; float64 inputs are used as-is.
    diff = np.asarray(image1, dtype=np.float64) - np.asarray(image2, dtype=np.float64)
    return np.sum(diff ** 2) ** (0.5)

In [6]:
# L0-style distance: each image is reduced to a 0/1 mask (1 where the pixel is
# non-zero, 0 elsewhere) and the masks are compared.
def l0_dis(image1, image2):
    """Return the distance between the non-zero masks of two images.

    Each image is binarised (non-zero pixel -> 1, zero pixel -> 0) and the
    Euclidean distance between the two masks is returned. Because the masks
    only contain 0 and 1, this equals the square root of the number of
    positions where exactly one of the images is non-zero.

    Args:
        image1: array-like of pixel values.
        image2: array-like of pixel values, broadcast-compatible with image1.

    Returns:
        sqrt(number of mismatching mask positions) as a NumPy float64 scalar.
        Neither input is modified.
    """
    # Test for "!= 0" rather than "> 0" so negative pixel values also count as
    # non-zero, as the description above states. Boolean masks also avoid
    # copying both images on every call.
    mask1 = np.asarray(image1) != 0
    mask2 = np.asarray(image2) != 0
    mismatches = np.count_nonzero(mask1 != mask2)
    return np.float64(mismatches) ** (0.5)

In [7]:
def main():
    """Evaluate brute-force k-NN for every (k, distance function) pair.

    Loads the data with load_data(), then for each k in k_num and each of
    euclidean_dis / l0_dis classifies the first 1/100 of the test images by
    majority vote among the k nearest training images. One summary line is
    printed per combination; the value labelled "Loss" in that line is the
    number of misclassified test images.
    """
    # Load data
    (train_image, train_labe), (test_image, test_lable) = load_data()
    num_train = np.shape(train_labe)[0]
    # The full test set (10,000 images per the original author's note) is far
    # too slow here -- reported as about 8 hours -- so only the first 1/100 of
    # it is evaluated.
    num_test = np.shape(test_lable)[0] // 100
    distance_fn = [euclidean_dis, l0_dis]

    for k in k_num:
        for dis_fn in distance_fn:
            start_time = time.time()
            num_error = 0

            for i in tqdm(range(num_test), desc=dis_fn.__name__):
                query = test_image[i]
                # Distance from the query to every training image. Filling a
                # pre-sized array once avoids the reallocation that np.append
                # performs on every single call.
                dis_arr = np.fromiter(
                    (dis_fn(train_image[j], query) for j in range(num_train)),
                    dtype=np.float64,
                    count=num_train,
                )

                # Majority vote among the k nearest training labels. np.unique
                # returns sorted values and argmax takes the first maximum, so
                # a tie goes to the smallest label.
                s_idx = np.argsort(dis_arr)
                k_lable = train_labe[s_idx[:k]]
                val, cnt = np.unique(k_lable, return_counts=True)
                if val[np.argmax(cnt)] != test_lable[i]:
                    num_error += 1

            time_used = time.time() - start_time
            template = '{}, k={} - Loss: {}, Accuracy: {}%, Time: {:.2f}s'
            print(template.format(dis_fn.__name__,
                                  k,
                                  num_error,
                                  ((num_test - num_error) / num_test * 100.0),
                                  time_used))


In [8]:
# Run the experiment only when this code is executed as the main module.
if __name__ == '__main__':
    main()

euclidean_dis: 100%|█████████████████████████████████████████████████████████████████| 100/100 [02:05<00:00,  1.26s/it]
l0_dis:   0%|                                                                                  | 0/100 [00:00<?, ?it/s]

euclidean_dis, k=5 - Loss: 13, Accuracy: 87.0%, Time: 125.83s


l0_dis: 100%|████████████████████████████████████████████████████████████████████████| 100/100 [03:11<00:00,  1.91s/it]
euclidean_dis:   0%|                                                                           | 0/100 [00:00<?, ?it/s]

l0_dis, k=5 - Loss: 16, Accuracy: 84.0%, Time: 191.47s


euclidean_dis: 100%|█████████████████████████████████████████████████████████████████| 100/100 [02:01<00:00,  1.21s/it]
l0_dis:   0%|                                                                                  | 0/100 [00:00<?, ?it/s]

euclidean_dis, k=7 - Loss: 16, Accuracy: 84.0%, Time: 121.04s


l0_dis: 100%|████████████████████████████████████████████████████████████████████████| 100/100 [03:09<00:00,  1.90s/it]
euclidean_dis:   0%|                                                                           | 0/100 [00:00<?, ?it/s]

l0_dis, k=7 - Loss: 13, Accuracy: 87.0%, Time: 189.54s


euclidean_dis: 100%|█████████████████████████████████████████████████████████████████| 100/100 [01:56<00:00,  1.17s/it]
l0_dis:   0%|                                                                                  | 0/100 [00:00<?, ?it/s]

euclidean_dis, k=9 - Loss: 15, Accuracy: 85.0%, Time: 116.68s


l0_dis: 100%|████████████████████████████████████████████████████████████████████████| 100/100 [03:11<00:00,  1.91s/it]
euclidean_dis:   0%|                                                                           | 0/100 [00:00<?, ?it/s]

l0_dis, k=9 - Loss: 15, Accuracy: 85.0%, Time: 191.41s


euclidean_dis: 100%|█████████████████████████████████████████████████████████████████| 100/100 [02:00<00:00,  1.20s/it]
l0_dis:   0%|                                                                                  | 0/100 [00:00<?, ?it/s]

euclidean_dis, k=11 - Loss: 17, Accuracy: 83.0%, Time: 120.02s


l0_dis: 100%|████████████████████████████████████████████████████████████████████████| 100/100 [03:09<00:00,  1.90s/it]

l0_dis, k=11 - Loss: 15, Accuracy: 85.0%, Time: 189.59s



