In [1]:
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from collections import Counter

In [2]:
def eval_model(model, test_data, test_labels):
    """
    Score a fitted model on held-out data.

    The model's predictions are rounded to zero decimals before they are
    compared against test_labels.

    :param model: object exposing predict(test_data)
    :param test_data: samples handed to model.predict
    :param test_labels: reference labels for test_data
    :return: the value of accuracy_score(test_labels, rounded predictions)
    """
    raw_predictions = np.array(model.predict(test_data))
    rounded_predictions = list(np.around(raw_predictions, 0))
    return accuracy_score(test_labels, rounded_predictions)

In [3]:
def other_class(n_classes, current_class):
    """
    Draw one class index at random from all classes except current_class.

    :param n_classes: number of classes in the task; valid indices are 0 .. n_classes - 1
    :param current_class: the class index to be excluded from the draw
    :return: one randomly chosen class index that != current_class
    :raises ValueError: if current_class is outside [0, n_classes - 1], if it
        does not equal any index in that range, or if no other class exists
    """
    if current_class < 0 or current_class >= n_classes:
        error_str = "current_class must be within the range [0, n_classes - 1]"
        raise ValueError(error_str)

    # list.remove compares with ==, so a float label such as 2.0 removes index 2.
    candidates = list(range(n_classes))
    candidates.remove(current_class)
    if not candidates:
        # Only one class exists: fail with a clear message instead of the
        # error np.random.choice raises for an empty population.
        raise ValueError("n_classes must be at least 2 to pick another class")
    return np.random.choice(candidates)

def inject_noise(n_classes, y_, noise_level):
    """
    Return a copy of y_ in which a share of the labels has been replaced.

    int(len(y_) * noise_level / 100.0) distinct positions are drawn without
    replacement, and the label at each one is replaced by the result of
    other_class(n_classes, label). The input y_ itself is left untouched.

    :param n_classes: number of classes, forwarded to other_class
    :param y_: label sequence providing copy(), len() and item assignment
    :param noise_level: percentage of labels to replace, between 0 and 100
    :return: the modified copy of y_
    :raises ValueError: if noise_level is below 0 or above 100
    """
    noisy = y_.copy()
    if noise_level < 0 or noise_level > 100:
        raise ValueError('Noise level can not be bigger than 100 or smaller than 0')

    n_total = len(noisy)
    n_flip = int(n_total * noise_level / 100.0)
    flip_positions = np.random.choice(n_total, n_flip, replace=False)
    for pos in flip_positions:
        noisy[pos] = other_class(n_classes, noisy[pos])

    return noisy

In [4]:
# All four files share the prefix "data/<dataset>-"
dataset = "tasks-quarter-original"
# Training split: loaded in full
train_data = np.loadtxt("data/" + dataset + "-train-data.txt")
train_labels = np.loadtxt("data/" + dataset + "-train-labels.txt")
# Test split: only the first 6000 rows of each file are kept
test_data = np.loadtxt("data/" + dataset + "-test-data.txt")[:6000]
test_labels = np.loadtxt("data/" + dataset + "-test-labels.txt")[:6000]
# Number of distinct label values that occur in the training labels
n_classes = len(set(train_labels))

In [5]:
# The original task labels are [2,3,4,5]; shift them to the zero-based
# classes [0,1,2,3] used by the rest of the notebook.
# The subtraction is vectorized and in place, replacing the element-by-element loops.
# NOTE: this cell is not idempotent -- running it a second time without
# reloading the data shifts the labels by another 2.
train_labels -= 2
test_labels -= 2

In [6]:
# Distinct label values present in the training labels
print(Counter(train_labels).keys())
# How many times each of those labels occurs
print(Counter(train_labels).values())

[0.0, 1.0, 2.0, 3.0]
[3, 39, 10794, 3164]


In [7]:
# Distinct label values present in the test labels
print(Counter(test_labels).keys())
# How many times each of those labels occurs
print(Counter(test_labels).values())

[1.0, 2.0, 3.0]
[10, 4675, 1315]


KNN task test: the results averaged over 10 runs are shown at the end of the output

In [12]:
# KNN: test accuracy as a function of the training-label noise level.
# Labels are re-corrupted and the model refitted for every (repetition, noise) pair.
# NOTE(review): np.random is not seeded here, so the per-run numbers change
# between executions -- consider seeding once near the top of the notebook.
knnmodel = KNeighborsClassifier(n_neighbors=4, weights = 'distance')
noise_levels = [0,10,20,30,40,50,60,70,80,90,100]
acc_list_knn = []
repeat = 10
for i in range(repeat):
    for noise in noise_levels:
        train_noisy_labels = inject_noise(n_classes, train_labels, noise)
        knnmodel.fit(train_data, train_noisy_labels)
        acc = eval_model(knnmodel, test_data, test_labels)
        acc_list_knn.append(acc)
        print("noise_level, accuracy", noise, acc)
# acc_list_knn is ordered repetition-major: one row per repetition and one
# column per noise level. The column count is taken from noise_levels, so
# editing that list cannot desynchronise the averaging.
avg_result = np.reshape(acc_list_knn, (repeat, len(noise_levels))).sum(axis=0)
average = avg_result/repeat
print(average)

('noise_level, accuracy', 0, 0.8621666666666666)
('noise_level, accuracy', 10, 0.8115)
('noise_level, accuracy', 20, 0.7451666666666666)
('noise_level, accuracy', 30, 0.6673333333333333)
('noise_level, accuracy', 40, 0.5836666666666667)
('noise_level, accuracy', 50, 0.47683333333333333)
('noise_level, accuracy', 60, 0.3988333333333333)
('noise_level, accuracy', 70, 0.29283333333333333)
('noise_level, accuracy', 80, 0.2055)
('noise_level, accuracy', 90, 0.11383333333333333)
('noise_level, accuracy', 100, 0.04533333333333334)
('noise_level, accuracy', 0, 0.8621666666666666)
('noise_level, accuracy', 10, 0.8065)
('noise_level, accuracy', 20, 0.7423333333333333)
('noise_level, accuracy', 30, 0.6643333333333333)
('noise_level, accuracy', 40, 0.5876666666666667)
('noise_level, accuracy', 50, 0.49333333333333335)
('noise_level, accuracy', 60, 0.3983333333333333)
('noise_level, accuracy', 70, 0.30516666666666664)
('noise_level, accuracy', 80, 0.19966666666666666)
('noise_level, accuracy', 90, 

MLP task test: the results averaged over 10 runs are shown at the end of the output

In [13]:
# MLP: test accuracy as a function of the training-label noise level.
# Labels are re-corrupted and the model refitted for every (repetition, noise) pair.
# NOTE(review): the model uses random_state=1, but np.random (used for the
# label noise) is not seeded here, so per-run numbers change between executions.
mlpmodel = MLPClassifier(solver='adam', hidden_layer_sizes=(28,28), random_state=1)
noise_levels = [0,10,20,30,40,50,60,70,80,90,100]
acc_list_mlp = []
repeat = 10
for i in range(repeat):
    for noise in noise_levels:
        train_noisy_labels = inject_noise(n_classes, train_labels, noise)
        mlpmodel.fit(train_data, train_noisy_labels)
        acc = eval_model(mlpmodel, test_data, test_labels)
        acc_list_mlp.append(acc)
        print("noise_level, accuracy", noise, acc)
# acc_list_mlp is ordered repetition-major: one row per repetition and one
# column per noise level. The column count is taken from noise_levels, so
# editing that list cannot desynchronise the averaging.
avg_result = np.reshape(acc_list_mlp, (repeat, len(noise_levels))).sum(axis=0)
average = avg_result/repeat
print(average)



('noise_level, accuracy', 0, 0.8305)
('noise_level, accuracy', 10, 0.829)
('noise_level, accuracy', 20, 0.824)
('noise_level, accuracy', 30, 0.816)
('noise_level, accuracy', 40, 0.7866666666666666)
('noise_level, accuracy', 50, 0.768)
('noise_level, accuracy', 60, 0.6741666666666667)
('noise_level, accuracy', 70, 0.42283333333333334)
('noise_level, accuracy', 80, 0.14016666666666666)
('noise_level, accuracy', 90, 0.057666666666666665)
('noise_level, accuracy', 100, 0.030166666666666668)
('noise_level, accuracy', 0, 0.8305)
('noise_level, accuracy', 10, 0.8293333333333334)
('noise_level, accuracy', 20, 0.8193333333333334)
('noise_level, accuracy', 30, 0.8036666666666666)
('noise_level, accuracy', 40, 0.796)
('noise_level, accuracy', 50, 0.741)
('noise_level, accuracy', 60, 0.6738333333333333)
('noise_level, accuracy', 70, 0.4651666666666667)
('noise_level, accuracy', 80, 0.1385)
('noise_level, accuracy', 90, 0.027833333333333335)
('noise_level, accuracy', 100, 0.021)
('noise_level, accu

NearestCentroid task test: the results averaged over 10 runs are shown at the end of the output

In [33]:
# NearestCentroid: test accuracy as a function of the training-label noise level.
# Labels are re-corrupted and the model refitted for every (repetition, noise) pair.
# NOTE(review): np.random is not seeded here, so the per-run numbers change
# between executions -- consider seeding once near the top of the notebook.
ncmodel = NearestCentroid()
noise_levels = [0,10,20,30,40,50,60,70,80,90,100]
acc_list_nc = []
repeat = 10
for i in range(repeat):
    for noise in noise_levels:
        train_noisy_labels = inject_noise(n_classes, train_labels, noise)
        ncmodel.fit(train_data, train_noisy_labels)
        acc = eval_model(ncmodel, test_data, test_labels)
        acc_list_nc.append(acc)
        print("noise_level, accuracy", noise, acc)
# acc_list_nc is ordered repetition-major: one row per repetition and one
# column per noise level. The column count is taken from noise_levels, so
# editing that list cannot desynchronise the averaging.
avg_result = np.reshape(acc_list_nc, (repeat, len(noise_levels))).sum(axis=0)
average = avg_result/repeat
print(average)

('noise_level, accuracy', 0, 0.533)
('noise_level, accuracy', 10, 0.45666666666666667)
('noise_level, accuracy', 20, 0.461)
('noise_level, accuracy', 30, 0.4176666666666667)
('noise_level, accuracy', 40, 0.4315)
('noise_level, accuracy', 50, 0.4146666666666667)
('noise_level, accuracy', 60, 0.23416666666666666)
('noise_level, accuracy', 70, 0.255)
('noise_level, accuracy', 80, 0.19083333333333333)
('noise_level, accuracy', 90, 0.2796666666666667)
('noise_level, accuracy', 100, 0.30416666666666664)
('noise_level, accuracy', 0, 0.533)
('noise_level, accuracy', 10, 0.4523333333333333)
('noise_level, accuracy', 20, 0.391)
('noise_level, accuracy', 30, 0.3476666666666667)
('noise_level, accuracy', 40, 0.3486666666666667)
('noise_level, accuracy', 50, 0.3428333333333333)
('noise_level, accuracy', 60, 0.1945)
('noise_level, accuracy', 70, 0.24333333333333335)
('noise_level, accuracy', 80, 0.179)
('noise_level, accuracy', 90, 0.1435)
('noise_level, accuracy', 100, 0.25783333333333336)
('noise_