In [2]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from tqdm import tqdm_notebook as tqdm

In [3]:
data_path = "data/"
gen_data_path = "generated/"
df_tr_file = "DS1_train"
df_test_file = "DS1"
df_val_file = "DS1_val"

In [4]:
df_test = pd.read_csv(gen_data_path + df_test_file, index_col = 'Unnamed: 0')
df_val = pd.read_csv(gen_data_path + df_val_file, index_col = 'Unnamed: 0')
df_tr = pd.read_csv(gen_data_path + df_tr_file, index_col = 'Unnamed: 0')

In [5]:
def get_x_y_values(dataset):
    return dataset.drop('class', axis = 1).values, dataset['class'].values

In [6]:
test_x, test_y = get_x_y_values(df_test)
val_x, val_y = get_x_y_values(df_val)
train_x, train_y = get_x_y_values(df_tr)

In [7]:
def get_dist(a, b):
    return distance.cdist(a, b, metric='euclidean')

def knn(k, xs, ys):
    TN, TP, FN, FP = 0, 0, 0, 0
    distances_indexes = np.array([])
    for x, y in zip(xs, ys):
        d = get_dist([x], train_x).flatten().flatten().argsort()[:k]
        distances_indexes = np.append(distances_indexes, d)
        
        # find the class of val_x based on k-nearest neighbors
        classes = [(train_y[int(i)]) for i in distances_indexes]
        y_pred = int(round(sum(classes)/len(classes)))
        
        if y_pred == y:
            if y_pred:
                TP += 1
            else: 
                TN += 1
        else: 
            if y_pred:
                FP += 1
            else: 
                FN += 1
            
    accuracy = (TN + TP)/len(ys)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F_measure = (2 * recall * precision) / (recall + precision)
    
    return np.array([accuracy, precision, recall, F_measure])


In [14]:
result = np.array([])
# trying k values ranging from [1, 50]
result = knn(1, val_x, val_y)
for k in tqdm(range(2,101)):
    result = np.vstack([result, knn(k, val_x, val_y)])




In [15]:
for k, (accuracy, precision, recall, F_measure) in enumerate(result):
   print("For k = %s, the accuracy: %.4f, "
         "precision: %.4f, recall: %.4f "
         "and F-measure: %.4f" % (k+1, accuracy, precision, recall, F_measure))

For k = 1, the accuracy: 0.5300, precision: 0.5095, recall: 0.9590 and F-measure: 0.6655
For k = 2, the accuracy: 0.5200, precision: 0.5041, recall: 0.9385 and F-measure: 0.6559
For k = 3, the accuracy: 0.5525, precision: 0.5312, recall: 0.6974 and F-measure: 0.6031
For k = 4, the accuracy: 0.5350, precision: 0.5150, recall: 0.7949 and F-measure: 0.6250
For k = 5, the accuracy: 0.5425, precision: 0.5214, recall: 0.7487 and F-measure: 0.6147
For k = 6, the accuracy: 0.5500, precision: 0.5273, recall: 0.7436 and F-measure: 0.6170
For k = 7, the accuracy: 0.5475, precision: 0.5255, recall: 0.7385 and F-measure: 0.6141
For k = 8, the accuracy: 0.4950, precision: 0.4737, recall: 0.3231 and F-measure: 0.3841
For k = 9, the accuracy: 0.5150, precision: 0.5022, recall: 0.5897 and F-measure: 0.5425
For k = 10, the accuracy: 0.5700, precision: 0.5392, recall: 0.8103 and F-measure: 0.6475
For k = 11, the accuracy: 0.5450, precision: 0.5302, recall: 0.5846 and F-measure: 0.5561
For k = 12, the acc

In [20]:
max_f1 = np.max(result[:,3])
f1_pos = np.where(result[:,3] == max_f1)[0][0]

In [21]:
print("The best f1 score is %s with k = %s" % (max_f1, f1_pos + 1))

The best f1 score is 0.665480427046 with k = 1


In [28]:
test_metrics = knn(1, test_x, test_y)

In [29]:
print("For k = %s, the accuracy: %.4f, "
     "precision: %.4f, recall: %.4f "
     "and F-measure: %.4f" % (f1_pos + 1, test_metrics[0], test_metrics[1], test_metrics[2], test_metrics[3]))

For k = 1, the accuracy: 0.4775, precision: 0.4917, recall: 0.7255 and F-measure: 0.5861
