In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from tqdm import tqdm_notebook as tqdm

In [2]:
data_path = "data/"
gen_data_path = "generated/"
df_tr_file = "DS2_train"
df_test_file = "DS2"
df_val_file = "DS2_val"

In [3]:
df_test = pd.read_csv(gen_data_path + df_test_file, index_col = 'Unnamed: 0')
df_val = pd.read_csv(gen_data_path + df_val_file, index_col = 'Unnamed: 0')
df_tr = pd.read_csv(gen_data_path + df_tr_file, index_col = 'Unnamed: 0')

In [4]:
def get_x_y_values(dataset):
    return dataset.drop('class', axis = 1).values, dataset['class'].values

In [5]:
test_x, test_y = get_x_y_values(df_test)
val_x, val_y = get_x_y_values(df_val)
train_x, train_y = get_x_y_values(df_tr)

In [6]:
def get_dist(a, b):
    return distance.cdist(a, b, metric='euclidean')

def knn(k, xs, ys):
    TN, TP, FN, FP = 0, 0, 0, 0
    distances_indexes = np.array([])
    for x, y in zip(xs, ys):
        d = get_dist([x], train_x).flatten().flatten().argsort()[:k]
        distances_indexes = np.append(distances_indexes, d)
        
        # find the class of val_x based on k-nearest neighbors
        classes = [(train_y[int(i)]) for i in distances_indexes]
        y_pred = int(round(sum(classes)/len(classes)))
        
        if y_pred == y:
            if y_pred:
                TP += 1
            else: 
                TN += 1
        else: 
            if y_pred:
                FP += 1
            else: 
                FN += 1
            
    accuracy = (TN + TP)/len(ys)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F_measure = (2 * recall * precision) / (recall + precision)
    
    return np.array([accuracy, precision, recall, F_measure])


In [8]:
result = np.array([])
# trying k values ranging from [1, 50]
result = knn(1, val_x, val_y)
for k in tqdm(range(2,101)):
    result = np.vstack([result, knn(k, val_x, val_y)])





In [15]:
for k, (accuracy, precision, recall, F_measure) in enumerate(result):
   print("For k = %s, the accuracy: %.4f, "
         "precision: %.4f, recall: %.4f "
         "and F-measure: %.4f" % (k+1, accuracy, precision, recall, F_measure))

For k = 1, the accuracy: 0.4850, precision: 0.4696, recall: 0.2714 and F-measure: 0.3439
For k = 2, the accuracy: 0.5150, precision: 0.5110, recall: 0.5829 and F-measure: 0.5446
For k = 3, the accuracy: 0.4950, precision: 0.4948, recall: 0.7186 and F-measure: 0.5861
For k = 4, the accuracy: 0.5150, precision: 0.5309, recall: 0.2161 and F-measure: 0.3071
For k = 5, the accuracy: 0.5000, precision: 0.4986, recall: 0.8894 and F-measure: 0.6390
For k = 6, the accuracy: 0.4975, precision: 0.4975, recall: 0.9899 and F-measure: 0.6622
For k = 7, the accuracy: 0.4975, precision: 0.4975, recall: 0.9950 and F-measure: 0.6633
For k = 8, the accuracy: 0.5000, precision: 0.4987, recall: 0.9899 and F-measure: 0.6633
For k = 9, the accuracy: 0.5075, precision: 0.5027, recall: 0.9497 and F-measure: 0.6574
For k = 10, the accuracy: 0.5175, precision: 0.5086, recall: 0.8894 and F-measure: 0.6472
For k = 11, the accuracy: 0.4975, precision: 0.4975, recall: 0.9899 and F-measure: 0.6622
For k = 12, the acc

In [9]:
max_f1 = np.max(result[:,3])
f1_pos = np.where(result[:,3] == max_f1)[0][0]

In [10]:
print("The best f1 score is %s with k = %s" % (max_f1, f1_pos + 1))

The best f1 score is 0.669014084507 with k = 37


In [13]:
test_metrics = knn(37, test_x, test_y)

In [14]:
print("For k = %s, the accuracy: %.4f, "
     "precision: %.4f, recall: %.4f "
     "and F-measure: %.4f" % (f1_pos + 1, test_metrics[0], test_metrics[1], test_metrics[2], test_metrics[3]))

For k = 37, the accuracy: 0.4800, precision: 0.4875, recall: 0.8794 and F-measure: 0.6272
