In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from tqdm import tqdm_notebook as tqdm

In [2]:
# Base directories used when building file paths.
data_path, gen_data_path = "data/", "generated/"

# File names of the DS1 train / test / validation splits.
df_tr_file, df_test_file, df_val_file = "DS1_train", "DS1", "DS1_val"

In [3]:
def _load_split(file_name):
    """Read one CSV from ``gen_data_path``, indexed by its 'Unnamed: 0' column."""
    return pd.read_csv(gen_data_path + file_name, index_col='Unnamed: 0')


# Load the three splits in the same order as before: test, validation, train.
df_test = _load_split(df_test_file)
df_val = _load_split(df_val_file)
df_tr = _load_split(df_tr_file)

In [4]:
# Display the (rows, columns) dimensions of the training frame as a sanity check.
df_tr.shape

(1200, 21)

In [5]:
def get_x_y_values(dataset):
    """Split a labelled DataFrame into a feature array and a label array.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``'class'`` column alongside the feature columns.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is the ``.values`` of every
        column except ``'class'`` and ``labels`` is the ``.values`` of the
        ``'class'`` column.
    """
    feature_frame = dataset.drop(columns='class')
    label_series = dataset['class']
    return feature_frame.values, label_series.values

In [6]:
# Split each frame into its feature matrix and label vector
# (order: test, validation, train).
(test_x, test_y), (val_x, val_y), (train_x, train_y) = (
    get_x_y_values(frame) for frame in (df_test, df_val, df_tr)
)

In [11]:
def get_dist(a, b):
    """Return the pairwise Euclidean distances between the rows of ``a`` and ``b``.

    Both arguments are handed straight to ``scipy.spatial.distance.cdist``, so
    each must be a 2-D array-like. The result has one row per row of ``a`` and
    one column per row of ``b``.
    """
    return distance.cdist(a, b, metric='euclidean')


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def knn(k, xs, ys, train_xs=None, train_ys=None):
    """Evaluate a binary k-nearest-neighbours classifier on ``(xs, ys)``.

    Each row of ``xs`` is classified by a majority vote among the labels of its
    ``k`` nearest training points (Euclidean distance). Only the neighbours of
    the sample being classified take part in its vote. An exact tie is rounded
    half-to-even and therefore predicts class 0.

    Parameters
    ----------
    k : int
        Number of neighbours to consult; must be at least 1. If ``k`` exceeds
        the number of training points, all training points vote.
    xs : 2-D array-like
        Samples to classify, one per row.
    ys : 1-D array-like
        True labels of ``xs``; the metrics treat non-zero as the positive class.
    train_xs, train_ys : array-like, optional
        Training features and labels. When omitted, the notebook-level
        ``train_x`` / ``train_y`` are used.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, F_measure)``. A metric whose
        denominator is zero is reported as 0.0 instead of raising.

    Raises
    ------
    ValueError
        If ``k < 1`` or ``xs`` and ``ys`` have different lengths.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # Fall back to the notebook-level training split when none is supplied.
    if train_xs is None:
        train_xs = train_x
    if train_ys is None:
        train_ys = train_y

    xs = np.asarray(xs)
    ys = np.asarray(ys)
    train_ys = np.asarray(train_ys)

    if len(xs) != len(ys):
        raise ValueError(
            "xs and ys must have the same length, got %d and %d" % (len(xs), len(ys))
        )
    if len(ys) == 0:
        # Nothing to evaluate: report zeros rather than dividing by zero.
        return 0.0, 0.0, 0.0, 0.0

    # One distance row per query sample, computed in a single call. The stable
    # sort makes tie-breaking deterministic (lower training index first).
    nearest = np.argsort(get_dist(xs, train_xs), axis=1, kind='stable')[:, :k]

    # Majority vote over each sample's own neighbours; rint rounds half to
    # even, so a 50/50 split yields class 0.
    y_pred = np.rint(train_ys[nearest].mean(axis=1)).astype(int)

    predicted_positive = y_pred != 0
    correct = y_pred == ys
    TP = int(np.count_nonzero(correct & predicted_positive))
    TN = int(np.count_nonzero(correct & ~predicted_positive))
    FP = int(np.count_nonzero(~correct & predicted_positive))
    FN = int(np.count_nonzero(~correct & ~predicted_positive))

    accuracy = (TN + TP) / len(ys)
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    F_measure = _safe_ratio(2 * recall * precision, recall + precision)

    return accuracy, precision, recall, F_measure


In [12]:
# Evaluate the classifier on the validation split for every k in [1, 100);
# entry i of `result` holds the metrics tuple for k = i + 1.
result = list()
for k in tqdm(range(1, 100)):
    metrics_for_k = knn(k, val_x, val_y)
    result.append(metrics_for_k)





In [17]:
# for k, (accuracy, precision, recall, F_measure) in enumerate(result):
#    print("For k = %s, the accuracy: %.4f, "
#          "precision: %.4f, recall: %.4f "
#          "and F-measure: %.4f" % (k+1, accuracy, precision, recall, F_measure))

'\nfor k, (accuracy, precision, recall, F_measure) in enumerate(result):\n    print("For k = %s, the accuracy: %.4f, "\n          "precision: %.4f, recall: %.4f "\n          "and F-measure: %.4f" % (k+1, accuracy, precision, recall, F_measure))\n'