In [1]:
# import the usual libraries for machine learning and data science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read CSV

In [9]:
import os

# Directory that holds the pre-split CSV files.
DATASET_DIR = "Dataset"

# Base names (extension stripped) of the CSV files only. os.listdir returns every
# directory entry, in arbitrary order, so filter on the ".csv" suffix and sort to
# make the load order reproducible from one run/machine to the next.
file_list = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir(DATASET_DIR)
    if file.endswith(".csv")
)

# Split the base names into train and test files using the word "splitTrain" / "splitTest".
file_list_train = [file for file in file_list if "splitTrain" in file]
file_list_test = [file for file in file_list if "splitTest" in file]

# Dataset name (file name with "splitTrain_" removed) -> training DataFrame.
data_train = {}
for file in file_list_train:
    data_train[file.replace("splitTrain_", "")] = pd.read_csv(os.path.join(DATASET_DIR, file + ".csv"))

# Dataset name (file name with "splitTest_" removed) -> test DataFrame.
data_test = {}
for file in file_list_test:
    data_test[file.replace("splitTest_", "")] = pd.read_csv(os.path.join(DATASET_DIR, file + ".csv"))


# KNN sklearn

In [60]:
# import f1 score metric from sklearn
from sklearn.metrics import f1_score

# function to get dictionary of f1 score prediction for each data train and data test using KNeighborsClassifier
def get_f1_score(data_train, data_test, k):
    """Score scikit-learn's KNeighborsClassifier on every train/test dataset pair.

    Parameters
    ----------
    data_train, data_test : dict
        Map a dataset name to a DataFrame that contains the feature columns
        plus a "HeartDisease" label column. Both dicts are indexed with the
        same keys.
    k : int
        Forwarded to the classifier as ``n_neighbors``.

    Returns
    -------
    dict
        ``{dataset name: F1 score on the test split}``. Datasets whose name
        contains "stdScaled" are skipped. ``f1_score`` is called with its
        default arguments.
    """
    scores = {}
    for name, train_frame in data_train.items():
        if "stdScaled" in name:
            continue

        # Separate features from the label column for both splits.
        train_features = train_frame.drop(columns=["HeartDisease"])
        train_labels = train_frame["HeartDisease"]
        test_frame = data_test[name]
        test_features = test_frame.drop(columns=["HeartDisease"])
        test_labels = test_frame["HeartDisease"]

        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(train_features, train_labels)
        scores[name] = f1_score(test_labels, classifier.predict(test_features))

    return scores

In [69]:
# Score every dataset with scikit-learn's KNN (k=5) and tabulate the
# {dataset name: F1 score} result as a two-column DataFrame.
list_f1_score_knnsklearn = pd.DataFrame(
    get_f1_score(data_train, data_test, k=5).items(),
    columns=["Dataset", "F1 Score"],
)

In [74]:
# Rank the datasets from highest to lowest F1 score, then display the table.
list_f1_score_knnsklearn = list_f1_score_knnsklearn.sort_values("F1 Score", ascending=False)
list_f1_score_knnsklearn

Unnamed: 0,Dataset,F1 Score
0,df_encoded_minmaxScaled,0.845455
5,df_DF_deleteOutlier_encoded_minmaxScaled,0.838235
3,df_DF_encoded_minmaxScaled,0.827907
2,df_deleteOutlier_encoded_minmaxScaled,0.827068
4,df_DF_modifiedOutlier_encoded_minmaxScaled,0.816327
1,df_modifiedOutlier_encoded_minmaxScaled,0.808511


# KNN from scratch

2.081257965385335

In [75]:
from collections import Counter

def minkowski_distance(x, y, p):
    """Return the order-``p`` Minkowski distance between the points ``x`` and ``y``.

    The element-wise difference ``x - y`` is reduced with ``np.linalg.norm``
    using ``ord=p``; for 1-D inputs ``p=1`` is the Manhattan distance and
    ``p=2`` the Euclidean distance.
    """
    offset = x - y
    return np.linalg.norm(offset, ord=p)

# create class that represent k-nearest-neighbors
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Every query row is compared against every training row with
    ``self.distance_function`` (the module's ``minkowski_distance`` by
    default) and receives the most common label among its ``k`` closest
    training rows.

    Parameters
    ----------
    k : int, default 5
        Number of neighbours that vote on each prediction.
    p : int or float, default 2
        Order passed to the distance function as ``p``.
    """

    def __init__(self, k=5, p = 2):
        self.k = k
        self.p = p
        self.distance_function = minkowski_distance

    def fit(self, X, y):
        """Store the training features ``X`` and labels ``y``; no computation happens here."""
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : DataFrame or array-like of shape (n_queries, n_features)
            Numeric query rows. When both ``X`` and the training features are
            DataFrames, the query columns are selected by the training column
            labels (a missing column raises ``KeyError``); otherwise columns
            are matched by position.

        Returns
        -------
        numpy.ndarray
            One predicted label per query row, using the dtype of the training
            labels (so integer labels stay integers and string labels work).
            Ties in the vote go to the label whose first occurrence among the
            neighbours is nearest.
        """
        train_features = self.X_train
        query_features = X
        # The pairwise distance used to subtract two pandas rows, which aligns
        # on column labels. Keep that guarantee by ordering the query columns
        # like the training columns before dropping down to NumPy.
        if isinstance(query_features, pd.DataFrame) and isinstance(train_features, pd.DataFrame):
            query_features = query_features[list(train_features.columns)]

        # Convert once, outside the loops: per-pair ``.iloc`` row lookups
        # dominated the running time of the nested loop.
        train_matrix = np.asarray(train_features, dtype=float)
        query_matrix = np.asarray(query_features, dtype=float)
        train_labels = np.asarray(self.y_train)

        predictions = []
        for query_row in query_matrix:
            # distance from this query row to every training row
            distances = np.array([
                self.distance_function(query_row, train_row, p=self.p)
                for train_row in train_matrix
            ])
            nearest_indices = np.argsort(distances)[:self.k]
            # Counter keeps insertion order, so most_common breaks ties in
            # favour of the label seen first (i.e. the closest neighbour).
            votes = Counter(train_labels[nearest_indices].tolist())
            predictions.append(votes.most_common(1)[0][0])

        # Collect in a list instead of a pre-allocated float array so that
        # non-numeric labels no longer raise on assignment.
        return np.asarray(predictions, dtype=train_labels.dtype)

In [76]:
def get_f1_score_knn_scratch(data_train, data_test, k):
    """Score the from-scratch ``KNN`` class on every train/test dataset pair.

    Parameters
    ----------
    data_train, data_test : dict
        Map a dataset name to a DataFrame that contains the feature columns
        plus a "HeartDisease" label column. Both dicts are indexed with the
        same keys.
    k : int
        Number of neighbours handed to ``KNN``.

    Returns
    -------
    dict
        ``{dataset name: F1 score on the test split}``. Datasets whose name
        contains "stdScaled" are skipped. ``f1_score`` is called with its
        default arguments.
    """
    scores = {}
    for name, train_frame in data_train.items():
        if "stdScaled" in name:
            continue

        # Separate features from the label column for both splits.
        train_features = train_frame.drop(columns=["HeartDisease"])
        train_labels = train_frame["HeartDisease"]
        test_frame = data_test[name]
        test_features = test_frame.drop(columns=["HeartDisease"])
        test_labels = test_frame["HeartDisease"]

        model = KNN(k=k)
        model.fit(train_features, train_labels)
        scores[name] = f1_score(test_labels, model.predict(test_features))

    return scores

In [77]:
# Evaluate the from-scratch KNN (k=5) on every dataset; the result is a
# {dataset name: F1 score} dict.
list_f1_score_knnscratch = get_f1_score_knn_scratch(
    data_train=data_train,
    data_test=data_test,
    k=5,
)

In [78]:
# Turn the {dataset name: F1 score} dict into a two-column DataFrame and
# rank it from highest to lowest F1 score.
list_f1_score_knnscratch = (
    pd.DataFrame(list_f1_score_knnscratch.items(), columns=["Dataset", "F1 Score"])
    .sort_values(by="F1 Score", ascending=False)
)

In [79]:
# Display the F1-score table of the from-scratch KNN (sorted in the previous cell).
list_f1_score_knnscratch

Unnamed: 0,Dataset,F1 Score
0,df_encoded_minmaxScaled,0.845455
5,df_DF_deleteOutlier_encoded_minmaxScaled,0.838235
3,df_DF_encoded_minmaxScaled,0.827907
2,df_deleteOutlier_encoded_minmaxScaled,0.827068
4,df_DF_modifiedOutlier_encoded_minmaxScaled,0.816327
1,df_modifiedOutlier_encoded_minmaxScaled,0.808511


# Eksperimen

In [83]:
# Feature/label split of the "df_encoded_minmaxScaled" dataset, used by the
# experiments below.
X_train, y_train = (
    data_train["df_encoded_minmaxScaled"].drop(columns=["HeartDisease"]),
    data_train["df_encoded_minmaxScaled"]["HeartDisease"],
)
X_test, y_test = (
    data_test["df_encoded_minmaxScaled"].drop(columns=["HeartDisease"]),
    data_test["df_encoded_minmaxScaled"]["HeartDisease"],
)

In [None]:
# Effect of the number of neighbours K on test accuracy
k_range = list(range(3, 30, 2))

scores = []
for k in k_range:
    knn = KNN(k=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

# Plot accuracy against K. x and y must be passed by keyword: seaborn >= 0.12
# only accepts `data` positionally and raises TypeError otherwise.
ax = sns.lineplot(x=k_range, y=scores)
ax.set(xlabel="K (number of neighbours)", ylabel="Accuracy");

In [None]:
# Effect of the Minkowski order p on test accuracy (K fixed at 5)
p_list = [1, 2, 3, 4, 5]

scores = []
for p in p_list:
    knn = KNN(k=5, p=p)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

# Plot accuracy against p. x and y must be passed by keyword: seaborn >= 0.12
# only accepts `data` positionally and raises TypeError otherwise.
ax = sns.lineplot(x=p_list, y=scores)
ax.set(xlabel="p (Minkowski order)", ylabel="Accuracy");