In [325]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from collections import Counter


## PREPROCESSING

In [445]:
# Category features
cont_cols  = ['age', 'chol', 'oldpeak', 'thalch', 'trestbps']
cat_cols = ['ca', 'cp', 'restecg', 'slope', 'thal', 'sex', 'fbs', 'exang']

#read training data, normalize continous features using (X - mean) / std 
df = pd.read_csv('heart_disease_uci.csv')

# Create a new dataframe from original and change all target variables greater than 0 to 1, instead of 1,2,3,4,5
df_fix = df
df_fix['num'] = np.where(df_fix['num'] > 0, 1, 0)

# Seperate features (X) and target (y), removing id, num and dataset from X
X = df_fix.drop(columns=['id', 'num', 'dataset'])
y = df_fix['num']

# Replace missing data in categorical columns with unknown
for cols in cat_cols:
    X[cols] = X[cols].fillna('Unknown')

# One-hot encoding for categorical columns
X = pd.get_dummies(X, columns=cat_cols)

# Convert continuous column values to float
X[cont_cols] = X.loc[:, cont_cols].astype('float64')

# find the mean and standard deviation of each continuous column
train_means = X[cont_cols].mean()
train_std = X[cont_cols].std()

# replace any missing value in a continuous column with the mean of that column
X.loc[:, cont_cols] = X[cont_cols].fillna(train_means)

# Normalize data
X.loc[:, cont_cols] = (X.loc[:, cont_cols] - train_means) / train_std


## KNN CLASSIFIER


In [439]:
# shuffling data before splitting for randomized train, valid, test sets
shuffled = df_fix.sample(frac=1, random_state=42).reset_index(drop=True)

# separate data into training, validation, and testing sets
train_split = int(0.6 * len(X))
test_split = int(train_split + (0.2 * len(X)))
X_train = X[:train_split]
X_valid = X[train_split:test_split]
X_test = X[test_split:]

y_train = y[:train_split]
y_valid = y[train_split:test_split]
y_test = y[test_split:]

# convert to numpy arrays to more easily apply k-NN formulas
X_train = X_train.values
X_valid = X_valid.values
X_test = X_test.values

y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values


In [440]:
# Calculate the L2 distance between the current row and a neighbor
def l2_distance(x, X_train):
    return np.sum((X_train - x)**2, axis=1)
    
# get the K closest neighbors
def knn_predict(x, X_train, y_train, k):
    distances = l2_distance(x, X_train)
    idx = np.argpartition(distances, k)[:k]  # indices of k nearest
    neighbor_labels = y_train[idx]
    counts = Counter(neighbor_labels.tolist()).most_common(1)  
    return counts[0][0]  # majority vote

# loop over all rows
def knn_predict_batch(X_test, X_train, y_train, k):
    preds = [knn_predict(x, X_train, y_train, k) for x in X_test]
    return np.array(preds)


In [441]:
def printConfusionMatrix(tp, fp, tn, fn):
    print("\n%15sActual" % "")
    print("%6s %7s %7s" % ("", "1", "0"))
    print("P%6s +--------+--------+" % "")
    print("r%6s | %-6s | %-6s |" % ("1", 'TP='+str(tp), 'FP='+str(fp)))
    print("e%6s +--------+--------+" % "")
    print("d%6s | %-6s | %-6s |" % ("0", 'FN='+str(fn), 'TN='+str(tn)))
    print(".%6s +--------+--------+\n" % "")

def getConfusionMatrix(y_true, y_pred):

    tp = ((y_pred == 1) & (y_true == 1)).sum()
    tn = ((y_pred == 0) & (y_true == 0)).sum()
    fp = ((y_pred == 1) & (y_true == 0)).sum()
    fn = ((y_pred == 0) & (y_true == 1)).sum()
    
    return tp, fp, tn, fn

def getAccuracy(tp, fp, tn, fn):
    return (tp + tn) / (tp + tn + fp + fn)

def getPrecision(tp, fp, tn, fn):
    return tp / (tp + fp) if (tp + fp) > 0 else 0

def getRecall(tp, fp, tn, fn):
    return tp / (tp + fn) if (tp + fn) > 0 else 0

def getFScore(tp, fp, tn, fn, beta=1):
    precision = getPrecision(tp, fp, tn, fn)
    recall = getRecall(tp, fp, tn, fn)
    if (precision + recall) == 0:
        return 0
    return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall) 


In [442]:
# fine-tuning k value on validation set
best_k, best_score = None, -1

for k in range(1, 25, 2):
    preds = knn_predict_batch(X_valid, X_train, y_train, k)
    acc = (preds == y_valid).mean()
    
    if acc > best_score:
        best_score, best_k = acc, k

print('Best k:', best_k)
print('Validation Accuracy:', best_score)
    

Best k: 13
Validation Accuracy: 0.5706521739130435


In [443]:
# report final performance

tp, fp, tn, fn = getConfusionMatrix(y_test, knn_predict_batch(X_test, X_train, y_train, best_k))
printConfusionMatrix(tp, fp, tn, fn)
                
print('Accuracy:  %8.5f' % getAccuracy(tp, fp, tn, fn))
print('Precison:  %8.5f' % getPrecision(tp, fp, tn, fn))
print('Recall:    %8.5f' % getRecall(tp, fp, tn, fn))
print('F1 Score:  %8.5f' % getFScore(tp, fp, tn, fn))  


               Actual
             1       0
P       +--------+--------+
r     1 | TP=83  | FP=9   |
e       +--------+--------+
d     0 | FN=56  | TN=36  |
.       +--------+--------+

Accuracy:   0.64674
Precison:   0.90217
Recall:     0.59712
F1 Score:   0.71861
