In [44]:
# Imports
import numpy as np
import pandas as pd
from knn_model import k_nearest_neighbor_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [45]:
# Load the raw records from the CSV file into a DataFrame.
data = pd.read_csv('data/medical.csv')

# Derive the binary target: RISK is 1 where BEDAYNTM >= 1.00 and 0 otherwise.
# The comparison is done on the whole column at once; rows for which it is
# False (which includes missing values, since NaN >= 1.00 is False) get 0.
is_at_risk = data['BEDAYNTM'] >= 1.00
data['RISK'] = is_at_risk.astype('int64')

# Show the first rows as a quick sanity check of the new column.
print(data.head())

   ID  TUOI  BMI   HA  GLUCOSE  CHOLESTEROL  BEDAYNTM  RISK
0   1    56   21  160     14.0         6.00      1.95     1
1   2    76   18  150     12.0         4.97      1.33     1
2   3    63   16  160      4.4         6.39      0.83     0
3   4    78   20  100      4.0         7.00      2.00     1
4   5    87   20  110      4.6         4.10      1.30     1


In [46]:
# Predictor matrix (five numeric columns) and the binary RISK target, both
# converted to plain NumPy arrays.
X = data.loc[:, ['TUOI', 'BMI', 'HA', 'GLUCOSE', 'CHOLESTEROL']].to_numpy()
y = data['RISK'].to_numpy()

# Hold out 20 % of the rows for testing and train on the remaining 80 %.
# The fixed random_state makes the shuffle, and therefore the split,
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Search over the neighbourhood size k (1..20): predict the test rows with each
# k, score the predictions, and remember the metrics of the most accurate k.
#
# NOTE(review): k is selected by scoring on X_test / y_test, the same hold-out
# set whose metrics are reported below, so the printed numbers are
# optimistically biased. Consider choosing k on a validation split (or with
# cross-validation over X_train) and scoring the test set only once.
k_values = list(range(1, 21))

# Running record of the best candidate seen so far.
best_k = None
best_accuracy = 0.0
best_precision = 0.0
best_recall = 0.0
best_conf_matrix = None

for k in k_values:
    # Predicted labels for the test rows using the k nearest training rows.
    y_pred = k_nearest_neighbor_predict(X_train, y_train, X_test, k=k)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Always accept the first candidate so that a best model (and its
    # confusion matrix) is recorded even if no k scores above 0.0 accuracy.
    # The strict '>' afterwards keeps the smallest k when accuracies tie.
    if best_k is None or accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy
        best_precision = precision
        best_recall = recall
        best_conf_matrix = conf_matrix

# Print the best-performing model
print("\nBest-Performing K-NN Model:")
print(f"k = {best_k}")
print("Accuracy:", best_accuracy)
print("Precision:", best_precision)
print("Recall:", best_recall)
print("Confusion Matrix:\n", best_conf_matrix)


Best-Performing K-NN Model:
k = 13
Accuracy: 0.8
Precision: 0.7777777777777778
Recall: 0.7777777777777778
Confusion Matrix:
 [[9 2]
 [2 7]]
