In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [9]:
df = pd.read_csv("drug_200.csv")

# Swap each categorical predictor for integer codes; a fresh encoder is
# fitted per column, so `le` ends up holding the encoder of the last column.
for col in ('Sex', 'BP', 'Cholesterol'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Target vector: integer class ids derived from the 'Drug' column
y = LabelEncoder().fit_transform(df['Drug'])

# Feature matrix: every column except 'Drug', min-max scaled
# (MinMaxScaler's default output range is [0, 1]).
# Note: the scaler is fitted on the full dataset, before any train/test split.
X = df.drop(columns='Drug').values
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [10]:
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Tie-breaking is fully deterministic:
      * neighbours at equal distance are taken in training-set order
        (stable sort);
      * when several classes share the highest vote count, the smallest
        label (in ``np.unique`` sort order) wins.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote. If ``k`` exceeds the number of
        training samples, every training sample votes.

    Raises
    ------
    TypeError
        If ``k`` is not an integer.
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        if not isinstance(k, (int, np.integer)):
            raise TypeError(f"k must be an integer, got {type(k).__name__}")
        # k == 0 would leave no voters; a negative k would silently slice
        # "all but the last |k|" neighbours instead of the nearest ones.
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training set (KNN has no real training phase).

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y_train : array-like with n_samples entries
            Class labels aligned with the rows of ``X_train``.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, is empty, or its row count does not
            match ``y_train``.
        """
        # Float conversion keeps unsigned-integer features from wrapping
        # around in the subtraction done by predict(); asarray also turns
        # lists / pandas objects into positional arrays so that fancy
        # indexing with neighbour indices behaves as intended.
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)

        if X_train.ndim != 2:
            raise ValueError(
                f"X_train must be 2-D (n_samples, n_features), got ndim={X_train.ndim}"
            )
        if X_train.shape[0] == 0:
            raise ValueError("X_train must contain at least one sample")
        if y_train.shape[:1] != X_train.shape[:1]:
            raise ValueError(
                "X_train and y_train must have the same number of samples "
                f"({X_train.shape[0]} vs {y_train.shape[:1]})"
            )

        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every row of ``X_test`` by majority vote.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features)
            Query points, in the same feature space as the training data.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels, using the dtype of the training labels
            (also for an empty ``X_test``).
        """
        X_test = np.asarray(X_test, dtype=float)
        preds = []
        for x in X_test:
            # Euclidean distance from the query point to every training row
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            # Stable sort: equidistant neighbours keep training order, so
            # the selected neighbours do not depend on the sort implementation.
            k_idx = np.argsort(distances, kind="stable")[:self.k]
            # Majority vote; np.unique returns sorted labels, so argmax
            # resolves vote ties in favour of the smallest label.
            vals, counts = np.unique(self.y_train[k_idx], return_counts=True)
            preds.append(vals[np.argmax(counts)])
        return np.array(preds, dtype=self.y_train.dtype)


In [11]:
def evaluate_knn(k, n_splits=5, random_state=42, features=None, labels=None):
    """Cross-validate the ``KNN`` classifier and average the fold metrics.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNN``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int, default 42
        Seed for the fold shuffling, so repeated runs use identical folds.
    features : array-like of shape (n_samples, n_features), optional
        Feature matrix to evaluate on. Defaults to the notebook-level ``X``.
    labels : array-like of shape (n_samples,), optional
        Class labels. Defaults to the notebook-level ``y``.

    Returns
    -------
    dict
        Keys ``"k"``, ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``.
        The four metrics are means over the folds; precision, recall and F1
        are macro-averaged with ``zero_division=0``.
    """
    # Fall back to the notebook globals so existing `evaluate_knn(k)` calls
    # keep working, while allowing explicit data to be passed in.
    features = X if features is None else np.asarray(features)
    labels = y if labels is None else np.asarray(labels)

    # NOTE(review): the notebook-level X is min-max scaled on the full
    # dataset before splitting, so each test fold influences the scaling
    # seen at training time. Consider fitting the scaler on the training
    # fold only. Also consider stratified folds if the classes are
    # imbalanced -- TODO confirm the class distribution.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    macro = {"average": "macro", "zero_division": 0}
    fold_scores = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        model = KNN(k=k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        fold_scores["accuracy"].append(accuracy_score(y_test, y_pred))
        fold_scores["precision"].append(precision_score(y_test, y_pred, **macro))
        fold_scores["recall"].append(recall_score(y_test, y_pred, **macro))
        fold_scores["f1"].append(f1_score(y_test, y_pred, **macro))

    # "k" first, then the metrics in the same order as before
    return {"k": k, **{name: np.mean(vals) for name, vals in fold_scores.items()}}

In [12]:
# One metrics dict per neighbourhood size, in the order k = 1, 3, 5
results = list(map(evaluate_knn, (1, 3, 5)))

In [13]:
# Tabulate the per-k metric dicts (one row per k) and print them to 4 decimals
results_df = pd.DataFrame(results)
print("\nPerformance Comparison (5-Fold Average):", results_df.round(4), sep="\n")


Performance Comparison (5-Fold Average):
   k  accuracy  precision  recall      f1
0  1      0.87     0.8198  0.8821  0.8307
1  3      0.81     0.7899  0.8119  0.7807
2  5      0.75     0.7153  0.7731  0.7237
