In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



# Load the pre-split feature matrices and target vectors.
# squeeze("columns") converts each single-column target frame into a Series.
# Pinning the axis keeps the result a Series even when a file holds only one
# row; a bare squeeze() would collapse that case to a scalar.
X_train = pd.read_csv("../Data/Processed/X_train.csv")
X_test  = pd.read_csv("../Data/Processed/X_test.csv")
y_train = pd.read_csv("../Data/Processed/y_train.csv").squeeze("columns")
y_test  = pd.read_csv("../Data/Processed/y_test.csv").squeeze("columns")

# Fail early, with a readable message, if features and targets are misaligned.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)
print("\nClass distribution in y_train:")
print(y_train.value_counts(normalize=True))



# evaluation function

def evaluate_model(model, X_train, y_train, X_test, y_test, name="Model"):
    """Train ``model`` and print its performance on the test data.

    The estimator is fitted on ``(X_train, y_train)`` and then scored on
    ``(X_test, y_test)``. Printed, in order: the confusion matrix, accuracy,
    precision, recall, F1, ROC-AUC (only when the estimator exposes
    ``predict_proba``) and the per-class classification report.

    Parameters
    ----------
    model : estimator providing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train : data handed to ``model.fit``.
    X_test, y_test : data used for every reported metric.
    name : label shown in the header line.

    Returns
    -------
    None. All results are written to stdout.
    """
    print(f"Evaluating: {name}")

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # ROC-AUC is computed from column 1 of predict_proba and skipped when the
    # estimator has no such method.
    roc_auc = None
    if hasattr(model, "predict_proba"):
        positive_scores = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, positive_scores)

    matrix = confusion_matrix(y_test, predictions)
    print("Confusion matrix:")
    print(matrix)
    print()

    # Compute all headline metrics first, then print them together.
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, zero_division=0)
    recall = recall_score(y_test, predictions, zero_division=0)
    f1 = f1_score(y_test, predictions, zero_division=0)

    print(
        f"Accuracy : {accuracy:.3f}",
        f"Precision: {precision:.3f}",
        f"Recall   : {recall:.3f}",
        f"F1-score : {f1:.3f}",
        sep="\n",
    )
    if roc_auc is not None:
        print(f"ROC-AUC  : {roc_auc:.3f}")
    print()

    # Per-class breakdown
    print("Classification report:")
    print(classification_report(y_test, predictions, digits=3))
    print("\n\n")



# Baseline 1: StandardScaler -> SMOTE -> 5-nearest-neighbours with the Euclidean
# metric and uniform (equal) neighbour weights.

knn_uniform = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    (
        "knn",
        KNeighborsClassifier(n_neighbors=5, weights="uniform", metric="euclidean"),
    ),
])

evaluate_model(
    model=knn_uniform,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    name="KNN + StandardScaler + SMOTE + Euclidean + UNIFORM weights (k=5)",
)



# Baseline 2: StandardScaler -> SMOTE -> 5-nearest-neighbours with the Euclidean
# metric and distance-based neighbour weights.

knn_distance = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    (
        "knn",
        KNeighborsClassifier(n_neighbors=5, weights="distance", metric="euclidean"),
    ),
])

evaluate_model(
    model=knn_distance,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    name="KNN + StandardScaler +  SMOTE + Euclidean + DISTANCE weights (k=5)",
)



# Hyper-parameter search for the Euclidean KNN pipeline: every combination of k
# and weighting scheme is cross-validated and scored on precision, recall and F1.

knn_pipe = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("knn", KNeighborsClassifier(metric="euclidean")),
])

# Candidate neighbourhood sizes and weighting schemes
param_grid = {"knn__n_neighbors": [3, 5, 7, 9, 11], "knn__weights": ["uniform", "distance"]}

# Five shuffled, stratified folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics tracked for every candidate
scoring = {"precision": "precision", "recall": "recall", "f1": "f1"}

# refit="f1": the candidate with the best mean F1 is refitted and exposed as
# best_estimator_ / best_params_ / best_index_.
grid = GridSearchCV(
    knn_pipe,
    param_grid,
    scoring=scoring,
    refit="f1",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

print("Running grid search for KNN now")
grid.fit(X_train, y_train)

best_params, best_index = grid.best_params_, grid.best_index_

# Mean cross-validated scores of the selected candidate
best_f1 = grid.cv_results_["mean_test_f1"][best_index]
best_precision = grid.cv_results_["mean_test_precision"][best_index]
best_recall = grid.cv_results_["mean_test_recall"][best_index]

print("GRID SEARCH RESULTS ")
print("Best Parameters:", best_params)
print(
    f"Best CV F1 score       : {best_f1:.3f}",
    f"Best CV Precision score: {best_precision:.3f}",
    f"Best CV Recall score   : {best_recall:.3f}",
    sep="\n",
)

best_knn = grid.best_estimator_



# Test-set evaluation of the pipeline selected by the grid search
# (evaluate_model fits it on the training data again before scoring).

evaluate_model(
    model=best_knn,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    name=(
        f"Best KNN from GridSearch (k={best_params['knn__n_neighbors']}, "
        f"weights={best_params['knn__weights']})"
    ),
)

Train shape: (9864, 58)
Test shape : (2466, 58)

Class distribution in y_train:
Revenue
False    0.845296
True     0.154704
Name: proportion, dtype: float64
Evaluating: KNN + StandardScaler + SMOTE + Euclidean + UNIFORM weights (k=5)
Confusion matrix:
[[1377  707]
 [ 197  185]]

Accuracy : 0.633
Precision: 0.207
Recall   : 0.484
F1-score : 0.290
ROC-AUC  : 0.613

Classification report:
              precision    recall  f1-score   support

       False      0.875     0.661     0.753      2084
        True      0.207     0.484     0.290       382

    accuracy                          0.633      2466
   macro avg      0.541     0.573     0.522      2466
weighted avg      0.771     0.633     0.681      2466




Evaluating: KNN + StandardScaler +  SMOTE + Euclidean + DISTANCE weights (k=5)
Confusion matrix:
[[1430  654]
 [ 213  169]]

Accuracy : 0.648
Precision: 0.205
Recall   : 0.442
F1-score : 0.280
ROC-AUC  : 0.612

Classification report:
              precision    recall  f1-score   s

In [3]:

# Evaluation function 

def evaluate_model(model, X_train, y_train, X_test, y_test, name="Model"):
    """Fit model, print confusion matrix, accuracy, precision, recall, F1, ROC-AUC,
    and full classification report.

    NOTE(review): this re-declares the ``evaluate_model`` already defined in the
    first cell and rebinds the name. Both versions print the same output, so one
    of the two definitions should eventually be removed.

    Parameters
    ----------
    model : estimator providing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train : data handed to ``model.fit``.
    X_test, y_test : data used for every reported metric.
    name : label shown in the header line.

    Returns
    -------
    None. All results are written to stdout.
    """
    print(f"Evaluating: {name}")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Probabilities for ROC-AUC (column 1 of predict_proba). The result is named
    # auc_score, as in the first definition, so it does not share a name with
    # sklearn.metrics.auc, which a later cell imports.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_proba)
    else:
        auc_score = None

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:")
    print(cm)
    print()

    # Basic metrics (zero_division=0 is passed to precision, recall and F1)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Accuracy : {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall   : {rec:.3f}")
    print(f"F1-score : {f1:.3f}")
    if auc_score is not None:
        print(f"ROC-AUC  : {auc_score:.3f}")
    print()

    # Full classification report (per-class metrics)
    print("Classification report:")
    print(classification_report(y_test, y_pred, digits=3))
    print("\n\n")



# Manhattan baseline 1: StandardScaler -> SMOTE -> 5-nearest-neighbours with the
# Manhattan metric and uniform (equal) neighbour weights.

knn_manhattan_uniform = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    (
        "knn",
        KNeighborsClassifier(n_neighbors=5, weights="uniform", metric="manhattan"),
    ),
])

evaluate_model(
    model=knn_manhattan_uniform,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    name="KNN + SMOTE + Manhattan + UNIFORM weights (k=5)",
)



# Manhattan baseline 2: StandardScaler -> SMOTE -> 5-nearest-neighbours with the
# Manhattan metric and distance-based neighbour weights.

knn_manhattan_distance = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    (
        "knn",
        KNeighborsClassifier(n_neighbors=5, weights="distance", metric="manhattan"),
    ),
])

evaluate_model(
    model=knn_manhattan_distance,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    name="KNN + SMOTE + Manhattan + DISTANCE weights (k=5)",
)



# Hyper-parameter search for the Manhattan KNN pipeline: every combination of k
# and weighting scheme is cross-validated and scored on precision, recall and F1.

knn_pipe_manhattan = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("knn", KNeighborsClassifier(metric="manhattan")),
])

# Candidate neighbourhood sizes and weighting schemes
param_grid = {"knn__n_neighbors": [3, 5, 7, 9, 11], "knn__weights": ["uniform", "distance"]}

# Five shuffled, stratified folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics tracked for every candidate
scoring = {"precision": "precision", "recall": "recall", "f1": "f1"}

# refit="recall": the candidate with the best mean recall is refitted and
# exposed as best_estimator_ / best_params_ / best_index_.
grid_manhattan = GridSearchCV(
    knn_pipe_manhattan,
    param_grid,
    scoring=scoring,
    refit="recall",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

print("Running Manhattan KNN grid search now")
grid_manhattan.fit(X_train, y_train)

best_params, best_idx = grid_manhattan.best_params_, grid_manhattan.best_index_

# Mean cross-validated scores of the selected candidate
best_f1 = grid_manhattan.cv_results_["mean_test_f1"][best_idx]
best_prec = grid_manhattan.cv_results_["mean_test_precision"][best_idx]
best_recall = grid_manhattan.cv_results_["mean_test_recall"][best_idx]

print("GRID SEARCH RESULTS (Manhattan)")
print("Best Parameters:", best_params)
print(
    f"Best CV Recall        : {best_recall:.3f}",
    f"Best CV Precision     : {best_prec:.3f}",
    f"Best CV F1            : {best_f1:.3f}",
    sep="\n",
)

best_knn_manhattan = grid_manhattan.best_estimator_



# Test-set evaluation of the pipeline selected by the Manhattan grid search
# (evaluate_model fits it on the training data again before scoring).
evaluate_model(
    model=best_knn_manhattan,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    name=f"Best KNN Manhattan (k={best_params['knn__n_neighbors']}, weights={best_params['knn__weights']})",
)


Evaluating: KNN + SMOTE + Manhattan + UNIFORM weights (k=5)
Confusion matrix:
[[1477  607]
 [ 215  167]]

Accuracy : 0.667
Precision: 0.216
Recall   : 0.437
F1-score : 0.289
ROC-AUC  : 0.622

Classification report:
              precision    recall  f1-score   support

       False      0.873     0.709     0.782      2084
        True      0.216     0.437     0.289       382

    accuracy                          0.667      2466
   macro avg      0.544     0.573     0.536      2466
weighted avg      0.771     0.667     0.706      2466




Evaluating: KNN + SMOTE + Manhattan + DISTANCE weights (k=5)
Confusion matrix:
[[1520  564]
 [ 231  151]]

Accuracy : 0.678
Precision: 0.211
Recall   : 0.395
F1-score : 0.275
ROC-AUC  : 0.619

Classification report:
              precision    recall  f1-score   support

       False      0.868     0.729     0.793      2084
        True      0.211     0.395     0.275       382

    accuracy                          0.678      2466
   macro avg      0.5

In [None]:
from sklearn.metrics import average_precision_score, precision_recall_curve, auc

# `best_knn` is the refitted winner of the Euclidean grid search (refit="f1").
# The Manhattan search keeps its winner separately in `best_knn_manhattan`.
if not hasattr(best_knn, "predict_proba"):
    raise AttributeError("Selected model 'best_knn' does not support predict_proba().")

# Column 1 of predict_proba is used as the ranking score for the PR curve.
knn_probs = best_knn.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, knn_probs)

# Trapezoidal area under the PR curve. It interpolates linearly between the
# curve's points, which scikit-learn documents as potentially too optimistic.
pr_auc_knn = auc(recall, precision)

# Average precision summarises the same curve without interpolation, so it is
# reported next to the trapezoidal value.
ap_knn = average_precision_score(y_test, knn_probs)

print(f"KNN PR-AUC: {pr_auc_knn:.3f}")
print(f"KNN Average Precision: {ap_knn:.3f}")

KNN PR-AUC: 0.251
