In [None]:
import os
import sys
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)
sys.path.append("../src")

from preprocessing import load_unsw_data, preprocess_data

In [None]:
# Load the raw data from ../data/ and run it through preprocess_data, which
# returns four values unpacked here as the train/test inputs and targets.
df = load_unsw_data("../data/")
splits = preprocess_data(df)
X_train, X_test, y_train, y_test = splits
print(f"[✔] Loaded dataset. Test shape: {X_test.shape}")

In [None]:
# Directory holding the serialized models, and the file expected for each
# display name used in the reports below.
models_dir = "../outputs/models/"
model_files = {
    "Random Forest": "random_forest.pkl",
    "Logistic Regression": "logistic_regression.pkl",
    "XGBoost": "xgboost.pkl",
    "LightGBM": "lightgbm.pkl",
}

# Load every model whose file is present. A missing file is reported and
# skipped, so later cells work with whatever subset was found.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
models = {}
for name, file in model_files.items():
    model_path = os.path.join(models_dir, file)
    if not os.path.exists(model_path):
        print(f"[⚠] {file} not found. Skipping...")
        continue
    models[name] = joblib.load(model_path)
    print(f"[✔] Loaded {name}")

In [None]:
# Score every loaded model on X_test / y_test and collect one summary row per
# model. Precision, recall and F1 are computed with average="weighted".
metric_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-score"]
results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted")
    rec = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, digits=4))

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1
    })

# Explicit columns keep the frame well-formed when no model was loaded: an
# empty frame built without them has no "F1-score" column, so sort_values
# would raise KeyError. Chaining reset_index avoids in-place mutation, so
# re-running the cell always rebuilds results_df from scratch.
results_df = (
    pd.DataFrame(results, columns=metric_columns)
    .sort_values(by="F1-score", ascending=False)
    .reset_index(drop=True)
)
print("\n=== Summary ===")
display(results_df)

In [None]:
# Draw one bar chart per summary metric from results_df and save each one
# under ../outputs/figures/.
figures_dir = "../outputs/figures/"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(figures_dir, exist_ok=True)

# (column in results_df, output file name) for each comparison chart.
comparison_charts = [
    ("Accuracy", "model_accuracy_comparison.png"),
    ("F1-score", "model_f1_comparison.png"),
]

for metric, filename in comparison_charts:
    plt.figure(figsize=(9,5))
    sns.barplot(x="Model", y=metric, data=results_df, hue="Model", dodge=False)
    plt.title(f"Model Comparison by {metric}")
    # The y-axis starts at 0.5 to magnify differences between models; a bar
    # for a score below 0.5 will not be visible.
    plt.ylim(0.5, 1.0)
    plt.tight_layout()
    plt.savefig(os.path.join(figures_dir, filename), dpi=300)
    plt.show()

In [None]:
# Plot and save a confusion-matrix heatmap for every loaded model.
figures_dir = "../outputs/figures/"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(figures_dir, exist_ok=True)

for name, model in models.items():
    y_pred = model.predict(X_test)

    # Sorted union of the classes present in the targets and the predictions.
    # Passing it to both confusion_matrix and heatmap keeps rows/columns and
    # tick labels aligned, so the axes show class labels rather than
    # positional indices.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(f"{name} - Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    fig.savefig(
        os.path.join(figures_dir, f"{name.replace(' ', '_')}_confusion_matrix.png"),
        dpi=300,
    )
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(fig)

In [None]:
# Report the top-ranked model. results_df is sorted by F1-score (descending)
# where it is built, so its first row is the best performer.
if results_df.empty:
    # Nothing was evaluated, so there is no first row; .iloc[0] would raise
    # IndexError.
    best_model = None
    print("[⚠] No evaluation results available.")
else:
    best_model = results_df.iloc[0]
    print(f"Best Model: {best_model['Model']}")
    print(f"Accuracy: {best_model['Accuracy']:.4f}")
    print(f"F1-score: {best_model['F1-score']:.4f}")