# 04 – Supervised Learning (Classification)

 1. Import Libraries

In [1]:
import os, numpy as np, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

2. Load Dataset

In [2]:
DATA_PATH = Path("C:/Users/youss/Desktop/Heart_Disease_Project/data/heart_disease.csv")
df = pd.read_csv(DATA_PATH)

# define target column
target = 'target' if 'target' in df.columns else df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

# create results folders
os.makedirs("../results", exist_ok=True)
os.makedirs("../models", exist_ok=True)

3. Preprocessing Pipelines

In [3]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

4. Define Models

In [4]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=400, random_state=42),
    "SVM": SVC(probability=True, kernel="rbf", C=1.0, gamma="scale", random_state=42)
}

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

5. Evaluation Function

In [5]:
def evaluate(pipe, name):
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe, "predict_proba") else None
    
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary'
    )
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else float("nan")
    
    # ROC curve plot
    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        plt.figure()
        plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.2f})")
        plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curve - {name}")
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.savefig(f"../results/roc_{name}.png")
        plt.close()
    
    return {"model": name, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}

6. Train & Evaluate Models

In [6]:
rows = []
for name, model in models.items():
    pipe = Pipeline([("preprocess", preprocessor), ("model", model)])
    rows.append(evaluate(pipe, name))

metrics = pd.DataFrame(rows).sort_values("f1", ascending=False)
print("\nModel Evaluation Metrics:\n", metrics)

# save metrics
metrics.to_csv("../results/supervised_metrics.csv", index=False)


Model Evaluation Metrics:
                 model  accuracy  precision    recall        f1       auc
2        RandomForest  1.000000   1.000000  1.000000  1.000000  1.000000
1        DecisionTree  0.985366   1.000000  0.971429  0.985507  0.985714
3                 SVM  0.926829   0.916667  0.942857  0.929577  0.977143
0  LogisticRegression  0.809756   0.761905  0.914286  0.831169  0.929810


7. Save Best Model

In [10]:
best_name = metrics.iloc[0]["model"]
best_model = models[best_name]

best_pipe = Pipeline([("preprocess", preprocessor), ("model", best_model)])
best_pipe.fit(X_train, y_train)

joblib.dump(best_pipe, "../models/final_model.pkl")
print(f"\n Saved best pipeline ({best_name}) to ../models/final_model.pkl")


 Saved best pipeline (RandomForest) to ../models/final_model.pkl
