In [6]:
import pandas as pd
import numpy as np
import joblib
import urllib.request
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# URL of the processed Cleveland dataset (most commonly used)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Save as CSV file
output_file = "Data/heart_disease_uci.csv"

# Check if file exists
if not os.path.exists(output_file):
    print("File not found. Downloading...")
    urllib.request.urlretrieve(url, output_file)
    print("Download complete! File saved as heart_disease_uci.csv")
else:
    print("File heart_disease_uci.csv already present")



# ================================
# Load Dataset
# ================================

# Column names as per UCI documentation
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

data = pd.read_csv("Data/heart_disease_uci.csv", names=columns)

data.head()

# Remove missing values
data.dropna(inplace=True)

# Binary classification
data["target"] = (data["target"] > 0).astype(int)

# One-hot encode categorical features
data = pd.get_dummies(data, drop_first=True)

X = data.drop("target", axis=1)
y = data["target"]

# ================================
# Train-Test Split
# ================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Save raw test dataset BEFORE scaling
X_test_raw = pd.DataFrame(X_test, columns=X.columns)
X_test_raw["target"] = y_test.values

X_test_raw.to_csv("heart_test_final.csv", index=False)

# ================================
# Feature Scaling
# ================================
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ================================
# Evaluation Function
# ================================
def evaluate(model):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    return [
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred)
    ]

# ================================
# Models
# ================================
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        eval_metric="logloss"
    )
}

# ================================
# Train & Evaluate
# ================================
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    results[name] = evaluate(model)

# ================================
# Results Table
# ================================
results_df = pd.DataFrame(
    results,
    index=["Accuracy", "AUC", "Precision", "Recall", "F1 Score", "MCC"]
).T

print(results_df)


# ================================
# Save scaler (VERY IMPORTANT)
# ================================
joblib.dump(scaler, "Model/scaler.pkl")

# ================================
# Save each trained model
# ================================a
for name, model in models.items():
    # Create a safe filename
    file_name = name.lower().replace(" ", "_") + ".pkl"
    path = os.path.join("Model", file_name)
    
    joblib.dump(model, path)
    #print(f"Saved: {path}")

File heart_disease_uci.csv already present
                     Accuracy       AUC  Precision    Recall  F1 Score  \
Logistic Regression  0.819672  0.928571   0.742857  0.928571  0.825397   
Decision Tree        0.704918  0.708333   0.656250  0.750000  0.700000   
KNN                  0.852459  0.916667   0.771429  0.964286  0.857143   
Naive Bayes          0.540984  0.813853   0.500000  0.214286  0.300000   
Random Forest        0.868852  0.938312   0.812500  0.928571  0.866667   
XGBoost              0.819672  0.904762   0.774194  0.857143  0.813559   

                          MCC  
Logistic Regression  0.660870  
Decision Tree        0.415768  
KNN                  0.727393  
Naive Bayes          0.040700  
Random Forest        0.745142  
XGBoost              0.642938  
