In [None]:
# ===============================================
# Import Libraries
# ===============================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib # Import joblib for model saving

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# ===============================================
# Load Data
# ===============================================
# Read the CSV from the current working directory, then preview the first
# five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer="heart.csv")
df.head(n=5)


In [None]:
# ===============================================
# 3. Basic Exploration
# ===============================================
# A notebook cell only renders its LAST expression. This cell ends with an
# assignment, so every summary is printed explicitly; otherwise the results of
# describe() and isnull().sum() are computed and silently thrown away.
print(df.shape)
df.info()  # info() writes its report to stdout on its own
print(df.describe())
print(df.isnull().sum())


# Data cleaning: remove duplicate rows to prevent biased results.
# Rebinding the name (instead of inplace=True) keeps the step chainable and
# avoids mutating the frame object that earlier cells displayed.
df = df.drop_duplicates()

# Separate target from features
X = df.drop('target', axis=1)
y = df['target']




In [None]:
# ===============================================
# 4. Train–Test Split
# ===============================================

# Split the RAW (unscaled) features: 80% train, 20% test.
# No scaling happens in this cell on purpose. Fitting a StandardScaler on the
# full dataset before splitting leaks test-set statistics into training, and
# any scaler fitted afterwards would be learning from already-standardized
# values, which makes it unusable on raw inputs at inference time. The scaler
# must be fitted on the training split only, after this split.
# stratify=y keeps the target class proportions equal in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create the 'model' output folder if it does not exist yet
os.makedirs('model', exist_ok=True)


In [None]:
# ===============================================
# 5. Feature Scaling
# ===============================================
# Learn the scaling parameters from the training split only, then apply that
# same fitted transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# ===============================================
# 6. Define Models
# ===============================================

# Display name -> unfitted estimator. Every estimator that accepts a seed gets
# random_state=42 so a fresh "Restart & Run All" reproduces the same results.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    # Gradient-boosted trees through xgboost's scikit-learn wrapper
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
}


In [None]:

# Make sure the output folder exists so this cell does not silently depend on
# an earlier cell having created it.
os.makedirs("model", exist_ok=True)

# Save the fitted scaler into the model folder
joblib.dump(scaler, os.path.join("model", "scaler.pkl"))
print("Saved: model/scaler.pkl")

# Storage for metrics: one dict per evaluated model
metrics_list = []

print("--- DETAILED MODEL REPORTS ---")
for name, model in models.items():
    # File-system friendly identifier, e.g. "Random Forest" -> "random_forest".
    # Computed once and reused for both the model file and the plot file.
    slug = name.lower().replace(' ', '_')

    # Train
    model.fit(X_train_scaled, y_train)

    # Predict hard labels, plus a continuous score for AUC. Column 1 of
    # predict_proba is the probability of the second class (assumed to be the
    # positive label); estimators without predict_proba fall back to labels.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, "predict_proba") else y_pred

    # Save Model
    joblib.dump(model, os.path.join("model", f"{slug}.pkl"))

    # Print Classification Report
    print(f"\n{'='*40}\nMODEL: {name}\n{'='*40}")
    print("CLASSIFICATION REPORT:")
    print(classification_report(y_test, y_pred))

    # Plot and save the confusion matrix on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(5, 4))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f"Confusion Matrix: {name}")
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    fig.tight_layout()
    fig.savefig(f"cm_{slug}.png")  # Saves the plot to the working directory
    plt.show()
    plt.close(fig)  # release the figure so open figures do not pile up in the loop

    # Store Metrics
    metrics_list.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

In [None]:
# Comparison table: one row per model, ordered from highest to lowest accuracy
results_df = pd.DataFrame(metrics_list)
results_df = results_df.sort_values('Accuracy', ascending=False)

# Banner followed by the table rendered as plain text without the index column
print("\n" + "="*80, "FINAL PERFORMANCE COMPARISON TABLE", "="*80, sep="\n")
print(results_df.to_string(index=False))


# Heatmap of all metrics: models on the rows, metrics on the columns
plot_df = results_df.set_index('Model')
plt.figure(figsize=(12, 6))
sns.heatmap(data=plot_df, cmap="Blues", annot=True, fmt=".3f")
plt.title("Model Performance Comparison (All Metrics)")
plt.show()