In [None]:
# --- Data Handling ---
import pandas as pd
import numpy as np
import os
import joblib

# --- Machine Learning Models ---
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# --- Model Evaluation ---
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

# --- Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set_style('whitegrid')

print("Libraries imported successfully!")

# --- Load the processed train/test splits ---
# Directory holding the joblib artifacts, relative to the notebook's location.
PROCESSED_DATA_DIR = os.path.join("..", "data", "heart_disease", "processed")


def _load_split(filename):
    """Load a single joblib artifact stored under PROCESSED_DATA_DIR."""
    return joblib.load(os.path.join(PROCESSED_DATA_DIR, filename))


# Features first, then targets -- same load order as the files are listed.
X_train = _load_split("X_train.joblib")
X_test = _load_split("X_test.joblib")
y_train = _load_split("y_train.joblib")
y_test = _load_split("y_test.joblib")

print("\nData loaded successfully!")
print(f"X_train shape: {X_train.shape}")

In [None]:
# Baseline classifier: logistic regression fitted on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out test split.
y_pred_log_reg = log_reg.predict(X_test)

# Text summary of the test-set predictions.
print("--- Logistic Regression Results ---")
print(classification_report(y_test, y_pred_log_reg))

# Confusion matrix rendered as an annotated heatmap; fmt='d' prints the
# cell counts as integers.
cm = confusion_matrix(y_test, y_pred_log_reg)
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_ax.set_title('Logistic Regression Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [None]:
def _fit_predict_report(model, header):
    """Train `model`, print its test-set classification report, return predictions.

    The model is fitted in place on (X_train, y_train); `header` is printed
    verbatim just before the report for (y_test, predictions).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(header)
    print(classification_report(y_test, predictions))
    return predictions


# --- Random Forest ---
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf = _fit_predict_report(rf_clf, "\n--- Random Forest Results ---")

# --- XGBoost ---
# `use_label_encoder=False` is deliberately not passed: it is deprecated.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
y_pred_xgb = _fit_predict_report(xgb_clf, "\n--- XGBoost Results ---")

In [None]:
# Column 1 of predict_proba for each fitted model, evaluated on the test split
# (presumably the positive-class probability -- confirm the label encoding).
y_prob_log_reg, y_prob_rf, y_prob_xgb = (
    clf.predict_proba(X_test)[:, 1] for clf in (log_reg, rf_clf, xgb_clf)
)

# ROC curve points per model; the thresholds array (third value) is unused.
fpr_log_reg, tpr_log_reg, _ = roc_curve(y_test, y_prob_log_reg)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)

# Area under each ROC curve.
auc_log_reg, auc_rf, auc_xgb = (
    roc_auc_score(y_test, scores)
    for scores in (y_prob_log_reg, y_prob_rf, y_prob_xgb)
)

# Draw all three curves on one set of axes; XGBoost gets a thicker line.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
roc_ax.plot(fpr_log_reg, tpr_log_reg, label=f'Logistic Regression (AUC = {auc_log_reg:.3f})')
roc_ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.3f})')
roc_ax.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {auc_xgb:.3f})', linewidth=3)

# Dashed diagonal: the reference line for a classifier with no skill.
roc_ax.plot([0, 1], [0, 1], 'k--', label='No Skill (AUC = 0.5)')

roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve Comparison')
roc_ax.legend()
plt.show()

# Repeat the AUC values as plain text below the figure.
print(f"Logistic Regression AUC: {auc_log_reg:.3f}")
print(f"Random Forest AUC: {auc_rf:.3f}")
print(f"XGBoost AUC: {auc_xgb:.3f}")

In [None]:
# --- Step 5: Choose and Save the Champion Model ---

# Based on our results, Random Forest is the champion model.
# NOTE(review): this choice is hard-coded; re-check it against the AUC values
# printed by the ROC comparison cell whenever the data or models change.
best_model = rf_clf

# Destination for the serialized model, relative to the notebook's location.
MODEL_PATH = os.path.join("..", "models", "heart_disease", "best_heart_disease_classifier.joblib")

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there). Without this
# the cell raises FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Persist the fitted estimator to disk.
joblib.dump(best_model, MODEL_PATH)

print(f"\nBest model ({type(best_model).__name__}) saved successfully to: {MODEL_PATH}")