## 1. Setup and Imports

In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)

import joblib
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# Settings
# Apply seaborn's "whitegrid" style to every figure drawn later in the notebook.
sns.set(style="whitegrid")
# Render matplotlib figures inline in the cell output (IPython magic; only valid
# inside a notebook/IPython session, not in a plain Python script).
%matplotlib inline

# Reaching this line means every import above resolved in the current kernel.
print("✅ All libraries imported successfully")

## 2. Configuration

In [None]:
# --- Filesystem layout -------------------------------------------------------
# Input dataset plus the three folders this notebook writes into.
DATA_PATH = "data/heart_cleaned.csv"
OUTPUT_DIR = "screenshots"
MODELS_DIR = "models"
RESULTS_DIR = "results"

# Make sure every output folder exists before any cell tries to write to it;
# exist_ok=True keeps this cell safe to re-run.
for output_folder in (OUTPUT_DIR, MODELS_DIR, RESULTS_DIR):
    os.makedirs(output_folder, exist_ok=True)

# --- Reproducibility ---------------------------------------------------------
# One seed shared by NumPy's global RNG and every estimator/splitter below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# --- Experiment tracking -----------------------------------------------------
# All runs started in this notebook are grouped under this MLflow experiment.
EXPERIMENT_NAME = "Heart-Disease-Classification-Notebook"
mlflow.set_experiment(EXPERIMENT_NAME)

# Echo where the runs will be recorded so the reader can find them later.
print(f"Experiment: {EXPERIMENT_NAME}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

## 3. Load Data

In [None]:
# Read the pre-cleaned dataset from the path set in the configuration cell.
df = pd.read_csv(DATA_PATH)

# Quick sanity summary: dimensions, column names, and label balance.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nClass distribution:")
class_counts = df['target'].value_counts()
print(class_counts)

# Leave the first rows as the cell's rich output.
df.head()

## 4. Feature Engineering

In [None]:
# The 'target' column is the label; every other column is a feature.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

In [None]:
# Feature scaling
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)

# Save scaler so the exact same transformation can be reapplied at inference time.
scaler_path = f"{MODELS_DIR}/scaler.pkl"
joblib.dump(scaler, scaler_path)

# Convert back to DataFrames.
# Passing index= keeps the original row labels: train_test_split shuffles rows,
# so y_train / y_test carry non-contiguous labels. Without index= the scaled
# frames would get a fresh 0..n-1 RangeIndex and any label-aligned pandas
# operation against y_train / y_test would silently pair the wrong rows.
X_train_scaled = pd.DataFrame(
    train_scaled_values, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    test_scaled_values, columns=X_test.columns, index=X_test.index
)

# Guard the invariant the downstream cells rely on.
assert X_train_scaled.index.equals(y_train.index), "train features/labels misaligned"
assert X_test_scaled.index.equals(y_test.index), "test features/labels misaligned"

print("✅ Feature scaling completed")

## 5. Model Training with MLflow - Logistic Regression

In [None]:
# Cross-validation strategy
# Stratified 5-fold with a fixed seed. `cv` is a notebook-level name that the
# Random Forest cell reuses, so it must stay defined here.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Train, tune, evaluate and log a Logistic Regression model as one MLflow run.
# Everything inside the `with` block is recorded against this run; the run is
# closed automatically when the block exits, including on error.
with mlflow.start_run(run_name="Logistic_Regression_Notebook") as run:
    print(f"MLflow Run ID: {run.info.run_id}\n")

    # Log run-level context in a single batch call.
    # NOTE: test_size duplicates the literal passed to train_test_split in the
    # feature-engineering cell; keep the two in sync.
    mlflow.log_params({
        "model_type": "Logistic Regression",
        "n_samples": len(df),
        "n_features": X.shape[1],
        "test_size": 0.2,
        "random_state": RANDOM_STATE,
        "scaler": "StandardScaler",
    })

    # Hyperparameter grid
    # liblinear is used because the grid searches both l1 and l2 penalties.
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
        'max_iter': [1000]
    }

    # Grid search: candidates are ranked by cross-validated ROC AUC.
    lr = LogisticRegression(random_state=RANDOM_STATE)
    grid = GridSearchCV(lr, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
    grid.fit(X_train_scaled, y_train)

    best_model = grid.best_estimator_

    # Log the winning hyperparameters and the cross-validated score that
    # selected them (previously computed but never recorded).
    mlflow.log_params({f"best_{name}": setting for name, setting in grid.best_params_.items()})
    mlflow.log_metric("cv_best_roc_auc", float(grid.best_score_))

    # Predictions on the held-out test split
    y_pred = best_model.predict(X_test_scaled)
    y_proba = best_model.predict_proba(X_test_scaled)[:, 1]  # probability of class 1

    # Test-set metrics
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }

    # Log and print each metric in one pass.
    print("\nResults:")
    for metric, value in metrics.items():
        mlflow.log_metric(metric, value)
        print(f"  {metric}: {value:.4f}")
    print(f"  cv_best_roc_auc: {grid.best_score_:.4f}")

    # Confusion matrix (saved before plt.show(), which finalizes the figure)
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Logistic Regression - Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    cm_path = f"{OUTPUT_DIR}/lr_confusion_matrix_nb.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.show()

    # ROC curve, with the chance diagonal for reference
    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.plot(fpr, tpr, label=f"ROC AUC = {metrics['roc_auc']:.3f}")
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Logistic Regression')
    plt.legend()
    roc_path = f"{OUTPUT_DIR}/lr_roc_curve_nb.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.show()

    # The model was fitted on standardized features, so it is only usable
    # together with the scaler saved in the feature-scaling cell. Attach that
    # file to the run so the run is self-contained.
    mlflow.log_artifact(scaler_path)

    # Log model with an input/output signature inferred from the training data.
    signature = infer_signature(X_train_scaled, best_model.predict(X_train_scaled))
    mlflow.sklearn.log_model(best_model, "model", signature=signature)

    print("\n✅ Logistic Regression logged to MLflow")

## 6. Model Training with MLflow - Random Forest

In [None]:
# Train, tune, evaluate and log a Random Forest model as one MLflow run.
# Relies on `cv` (the StratifiedKFold splitter) defined in the Logistic
# Regression cell and on `scaler_path` from the feature-scaling cell.
with mlflow.start_run(run_name="Random_Forest_Notebook") as run:
    print(f"MLflow Run ID: {run.info.run_id}\n")

    # Log run-level context in a single batch call.
    # NOTE: test_size duplicates the literal passed to train_test_split in the
    # feature-engineering cell; keep the two in sync.
    mlflow.log_params({
        "model_type": "Random Forest",
        "n_samples": len(df),
        "n_features": X.shape[1],
        "test_size": 0.2,
        "random_state": RANDOM_STATE,
        "scaler": "StandardScaler",
    })

    # Hyperparameter grid (3 * 4 * 3 * 3 * 2 = 216 candidates, each fitted once per CV fold)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }

    # Grid search: candidates are ranked by cross-validated ROC AUC.
    rf = RandomForestClassifier(random_state=RANDOM_STATE)
    grid = GridSearchCV(rf, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
    grid.fit(X_train_scaled, y_train)

    best_model = grid.best_estimator_

    # Log the winning hyperparameters and the cross-validated score that
    # selected them (previously computed but never recorded).
    mlflow.log_params({f"best_{name}": setting for name, setting in grid.best_params_.items()})
    mlflow.log_metric("cv_best_roc_auc", float(grid.best_score_))

    # Predictions on the held-out test split
    y_pred = best_model.predict(X_test_scaled)
    y_proba = best_model.predict_proba(X_test_scaled)[:, 1]  # probability of class 1

    # Test-set metrics
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }

    # Log and print each metric in one pass.
    print("\nResults:")
    for metric, value in metrics.items():
        mlflow.log_metric(metric, value)
        print(f"  {metric}: {value:.4f}")
    print(f"  cv_best_roc_auc: {grid.best_score_:.4f}")

    # Confusion matrix (saved before plt.show(), which finalizes the figure)
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
    plt.title('Random Forest - Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    cm_path = f"{OUTPUT_DIR}/rf_confusion_matrix_nb.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.show()

    # ROC curve, with the chance diagonal for reference
    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.plot(fpr, tpr, label=f"ROC AUC = {metrics['roc_auc']:.3f}")
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Random Forest')
    plt.legend()
    roc_path = f"{OUTPUT_DIR}/rf_roc_curve_nb.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.show()

    # Feature importance, sorted so the most important feature is drawn first
    plt.figure(figsize=(10, 8))
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Passing `palette` without `hue` is deprecated in recent seaborn releases.
    # Mapping hue to the same column as y keeps one colour per bar; dodge=False
    # keeps bars full-width on older seaborn versions, and the redundant
    # legend (if one was drawn) is removed.
    importance_ax = sns.barplot(
        data=importance_df, x='importance', y='feature',
        hue='feature', dodge=False, palette='viridis'
    )
    if importance_ax.get_legend() is not None:
        importance_ax.get_legend().remove()
    plt.title('Feature Importance')
    plt.tight_layout()
    fi_path = f"{OUTPUT_DIR}/rf_feature_importance_nb.png"
    plt.savefig(fi_path)
    mlflow.log_artifact(fi_path)
    plt.show()

    # The model was fitted on standardized features, so it is only usable
    # together with the scaler saved in the feature-scaling cell. Attach that
    # file to the run so the run is self-contained.
    mlflow.log_artifact(scaler_path)

    # Log model with an input/output signature inferred from the training data.
    signature = infer_signature(X_train_scaled, best_model.predict(X_train_scaled))
    mlflow.sklearn.log_model(best_model, "model", signature=signature)

    print("\n✅ Random Forest logged to MLflow")

## 7. View Experiments in MLflow UI

Run the following command in a terminal to view all logged experiments:

```bash
mlflow ui
```

Then open: http://127.0.0.1:5000

## Summary

✅ **Completed:**
- Feature scaling with StandardScaler (fitted on the training split only)
- Two models trained (Logistic Regression & Random Forest)
- Hyperparameter tuning with GridSearchCV
- 5-fold stratified cross-validation
- Comprehensive metrics evaluation
- **All experiments tracked with MLflow**

**MLflow Logged:**
- Parameters (dataset info, hyperparameters)
- Metrics (accuracy, precision, recall, F1, ROC-AUC)
- Artifacts (confusion matrices, ROC curves, feature importance)
- Models (with signatures for deployment)