# In [None]:
# Heart Disease UCI Dataset - Supervised Learning Models
# Training and Evaluation of Classification Models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score
)
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set style for visualizations
# NOTE(review): the matplotlib style sheet is applied before the seaborn
# palette; presumably the order matters (a style sheet applied afterwards may
# reset the colour cycle) — confirm before reordering these two calls.
plt.style.use('default')
sns.set_palette("husl")

# Banner announcing what this script is about to do.
print("=== Heart Disease Prediction - Supervised Learning Models ===")
print("Training and evaluating classification models...")

# Read the scaled feature matrix, the target column and the two
# feature-selected matrices from ../data. If any of the CSV files is missing,
# fall back to reproducible synthetic data so the rest of the script can run.
try:
    X_scaled = pd.read_csv('../data/X_scaled.csv')
    y = pd.read_csv('../data/y.csv')['target']
    X_top_features = pd.read_csv('../data/X_top_features.csv')
    X_rfecv_features = pd.read_csv('../data/X_rfecv_features.csv')
except FileNotFoundError:
    print("❌ Data files not found. Please run previous notebooks first.")
    print("Creating sample data for demonstration...")

    # Seed first: the feature draw and then the label draw must happen in
    # this order to reproduce the same synthetic data on every run.
    np.random.seed(42)
    n_samples = 303
    synthetic_columns = [f'feature_{i}' for i in range(13)]
    synthetic_values = np.random.randn(n_samples, 13)
    X_scaled = pd.DataFrame(synthetic_values, columns=synthetic_columns)

    # Stand-ins for the feature-selected sets: leading 8 and leading 6 columns.
    X_top_features = X_scaled.iloc[:, :8]
    X_rfecv_features = X_scaled.iloc[:, :6]

    synthetic_labels = np.random.choice([0, 1], n_samples)
    y = pd.Series(synthetic_labels, name='target')
    print("✅ Sample data created")
else:
    print("✅ Data loaded successfully")
    print(f"Original features: {X_scaled.shape[1]}")
    print(f"Top selected features: {X_top_features.shape[1]}")
    print(f"RFECV features: {X_rfecv_features.shape[1]}")

# Absolute and relative class frequencies of the target.
class_counts = y.value_counts()
class_shares = y.value_counts(normalize=True).round(3)
print(f"\nTarget distribution: {dict(class_counts)}")
print(f"Class balance: {dict(class_shares)}")

# 1. TRAIN-TEST SPLIT
print("\n" + "="*70)
print("1. DATA SPLITTING")
print("="*70)

# Feature-set variants to compare; the keys are the labels used in every
# later report and plot.
datasets = dict(
    Original=X_scaled,
    Top_Features=X_top_features,
    RFECV=X_rfecv_features,
)

# Hold out 20% of each variant, stratified on the target, using the same
# seed for every variant.
splits = {}
for name, X in datasets.items():
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    splits[name] = dict(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )
    print(f"{name} dataset:")
    print(f"  Train: {X_train.shape}, Test: {X_test.shape}")

# 2. INITIALIZE MODELS
print("\n" + "="*70)
print("2. MODEL INITIALIZATION")
print("="*70)

# Unfitted template estimators, keyed by the label used in reports.
# SVC is created with probability=True because predict_proba is called on
# every model during evaluation.
models = dict(
    Logistic_Regression=LogisticRegression(random_state=42, max_iter=1000),
    Decision_Tree=DecisionTreeClassifier(random_state=42),
    Random_Forest=RandomForestClassifier(n_estimators=100, random_state=42),
    SVM=SVC(random_state=42, probability=True),
)

print("🤖 Initialized models:")
for name, model in models.items():
    print(f"  - {name}: {type(model).__name__}")

# 3. TRAIN MODELS AND EVALUATE
# Imported here so this section carries its own dependency; scikit-learn is
# already a dependency of this script.
from sklearn.base import clone

print("\n" + "="*70)
print("3. MODEL TRAINING AND EVALUATION")
print("="*70)

# results[dataset][model]        -> metrics plus raw predictions on the test split
# trained_models[dataset][model] -> the estimator fitted on that dataset's train split
results = {}
trained_models = {}

# The splitter has a fixed seed, so one instance can be shared by every
# cross-validation run instead of being rebuilt per model.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for dataset_name, data in splits.items():
    print(f"\n📊 Training on {dataset_name} dataset...")
    results[dataset_name] = {}
    trained_models[dataset_name] = {}

    X_train, X_test = data['X_train'], data['X_test']
    y_train, y_test = data['y_train'], data['y_test']

    for model_name, model in models.items():
        print(f"  🔧 Training {model_name}...")

        # Fit a fresh copy so the templates in `models` stay unfitted and each
        # dataset gets its own estimator. clone() is the supported way to copy
        # an estimator's parameters; re-calling the constructor with
        # get_params() output breaks for estimators with nested parameters.
        model_copy = clone(model)
        model_copy.fit(X_train, y_train)

        # Hard predictions and positive-class (column 1) probabilities
        y_pred = model_copy.predict(X_test)
        y_pred_proba = model_copy.predict_proba(X_test)[:, 1]

        # Held-out test metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # 5-fold stratified cross-validated accuracy on the training split only
        cv_scores = cross_val_score(model_copy, X_train, y_train,
                                    cv=cv_splitter,
                                    scoring='accuracy')

        # Store results (predictions are kept for the plots further down)
        results[dataset_name][model_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba,
            'y_test': y_test
        }

        trained_models[dataset_name][model_name] = model_copy

        print(f"    Accuracy: {accuracy:.3f}, F1: {f1:.3f}, ROC-AUC: {roc_auc:.3f}")

# 4. RESULTS COMPARISON
print("\n" + "="*70)
print("4. MODEL PERFORMANCE COMPARISON")
print("="*70)

# Flatten the nested results dict into one row per (dataset, model) pair,
# keeping only the scalar metrics listed here.
comparison_metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'cv_mean']
comparison_data = [
    [dataset_label, model_label, *(scores[key] for key in comparison_metrics)]
    for dataset_label, per_model in results.items()
    for model_label, scores in per_model.items()
]

# Column headers are the metric keys title-cased, e.g. 'f1_score' -> 'F1 Score'.
columns = ['Dataset', 'Model']
columns += [key.replace('_', ' ').title() for key in comparison_metrics]
results_df = pd.DataFrame(comparison_data, columns=columns)

print("📊 Model Performance Summary:")
print(results_df.round(4))

# Per dataset, report the row with the highest F1 score.
print(f"\n🏆 Best models by dataset:")
for dataset_name in results:
    dataset_results = results_df.loc[results_df['Dataset'] == dataset_name]
    top_row_label = dataset_results['F1 Score'].idxmax()
    best_model = dataset_results.loc[top_row_label]
    print(f"  {dataset_name}: {best_model['Model']} (F1: {best_model['F1 Score']:.3f})")

# 5. VISUALIZE RESULTS
print("\n" + "="*70)
print("5. RESULTS VISUALIZATION")
print("="*70)

# 2x2 grid of grouped bar charts: one panel per metric, one bar group per
# model, one bar per dataset.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Performance Comparison Across Datasets', fontsize=16, fontweight='bold')

metrics_to_plot = ['Accuracy', 'F1 Score', 'Roc Auc', 'Cv Mean']
# axes.flat walks the grid row by row, matching the metric order above.
for ax, metric in zip(axes.flat, metrics_to_plot):
    # Models on the rows, datasets on the columns
    pivot_df = results_df.pivot(index='Model', columns='Dataset', values=metric)
    pivot_df.plot(kind='bar', ax=ax, alpha=0.8)

    ax.set_title(f'{metric} by Model and Dataset')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    # Legend goes outside the axes so it does not cover the bars
    ax.legend(title='Dataset', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
        tick_label.set_ha('right')

plt.tight_layout()
plt.show()

# 6. CONFUSION MATRICES
print("\n" + "="*70)
print("6. CONFUSION MATRICES")
print("="*70)

# Plot one confusion matrix per (dataset, model) pair: datasets on the rows,
# models on the columns.
n_datasets = len(datasets)
# squeeze=False makes `axes` a 2-D array even when there is a single dataset
# or a single model, so axes[i, j] below is valid for any grid size.
fig, axes = plt.subplots(n_datasets, len(models),
                         figsize=(20, 5*n_datasets), squeeze=False)

for i, dataset_name in enumerate(results.keys()):
    for j, model_name in enumerate(models.keys()):
        ax = axes[i, j]

        y_test = results[dataset_name][model_name]['y_test']
        y_pred = results[dataset_name][model_name]['y_pred']

        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=['No Disease', 'Disease'],
                    yticklabels=['No Disease', 'Disease'])
        ax.set_title(f'{model_name}\n{dataset_name} Dataset')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

# 7. ROC CURVES
print("\n" + "="*70)
print("7. ROC CURVES")
print("="*70)

# One panel per dataset, one ROC curve per model.
n_panels = len(datasets)
fig, axes = plt.subplots(1, n_panels, figsize=(6*n_panels, 5))
if n_panels == 1:
    # A single panel comes back as a bare Axes; wrap it so it can be iterated
    axes = [axes]

# One colour per model, in the order of `models`; the precision-recall
# section further down reuses this array.
colors = plt.cm.Set1(np.linspace(0, 1, len(models)))

for ax, dataset_name in zip(axes, datasets):
    for curve_color, model_name in zip(colors, models):
        entry = results[dataset_name][model_name]
        roc_auc = entry['roc_auc']

        fpr, tpr, _ = roc_curve(entry['y_test'], entry['y_pred_proba'])
        ax.plot(fpr, tpr, color=curve_color, linewidth=2,
                label=f'{model_name} (AUC = {roc_auc:.3f})')

    # Diagonal reference line
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random Classifier')
    ax.set(
        xlim=[0.0, 1.0],
        ylim=[0.0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title=f'ROC Curves - {dataset_name} Dataset',
    )
    ax.legend(loc="lower right")
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 8. PRECISION-RECALL CURVES
print("\n" + "="*70)
print("8. PRECISION-RECALL CURVES")
print("="*70)

# One panel per dataset, one precision-recall curve per model.
n_panels = len(datasets)
fig, axes = plt.subplots(1, n_panels, figsize=(6*n_panels, 5))
if n_panels == 1:
    # A single panel comes back as a bare Axes; wrap it so it can be iterated
    axes = [axes]

# `colors` is the per-model colour array built in the ROC section, so each
# model keeps the same colour in both figures.
for ax, dataset_name in zip(axes, datasets):
    for curve_color, model_name in zip(colors, models):
        entry = results[dataset_name][model_name]
        truth = entry['y_test']
        scores = entry['y_pred_proba']

        precision_vals, recall_vals, _ = precision_recall_curve(truth, scores)
        avg_precision = average_precision_score(truth, scores)

        ax.plot(recall_vals, precision_vals, color=curve_color, linewidth=2,
                label=f'{model_name} (AP = {avg_precision:.3f})')

    ax.set(
        xlim=[0.0, 1.0],
        ylim=[0.0, 1.05],
        xlabel='Recall',
        ylabel='Precision',
        title=f'Precision-Recall Curves - {dataset_name} Dataset',
    )
    ax.legend(loc="lower left")
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 9. FEATURE IMPORTANCE (for tree-based models)
print("\n" + "="*70)
print("9. FEATURE IMPORTANCE ANALYSIS")
print("="*70)

# Only these models are queried for feature_importances_.
tree_models = ['Decision_Tree', 'Random_Forest']
for dataset_name in datasets:
    print(f"\n📊 Feature Importance - {dataset_name} Dataset:")

    fig, axes = plt.subplots(1, len(tree_models), figsize=(12, 5))
    if len(tree_models) == 1:
        axes = [axes]

    fitted_for_dataset = trained_models[dataset_name]
    for ax, model_name in zip(axes, tree_models):
        # Leave the panel empty if this model was not trained
        if model_name not in fitted_for_dataset:
            continue

        model = fitted_for_dataset[model_name]
        feature_names = datasets[dataset_name].columns
        importances = model.feature_importances_

        # Feature positions ordered from most to least important
        indices = np.argsort(importances)[::-1]
        bar_positions = range(len(importances))

        ax.barh(bar_positions, importances[indices], alpha=0.7)
        ax.set_yticks(bar_positions)
        ax.set_yticklabels([feature_names[pos] for pos in indices])
        ax.set_xlabel('Importance Score')
        ax.set_title(f'{model_name} Feature Importance')
        ax.grid(True, alpha=0.3)

        # Print up to five top-ranked features
        print(f"  {model_name} - Top 5 features:")
        for rank, idx in enumerate(indices[:5], start=1):
            print(f"    {rank}. {feature_names[idx]}: {importances[idx]:.3f}")

    plt.tight_layout()
    plt.show()

# 10. DETAILED CLASSIFICATION REPORTS
print("\n" + "="*70)
print("10. DETAILED CLASSIFICATION REPORTS")
print("="*70)

# Print the per-class precision/recall/F1 table for every stored prediction
# set, grouped by dataset.
class_labels = ['No Disease', 'Disease']
for dataset_name, per_model in results.items():
    print(f"\n📋 Classification Reports - {dataset_name} Dataset:")

    for model_name in models:
        entry = per_model[model_name]

        print(f"\n{model_name}:")
        report_text = classification_report(entry['y_test'], entry['y_pred'],
                                            target_names=class_labels)
        print(report_text)

# 11. SAVE BEST MODELS
import json
import os

print("\n" + "="*70)
print("11. SAVING MODELS")
print("="*70)

# Pick the best model per dataset BEFORE touching the filesystem, so that
# `best_models_info` is always defined for the summary section even when
# saving fails.
best_models_info = {}
for dataset_name, per_model in results.items():
    if not per_model:
        continue  # nothing was trained for this dataset

    # Highest F1 wins; ties go to the first model, and a dataset whose scores
    # are all 0 still gets a winner (same rule as the idxmax-based report in
    # section 4).
    best_model_name = max(per_model, key=lambda label: per_model[label]['f1_score'])
    best_f1 = float(per_model[best_model_name]['f1_score'])  # plain float for JSON

    best_models_info[dataset_name] = {
        'model_name': best_model_name,
        'f1_score': best_f1,
        'filename': f'best_model_{dataset_name.lower()}.pkl',
        'features': datasets[dataset_name].columns.tolist()
    }

# All file I/O is best-effort: a failure is reported but does not abort the run.
try:
    # Both output directories must exist before anything is written into them.
    os.makedirs('../models', exist_ok=True)
    os.makedirs('../results', exist_ok=True)

    # Persist the winning fitted estimator of each dataset
    for dataset_name, info in best_models_info.items():
        best_model = trained_models[dataset_name][info['model_name']]
        joblib.dump(best_model, f"../models/{info['filename']}")
        print(f"✅ Saved best model for {dataset_name}: {info['model_name']} (F1: {info['f1_score']:.3f})")

    # Save all results (same ../results directory as the JSON below)
    results_df.to_csv('../results/supervised_learning_results.csv', index=False)

    # Save best models info
    with open('../results/best_models_info.json', 'w', encoding='utf-8') as f:
        json.dump(best_models_info, f, indent=2)

    print(f"\n✅ Results saved successfully!")
    print("Files saved:")
    print("  - Best models: models/best_model_*.pkl")
    print("  - Results: supervised_learning_results.csv")
    print("  - Model info: best_models_info.json")

except Exception as e:
    print(f"⚠️ Error saving files: {e}")

# 12. MODEL COMPARISON SUMMARY
print("\n" + "="*70)
print("12. SUPERVISED LEARNING SUMMARY")
print("="*70)

# Experiment counts: every model was trained once per dataset.
model_count = len(models)
dataset_count = len(datasets)
print("✅ Supervised learning analysis completed successfully!")
print(f"📊 Models trained: {model_count}")
print(f"📊 Datasets used: {dataset_count}")
print(f"📊 Total experiments: {model_count * dataset_count}")

print("\n🏆 Best performing models:")
for dataset_name, info in best_models_info.items():
    print(f"  {dataset_name}: {info['model_name']} (F1: {info['f1_score']:.3f})")

print("\n📈 Overall performance insights:")
# Mean of each headline metric per model across all datasets, best F1 first
headline_metrics = ['Accuracy', 'F1 Score', 'Roc Auc']
overall_performance = (
    results_df.groupby('Model')[headline_metrics]
    .mean()
    .sort_values('F1 Score', ascending=False)
)

print("Average performance across all datasets:")
print(overall_performance.round(3))

# Static closing notes, printed line by line.
closing_lines = (
    "\n💡 Key findings:",
    "  - Model performance varies across feature sets",
    "  - Feature selection can improve or maintain performance",
    "  - Cross-validation provides robust performance estimates",
    "  - Tree-based models provide feature importance insights",
    "\n🎯 Next steps:",
    "  1. ✅ Data preprocessing complete",
    "  2. ✅ PCA analysis complete",
    "  3. ✅ Feature selection complete",
    "  4. ✅ Supervised learning complete",
    "  5. ⏳ Apply unsupervised learning (05_unsupervised_learning.ipynb)",
    "  6. ⏳ Hyperparameter tuning (06_hyperparameter_tuning.ipynb)",
    "\n🎉 Ready to proceed to unsupervised learning!",
)
for line in closing_lines:
    print(line)
