# In [None]:
# Heart Disease UCI Dataset - Hyperparameter Tuning
# GridSearchCV and RandomizedSearchCV for Model Optimization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, StratifiedKFold,
    validation_curve, learning_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import joblib
import time
import json
import os
from scipy.stats import randint, uniform
import warnings
warnings.filterwarnings('ignore')

# Set style for visualizations
plt.style.use('default')
sns.set_palette("husl")

print("=== Heart Disease Prediction - Hyperparameter Tuning ===")
print("Optimizing model performance using GridSearchCV and RandomizedSearchCV...")

# Load preprocessed data and previous results
# Paths are relative to the current working directory. If any one of the CSV
# files is missing, pd.read_csv raises FileNotFoundError and the whole set of
# variables is rebuilt from synthetic data in the except branch below.
try:
    X_scaled = pd.read_csv('../data/X_scaled.csv')
    # Only the column names of X_top_features are used further down, to pick
    # the matching columns out of X_train / X_test.
    X_top_features = pd.read_csv('../data/X_top_features.csv')
    y = pd.read_csv('../data/y.csv')['target']

    # Load train-test splits
    X_train = pd.read_csv('../data/X_train.csv')
    X_test = pd.read_csv('../data/X_test.csv')
    y_train = pd.read_csv('../data/y_train.csv')['target']
    y_test = pd.read_csv('../data/y_test.csv')['target']

    print("✅ Data loaded successfully")

except FileNotFoundError:
    print("❌ Data files not found. Please run previous notebooks first.")
    # Create sample data for demonstration
    # NOTE: features are standard-normal noise and labels are drawn
    # independently of them, so any score obtained on this data only proves
    # that the pipeline runs, not that a model is good.
    print("Creating sample data for demonstration...")
    # Seeding the global NumPy RNG makes the two draws below reproducible;
    # their order (features first, then labels) determines the values.
    np.random.seed(42)
    n_samples = 303
    X_scaled = pd.DataFrame(
        np.random.randn(n_samples, 13),
        columns=[f'feature_{i}' for i in range(13)]
    )
    # Stand-in for the selected features: simply the first 8 columns.
    X_top_features = X_scaled.iloc[:, :8]
    y = pd.Series(np.random.choice([0, 1], n_samples), name='target')

    # Create train-test split
    # 80/20 split, stratified on the label so both parts keep the class ratio.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )
    print("✅ Sample data created")

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
# The distribution is computed on the full target `y`, not on y_train.
print(f"Target distribution: {dict(y.value_counts())}")

# Prepare datasets for tuning
def _pick_top_features(frame):
    """Return the selected-feature view of *frame*.

    When *frame* contains every column named in ``X_top_features`` those
    columns are returned; otherwise the first 8 columns are used instead.
    """
    wanted = X_top_features.columns
    if set(wanted).issubset(frame.columns):
        return frame[wanted]
    return frame.iloc[:, :8]


# Two feature sets are tuned side by side: all columns and the reduced set.
datasets = {
    'Original': {'X_train': X_train, 'X_test': X_test},
    'Top_Features': {
        'X_train': _pick_top_features(X_train),
        'X_test': _pick_top_features(X_test),
    },
}

# 1. DEFINE HYPERPARAMETER GRIDS
print("\n" + "="*70)
print("1. DEFINING HYPERPARAMETER SEARCH SPACES")
print("="*70)

# Exhaustive grids for GridSearchCV: every combination of the listed values
# is evaluated, so the lists are kept short.
param_grids = {
    'LogisticRegression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        # liblinear is used because it supports both the l1 and l2 penalties.
        'solver': ['liblinear'],
        'max_iter': [1000]
    },
    'DecisionTree': {
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10],
        'criterion': ['gini', 'entropy']
    },
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5],
        'max_features': ['sqrt', 'log2', None]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
    }
}

# Numeric gamma candidates for the randomized SVM search. The draw is seeded
# so the candidate list (and therefore the whole search) is identical from
# run to run, matching the random_state=42 used everywhere else. tolist()
# turns the NumPy scalars into plain Python floats.
_svm_gamma_samples = uniform(0.001, 1).rvs(size=5, random_state=42).tolist()

# Sampling distributions for RandomizedSearchCV.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
param_distributions = {
    'LogisticRegression': {
        'C': uniform(0.001, 100),
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    'DecisionTree': {
        'max_depth': randint(3, 21),
        'min_samples_split': randint(2, 21),
        'min_samples_leaf': randint(1, 11),
        'criterion': ['gini', 'entropy']
    },
    'RandomForest': {
        'n_estimators': randint(50, 301),
        'max_depth': randint(3, 21),
        'min_samples_split': randint(2, 21),
        'min_samples_leaf': randint(1, 11),
        'max_features': ['sqrt', 'log2', None]
    },
    'SVM': {
        'C': uniform(0.1, 100),
        'kernel': ['linear', 'rbf', 'poly'],
        # A list is sampled uniformly, so the two string presets and the five
        # numeric values are equally likely.
        'gamma': ['scale', 'auto'] + _svm_gamma_samples
    }
}

print("🔧 Parameter grids defined for:")
for model_name, grid in param_grids.items():
    print(f"  - {model_name}: {len(grid)} parameters")

# 2. GRID SEARCH CV
print("\n" + "="*70)
print("2. GRID SEARCH CROSS-VALIDATION")
print("="*70)

# Untuned base estimators; the searches below fit tuned copies of these.
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    # probability=True is required so predict_proba is available later.
    'SVM': SVC(random_state=42, probability=True)
}

# Shared cross-validation scheme and metric for every search in this script.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = 'f1'

# Layout: grid_search_results[dataset][model] -> dict of search outputs.
grid_search_results = {}

for dataset_name, data in datasets.items():
    print(f"\n📊 Grid Search on {dataset_name} dataset...")
    per_model = grid_search_results[dataset_name] = {}
    X_train_data = data['X_train']

    for model_name, model in models.items():
        print(f"  🔍 Tuning {model_name}...")
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            scoring=scoring,
            cv=cv,
            n_jobs=-1,
            verbose=0,
        )

        # Time only the fit itself.
        start_time = time.time()
        grid_search.fit(X_train_data, y_train)
        elapsed = time.time() - start_time

        per_model[model_name] = {
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'cv_results': grid_search.cv_results_,
            'grid_search_time': elapsed,
        }
        print(f"    Best score: {grid_search.best_score_:.4f}")
        print(f"    Best params: {grid_search.best_params_}")
        print(f"    Time taken: {elapsed:.2f} seconds")

# 3. RANDOMIZED SEARCH CV
print("\n" + "="*70)
print("3. RANDOMIZED SEARCH CROSS-VALIDATION")
print("="*70)

# Layout mirrors grid_search_results: random_search_results[dataset][model].
random_search_results = {}
# Number of parameter settings sampled per model and dataset.
n_iter = 50

for dataset_name, data in datasets.items():
    print(f"\n🎲 Randomized Search on {dataset_name} dataset...")
    per_model = random_search_results[dataset_name] = {}
    X_train_data = data['X_train']

    for model_name, model in models.items():
        print(f"  🔍 Tuning {model_name}...")
        random_search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_distributions[model_name],
            n_iter=n_iter,
            scoring=scoring,
            cv=cv,
            random_state=42,
            n_jobs=-1,
            verbose=0,
        )

        # Time only the fit itself.
        start_time = time.time()
        random_search.fit(X_train_data, y_train)
        elapsed = time.time() - start_time

        per_model[model_name] = {
            'best_estimator': random_search.best_estimator_,
            'best_params': random_search.best_params_,
            'best_score': random_search.best_score_,
            'cv_results': random_search.cv_results_,
            'random_search_time': elapsed,
        }
        print(f"    Best score: {random_search.best_score_:.4f}")
        print(f"    Best params: {random_search.best_params_}")
        print(f"    Time taken: {elapsed:.2f} seconds")

# 4. COMPARE OPTIMIZATION METHODS
print("\n" + "="*70)
print("4. OPTIMIZATION METHODS COMPARISON")
print("="*70)

# (method label, results store, key holding the elapsed time) per search type.
search_runs = (
    ('GridSearchCV', grid_search_results, 'grid_search_time'),
    ('RandomizedSearchCV', random_search_results, 'random_search_time'),
)

# One row per dataset/model/method; grid row first, then randomized row.
comparison_data = [
    [
        dataset_name, model_name, method,
        results[dataset_name][model_name]['best_score'],
        results[dataset_name][model_name][time_key],
    ]
    for dataset_name in datasets
    for model_name in models
    for method, results, time_key in search_runs
]
comparison_df = pd.DataFrame(
    comparison_data,
    columns=['Dataset', 'Model', 'Method', 'Best_Score', 'Time'],
)
print("📊 Optimization Methods Comparison:")
print(comparison_df.round(4))

# Wide tables (rows: dataset/model, columns: method) feeding the bar charts.
score_pivot, time_pivot = (
    comparison_df.pivot_table(index=['Dataset', 'Model'],
                              columns='Method',
                              values=metric)
    for metric in ('Best_Score', 'Time')
)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (score_pivot, 'Best Cross-Validation Scores', 'F1 Score'),
    (time_pivot, 'Optimization Time Comparison', 'Time (seconds)'),
)
# Left panel: best CV score per method; right panel: search time per method.
for ax, (pivot, panel_title, y_label) in zip(axes, panels):
    pivot.plot(kind='bar', ax=ax, alpha=0.8)
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.legend(title='Method')
    ax.grid(True, alpha=0.3)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

# 5. EVALUATE BEST MODELS ON TEST SET
print("\n" + "="*70)
print("5. TEST SET EVALUATION")
print("="*70)

# Layout: test_results[dataset][model] -> metrics plus raw predictions.
test_results = {}
for dataset_name, data in datasets.items():
    print(f"\n📊 Test Set Evaluation - {dataset_name} Dataset:")
    dataset_scores = test_results[dataset_name] = {}
    X_test_data = data['X_test']

    for model_name in models:
        # Only the grid-search winner is evaluated; the randomized-search
        # estimators are not scored on the test set.
        best_model = grid_search_results[dataset_name][model_name]['best_estimator']
        y_pred = best_model.predict(X_test_data)
        # Column 1 of predict_proba is the probability of the positive class.
        y_pred_proba = best_model.predict_proba(X_test_data)[:, 1]

        scores = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred),
            'roc_auc': roc_auc_score(y_test, y_pred_proba),
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba,
        }
        dataset_scores[model_name] = scores

        print(f"  {model_name}:")
        print(f"    Accuracy: {scores['accuracy']:.4f}")
        print(f"    F1 Score: {scores['f1_score']:.4f}")
        print(f"    ROC AUC: {scores['roc_auc']:.4f}")

# 6. LEARNING CURVES
print("\n" + "="*70)
print("6. LEARNING CURVES ANALYSIS")
print("="*70)

def plot_learning_curves(estimator, X, y, title, cv=None):
    """Plot training and validation F1 against the training-set size.

    The estimator is evaluated with scikit-learn's ``learning_curve`` at ten
    evenly spaced fractions of the data (10% to 100%). For both the training
    and the validation scores the mean across folds is drawn as a line and
    one standard deviation either side as a shaded band.

    Args:
        estimator: scikit-learn estimator to evaluate.
        X: Feature matrix.
        y: Target labels.
        title: Text appended to the figure title.
        cv: Cross-validation spec forwarded to ``learning_curve``; ``None``
            leaves the choice to scikit-learn's default.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # NOTE: per the scikit-learn docs random_state only matters when
    # shuffle=True, which is not set here.
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=cv, scoring='f1', n_jobs=-1, random_state=42,
    )

    plt.figure(figsize=(8, 6))
    curves = (
        (train_scores, 'blue', 'Training score'),
        (val_scores, 'red', 'Validation score'),
    )
    for fold_scores, colour, label in curves:
        # Each row holds the per-fold scores for one training size.
        centre = fold_scores.mean(axis=1)
        spread = fold_scores.std(axis=1)
        plt.plot(train_sizes, centre, 'o-', color=colour, label=label)
        plt.fill_between(train_sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)

    plt.xlabel('Training Set Size')
    plt.ylabel('F1 Score')
    plt.title(f'Learning Curves - {title}')
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

print("📈 Generating learning curves for best models...")
for dataset_name, data in datasets.items():
    X_train_data = data['X_train']

    # Find the model with the highest grid-search CV F1 for this dataset.
    # Starting from 0 with a strict '>' means ties keep the earlier model and
    # a dataset whose scores are all <= 0 gets no plot.
    best_f1, best_model_name = 0, None
    for model_name in models:
        candidate_f1 = grid_search_results[dataset_name][model_name]['best_score']
        if candidate_f1 > best_f1:
            best_f1, best_model_name = candidate_f1, model_name

    if not best_model_name:
        continue

    best_estimator = grid_search_results[dataset_name][best_model_name]['best_estimator']
    plot_learning_curves(best_estimator, X_train_data, y_train,
                         f'{best_model_name} - {dataset_name}', cv=cv)

# 7. VALIDATION CURVES
print("\n" + "="*70)
print("7. VALIDATION CURVES ANALYSIS")
print("="*70)

def plot_validation_curve(estimator, X, y, param_name, param_range, title):
    """Plot training and validation F1 as one hyperparameter is varied.

    The estimator is evaluated with scikit-learn's ``validation_curve`` for
    each value in *param_range*. For both the training and the validation
    scores the mean across folds is drawn as a line and one standard
    deviation either side as a shaded band.

    Args:
        estimator: scikit-learn estimator to evaluate.
        X: Feature matrix.
        y: Target labels.
        param_name: Name of the estimator parameter to vary.
        param_range: Values of that parameter to evaluate (also the x-axis).
        title: Text appended to the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Note:
        The cross-validation splitter is the module-level ``cv`` object, not
        an argument of this function.
    """
    # param_name / param_range are deliberately passed by keyword; passing
    # them positionally was the problem this function was corrected for.
    train_scores, val_scores = validation_curve(
        estimator, X, y,
        param_name=param_name, param_range=param_range,
        scoring='f1', cv=cv, n_jobs=-1,
    )

    plt.figure(figsize=(8, 6))
    curves = (
        (train_scores, 'blue', 'Training score'),
        (val_scores, 'red', 'Validation score'),
    )
    for fold_scores, colour, label in curves:
        # Each row holds the per-fold scores for one parameter value.
        centre = fold_scores.mean(axis=1)
        spread = fold_scores.std(axis=1)
        plt.plot(param_range, centre, 'o-', color=colour, label=label)
        plt.fill_between(param_range, centre - spread, centre + spread,
                         alpha=0.1, color=colour)

    plt.xlabel(param_name)
    plt.ylabel('F1 Score')
    plt.title(f'Validation Curve - {title}')
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

print("📊 Generating validation curves for key parameters...")

# Fresh, untuned estimators: each curve varies a single parameter and leaves
# everything else at its default.
rf_model = RandomForestClassifier(random_state=42)
n_estimators_range = [10, 50, 100, 150, 200, 300]
svm_model = SVC(random_state=42)
c_range = [0.1, 1, 10, 100, 1000]

# Both curves are computed on the full-feature training set.
curve_specs = (
    (rf_model, 'n_estimators', n_estimators_range, 'Random Forest n_estimators'),
    (svm_model, 'C', c_range, 'SVM C parameter'),
)
for curve_estimator, curve_param, curve_values, curve_title in curve_specs:
    plot_validation_curve(curve_estimator, X_train, y_train, curve_param,
                          curve_values, curve_title)

# 8. FEATURE IMPORTANCE FROM BEST MODELS
print("\n" + "="*70)
print("8. FEATURE IMPORTANCE FROM OPTIMIZED MODELS")
print("="*70)

for dataset_name, data in datasets.items():
    print(f"\n📊 Feature Importance - {dataset_name} Dataset:")
    # Only the tree-based models are read for feature_importances_ here.
    tree_models = ['DecisionTree', 'RandomForest']
    fig, axes = plt.subplots(1, len(tree_models), figsize=(12, 5))
    if len(tree_models) == 1:
        # A single subplot comes back as a bare Axes; wrap it so it iterates.
        axes = [axes]

    feature_names = data['X_train'].columns

    for ax, model_name in zip(axes, tree_models):
        best_model = grid_search_results[dataset_name][model_name]['best_estimator']
        importances = best_model.feature_importances_
        # Feature positions ordered from most to least important.
        order = np.argsort(importances)[::-1]
        bar_positions = range(len(importances))

        ax.barh(bar_positions, importances[order], alpha=0.7)
        ax.set_yticks(bar_positions)
        ax.set_yticklabels([feature_names[pos] for pos in order])
        ax.set_xlabel('Importance Score')
        ax.set_title(f'Optimized {model_name}')
        ax.grid(True, alpha=0.3)

        print(f"  {model_name} - Top 5 features:")
        for rank, pos in enumerate(order[:5], start=1):
            print(f"    {rank}. {feature_names[pos]}: {importances[pos]:.4f}")

    plt.suptitle(f'Feature Importance - {dataset_name} Dataset', fontsize=14)
    plt.tight_layout()
    plt.show()

# 9. MODEL PERFORMANCE SUMMARY
print("\n" + "="*70)
print("9. FINAL MODEL PERFORMANCE SUMMARY")
print("="*70)

# One row per dataset/model: grid-search CV F1 next to the test-set metrics.
final_results_data = [
    [
        dataset_name,
        model_name,
        grid_search_results[dataset_name][model_name]['best_score'],
        test_results[dataset_name][model_name]['accuracy'],
        test_results[dataset_name][model_name]['f1_score'],
        test_results[dataset_name][model_name]['roc_auc'],
    ]
    for dataset_name in datasets
    for model_name in models
]
final_results_df = pd.DataFrame(
    final_results_data,
    columns=['Dataset', 'Model', 'CV_F1', 'Test_Accuracy', 'Test_F1', 'Test_AUC'],
)
print("🏆 Final Model Performance Summary:")
print(final_results_df.round(4))

# The overall winner is the row with the highest test-set F1.
best_model_idx = final_results_df['Test_F1'].idxmax()
best_model_info = final_results_df.loc[best_model_idx]

print("\n🥇 Overall Best Model:")
print(f"  Dataset: {best_model_info['Dataset']}")
print(f"  Model: {best_model_info['Model']}")
metric_lines = (
    ('Test F1 Score', 'Test_F1'),
    ('Test Accuracy', 'Test_Accuracy'),
    ('Test AUC', 'Test_AUC'),
)
for metric_label, metric_column in metric_lines:
    print(f"  {metric_label}: {best_model_info[metric_column]:.4f}")

# 10. DETAILED CLASSIFICATION REPORTS
print("\n" + "="*70)
print("10. DETAILED CLASSIFICATION REPORTS")
print("="*70)

# Display names for the two classes, in label order (0, then 1).
class_labels = ['No Disease', 'Disease']

for dataset_name in datasets:
    print(f"\n📋 Classification Reports - {dataset_name} Dataset:")
    for model_name in models:
        # Reuse the predictions stored during the test-set evaluation.
        stored_pred = test_results[dataset_name][model_name]['y_pred']
        print(f"\n{model_name} (Optimized):")
        report_text = classification_report(y_test, stored_pred,
                                            target_names=class_labels)
        print(report_text)

# 11. SAVE FINAL OPTIMIZED MODELS
print("\n" + "="*70)
print("11. SAVING OPTIMIZED MODELS")
print("="*70)

# Output locations, defined once so every artifact lands in the directories
# that are actually created below.
models_dir = '../models'
results_dir = '../results'

# Defined before the try block so the summary section can always iterate it,
# even when saving fails part-way through.
best_models_info = {}

# Saving is best-effort: a failure is reported but does not abort the script.
try:
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)

    for dataset_name in datasets.keys():
        # Pick this dataset's winner by test-set F1.
        # NOTE(review): selecting on the test set makes the stored test
        # metrics optimistic; consider selecting on CV_F1 instead.
        dataset_results = final_results_df[final_results_df['Dataset'] == dataset_name]
        best_idx = dataset_results['Test_F1'].idxmax()
        best_model_row = final_results_df.loc[best_idx]
        best_model_name = best_model_row['Model']
        best_model = grid_search_results[dataset_name][best_model_name]['best_estimator']
        best_params = grid_search_results[dataset_name][best_model_name]['best_params']
        model_filename = f'final_optimized_model_{dataset_name.lower()}.pkl'
        joblib.dump(best_model, os.path.join(models_dir, model_filename))

        # float() converts NumPy scalars so the dict is JSON-serializable.
        best_models_info[dataset_name] = {
            'model_name': best_model_name,
            'best_params': best_params,
            'cv_f1_score': float(best_model_row['CV_F1']),
            'test_accuracy': float(best_model_row['Test_Accuracy']),
            'test_f1_score': float(best_model_row['Test_F1']),
            'test_auc': float(best_model_row['Test_AUC']),
            'filename': model_filename,
            'features': list(datasets[dataset_name]['X_train'].columns)
        }
        print(f"✅ Saved optimized {best_model_name} for {dataset_name}")

    # These previously targeted 'results/...' (a directory that is never
    # created), which raised and skipped every write after this point.
    final_results_df.to_csv(
        os.path.join(results_dir, 'hyperparameter_tuning_results.csv'), index=False)
    comparison_df.to_csv(
        os.path.join(results_dir, 'optimization_methods_comparison.csv'), index=False)

    with open(os.path.join(results_dir, 'final_optimized_models_info.json'), 'w') as f:
        json.dump(best_models_info, f, indent=2)

    # Best grid-search parameters and CV score for every dataset/model pair.
    hyperparameter_results = {}
    for dataset_name in datasets.keys():
        hyperparameter_results[dataset_name] = {}
        for model_name in models.keys():
            hyperparameter_results[dataset_name][model_name] = {
                'best_params': grid_search_results[dataset_name][model_name]['best_params'],
                'best_cv_score': float(grid_search_results[dataset_name][model_name]['best_score'])
            }
    with open(os.path.join(results_dir, 'best_hyperparameters.json'), 'w') as f:
        json.dump(hyperparameter_results, f, indent=2)

    print(f"\n✅ All results saved successfully!")

except Exception as e:
    print(f"⚠️ Error saving files: {e}")

# 12. HYPERPARAMETER TUNING SUMMARY
print("\n" + "="*70)
print("12. HYPERPARAMETER TUNING SUMMARY")
print("="*70)

print("✅ Hyperparameter tuning completed successfully!")
print(f"📊 Models optimized: {len(models)}")
print(f"📊 Datasets used: {len(datasets)}")
print("📊 Optimization methods: GridSearchCV, RandomizedSearchCV")

# Recap of the per-dataset winners recorded while saving the models.
print("\n🏆 Best optimized models:")
for dataset_name, info in best_models_info.items():
    print(f"  {dataset_name}: {info['model_name']}")
    print(f"    Test F1: {info['test_f1_score']:.4f}")
    print(f"    Test Accuracy: {info['test_accuracy']:.4f}")
    print(f"    Best params: {info['best_params']}")

# Static checklist of the pipeline stages; nothing is verified here.
print("\n🎯 Project completion status:")
completed_stages = (
    "  1. ✅ Data preprocessing complete",
    "  2. ✅ PCA analysis complete",
    "  3. ✅ Feature selection complete",
    "  4. ✅ Supervised learning complete",
    "  5. ✅ Unsupervised learning complete",
    "  6. ✅ Hyperparameter tuning complete",
)
for stage_line in completed_stages:
    print(stage_line)

print("\n🎉 Machine Learning Pipeline Complete!")