In [1]:
# Heart Disease UCI Dataset - Hyperparameter Tuning
# Notebook 06: Model Optimization & Final Model Selection

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer
import joblib
import warnings

# Silence every warning category for the rest of the session so the tuning
# output below is not interleaved with library warnings.
warnings.filterwarnings('ignore')

# Read the feature-selected dataset from disk.
df = pd.read_csv('../data/processed/selected_features.csv')

# The 'target' column is the label; every remaining column is a model input.
y = df['target']
X = df.drop(columns='target')

print(f"Dataset shape: {X.shape}")
print("Target value counts:")
print(y.value_counts().sort_index())
print(f"Number of unique target values: {y.nunique()}")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Standardised copies of the features for the models that are trained on
# scaled data. The scaler is fitted on the training partition only and then
# applied unchanged to the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline (untuned) metrics, used later for a before/after comparison.
# The file is optional: when it is missing, baseline_results is set to None
# and every comparison step downstream is skipped.
try:
    baseline_results = pd.read_csv('../models/model_results.csv', index_col=0)
except FileNotFoundError:
    print("\nBaseline results file not found. Will create comparison after tuning.")
    baseline_results = None
else:
    print("\nBaseline Results:")
    print(baseline_results[['Accuracy', 'F1-Score']].round(3))

# Hyperparameter search space per model, keyed by the same display names used
# for the estimators. This must be defined unconditionally: it previously
# lived inside the FileNotFoundError handler above, so it only existed when
# the baseline file was missing and tuning failed with NameError otherwise.
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10, 100],  # Lower = more regularization, higher = less
        'solver': ['liblinear', 'lbfgs'],
        'max_iter': [1000],
    },
    'Decision Tree': {
        'max_depth': [3, 5, 10, 15, None],  # Controls overfitting (None = unlimited depth)
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],  # Number of trees
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
    },
}

# Untuned estimators to optimise. The keys double as display names and as the
# lookup keys into param_grids.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True)
}

# F1 scorer (default binary averaging) used to rank parameter combinations.
f1_scorer = make_scorer(f1_score)

# Filled in by the loop below, one entry per model that tunes successfully:
#   tuning_results[name] -> best parameters and scores
#   best_models[name]    -> refitted estimator plus the scaler it needs (or None)
tuning_results = {}
best_models = {}

print("\n" + "=" * 50)
print("HYPERPARAMETER TUNING")
print("=" * 50)

for name, model in models.items():
    print(f"\nTuning {name}...")

    # Logistic Regression and SVM are trained on the standardised features;
    # the tree-based models use the raw features.
    use_scaled_data = name in ('Logistic Regression', 'SVM')
    if use_scaled_data:
        X_train_use, X_test_use = X_train_scaled, X_test_scaled
    else:
        X_train_use, X_test_use = X_train, X_test

    try:
        # Exhaustive search over this model's grid, 5-fold CV, ranked by F1,
        # using all available cores.
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            scoring=f1_scorer,
            cv=5,
            n_jobs=-1,
            verbose=0,
        )
        grid_search.fit(X_train_use, y_train)

        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV F1-score: {grid_search.best_score_:.3f}")

        # Score the best parameter combination on the held-out test set.
        y_pred = grid_search.predict(X_test_use)
        test_accuracy = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)

        print(f"Test Accuracy: {test_accuracy:.3f}")
        print(f"Test F1-score: {test_f1:.3f}")

        tuning_results[name] = {
            'Best_Params': grid_search.best_params_,
            'CV_F1': grid_search.best_score_,
            'Test_Accuracy': test_accuracy,
            'Test_F1': test_f1,
            'Uses_Scaled_Data': use_scaled_data,
        }
        best_models[name] = {
            'model': grid_search.best_estimator_,
            'scaler': scaler if use_scaled_data else None,
        }
    except Exception as e:
        # Best effort: a failure in one model must not stop the others.
        # The model is simply left out of tuning_results / best_models.
        print(f"Error tuning {name}: {e}")
        print("Skipping this model...")

# Report the tuned results, compare them with the baseline (when available),
# pick the best model, and persist the model package and result tables.
print("\n" + "=" * 50)
print("OPTIMIZED MODEL RESULTS")
print("=" * 50)

# Reset on every run. In a notebook, a comparison_df left over from an earlier
# execution would otherwise be mistaken for this run's comparison and saved.
comparison_df = None

if tuning_results:
    # One row per successfully tuned model, indexed by model name.
    results_df = pd.DataFrame({
        'Model': list(tuning_results),
        'CV_F1': [res['CV_F1'] for res in tuning_results.values()],
        'Test_Accuracy': [res['Test_Accuracy'] for res in tuning_results.values()],
        'Test_F1': [res['Test_F1'] for res in tuning_results.values()],
    }).set_index('Model')

    print(results_df.round(3))

    # Compare with baseline if available
    if baseline_results is not None:
        print("\n" + "=" * 50)
        print("BASELINE vs OPTIMIZED COMPARISON")
        print("=" * 50)

        # Models present in both result sets. Built as a list in tuning order
        # (rather than a set) so the row order of the table and of the saved
        # CSV is the same on every run.
        common_models = [name for name in tuning_results if name in baseline_results.index]

        if common_models:
            comparison_data = {
                model_name: {
                    'Baseline_Accuracy': baseline_results.loc[model_name, 'Accuracy'],
                    'Optimized_Accuracy': tuning_results[model_name]['Test_Accuracy'],
                    'Baseline_F1': baseline_results.loc[model_name, 'F1-Score'],
                    'Optimized_F1': tuning_results[model_name]['Test_F1'],
                }
                for model_name in common_models
            }

            # Transpose so that each row is a model and each column a metric.
            comparison_df = pd.DataFrame(comparison_data).T
            comparison_df['Accuracy_Improvement'] = (
                comparison_df['Optimized_Accuracy'] - comparison_df['Baseline_Accuracy']
            )
            comparison_df['F1_Improvement'] = (
                comparison_df['Optimized_F1'] - comparison_df['Baseline_F1']
            )

            print(comparison_df.round(3))
        else:
            print("No common models found between baseline and optimized results.")

    # Pick the model with the highest test-set F1-score.
    # NOTE(review): selecting on the test set makes the reported test score of
    # the winner optimistic; consider selecting on 'CV_F1' and using the test
    # set for the final report only.
    best_model_name = max(tuning_results, key=lambda x: tuning_results[x]['Test_F1'])
    best_optimized_model_info = best_models[best_model_name]
    best_result = tuning_results[best_model_name]

    print("\n" + "=" * 40)
    print("BEST OPTIMIZED MODEL")
    print("=" * 40)
    print(f"Model: {best_model_name}")
    print(f"Parameters: {best_result['Best_Params']}")
    print(f"CV F1-score: {best_result['CV_F1']:.3f}")
    print(f"Test F1-score: {best_result['Test_F1']:.3f}")
    print(f"Test Accuracy: {best_result['Test_Accuracy']:.3f}")

    model_for_prediction = best_optimized_model_info['model']
    scaler_for_prediction = best_optimized_model_info['scaler']

    # A stored scaler means the model was trained on standardised features.
    # Compare against None explicitly instead of relying on the truthiness of
    # an estimator object.
    X_test_final = X_test_scaled if scaler_for_prediction is not None else X_test
    y_pred_best = model_for_prediction.predict(X_test_final)

    print(f"\nClassification Report for {best_model_name}:")
    print(classification_report(y_test, y_pred_best))

    # Feature importance for tree-based models
    if best_model_name in ['Decision Tree', 'Random Forest']:
        try:
            feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': model_for_prediction.feature_importances_
            }).sort_values('importance', ascending=False)

            print(f"\nTop 10 Feature Importances ({best_model_name}):")
            print(feature_importance.head(10).round(4))
        except Exception as err:
            # Still best effort, but no longer traps KeyboardInterrupt or
            # SystemExit, and the cause is reported instead of hidden.
            print("Could not extract feature importances.")
            print(f"Reason: {err!r}")

    # Save the best optimized model together with everything needed to use it:
    # the scaler (None for models trained on raw features), the feature order,
    # the chosen parameters and the measured performance.
    best_model_package = {
        'model': model_for_prediction,
        'scaler': scaler_for_prediction,
        'feature_names': X.columns.tolist(),
        'model_name': best_model_name,
        'parameters': best_result['Best_Params'],
        'performance': {
            'cv_f1': best_result['CV_F1'],
            'test_f1': best_result['Test_F1'],
            'test_accuracy': best_result['Test_Accuracy']
        }
    }

    joblib.dump(best_model_package, '../models/best_optimized_model.pkl')
    print("\nBest optimized model package saved to ../models/best_optimized_model.pkl")

    # Save tuning results; the parameter dict is stored as its string form so
    # it fits in a single CSV cell.
    tuning_results_df = pd.DataFrame({
        k: {
            'CV_F1': v['CV_F1'],
            'Test_Accuracy': v['Test_Accuracy'],
            'Test_F1': v['Test_F1'],
            'Best_Params': str(v['Best_Params'])
        }
        for k, v in tuning_results.items()
    }).T

    tuning_results_df.to_csv('../models/tuning_results.csv')
    print("Tuning results saved to ../models/tuning_results.csv")

    # Save the comparison only if one was actually built during this run.
    if comparison_df is not None:
        comparison_df.to_csv('../models/model_comparison.csv')
        print("Model comparison saved to ../models/model_comparison.csv")

    print("\nHyperparameter tuning completed successfully!")

    print("\n" + "=" * 40)
    print("SUMMARY")
    print("=" * 40)
    print(f"Number of models tuned: {len(tuning_results)}")
    print(f"Best model: {best_model_name}")
    print(f"Best F1-score: {best_result['Test_F1']:.3f}")
    if baseline_results is not None and best_model_name in baseline_results.index:
        improvement = best_result['Test_F1'] - baseline_results.loc[best_model_name, 'F1-Score']
        print(f"F1-score improvement: {improvement:+.3f}")

else:
    print("No models were successfully tuned.")

Dataset shape: (297, 8)
Target value counts:
target
0    160
1    137
Name: count, dtype: int64
Number of unique target values: 2
Classification type: Binary
F1-score averaging method: binary

Training set: (237, 8)
Test set: (60, 8)

Baseline Results:
                     Accuracy  F1-Score
Logistic Regression     0.850     0.830
Decision Tree           0.733     0.704
Random Forest           0.817     0.784
SVM                     0.833     0.808

HYPERPARAMETER TUNING

Tuning Logistic Regression...
Best parameters: {'C': 0.1, 'max_iter': 1000, 'solver': 'liblinear'}
Best CV F1-score: 0.802
Test Accuracy: 0.867
Test F1-score: 0.852

Tuning Decision Tree...
Best parameters: {'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best CV F1-score: 0.781
Test Accuracy: 0.817
Test F1-score: 0.800

Tuning Random Forest...
Best parameters: {'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 100}
Best CV F1-score: 0.797
Test Accuracy: 0.833
Test F1-score: 0.808

Tuning SVM...
