In [None]:
# https://archive.ics.uci.edu/dataset/45/heart+disease

In [None]:

import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the raw CSV from the project's data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/heart_attack_prediction_dataset.csv")

In [None]:
# First look at the raw table: its dimensions, then the leading 15 rows.
print("Shape of dataset:", df.shape)
print("First 15 rows:")
display(df.iloc[:15])

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()

In [None]:
# Per-column count of values pandas already recognises as missing (NaN/None).
print(df.isna().sum())

In [None]:
# Data Preprocessing & Feature Engineering

# Count '?' placeholders in the problematic features.
# Every column of the summary is derived from this single list so a row's
# count and percentage always describe the feature named on that row
# (previously the last two percentages were listed in a different order
# than the feature names, attaching them to the wrong rows).
features_to_check = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                     'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

# Number of cells equal to the literal string '?' in each feature
placeholder_counts = [(df[feature] == '?').sum() for feature in features_to_check]

missing_analysis = pd.DataFrame({
    'feature': features_to_check,
    'total_missing': placeholder_counts,
    # Share of rows holding the placeholder, expressed in percent
    'missing_percentage': [(count / len(df)) * 100 for count in placeholder_counts],
})

print("Missing Value Analysis:")
print(missing_analysis)

In [None]:
from sklearn.impute import KNNImputer


def clean_heart_disease_data(df):
    """
    Clean the UCI Heart Disease table and derive a binary target.

    The input is copied, column names are stripped of surrounding whitespace,
    the 'slope', 'thal' and 'ca' columns are dropped, '?' placeholders become
    NaN, the expected columns are coerced to numeric, missing feature values
    are filled with a KNN imputer, rows that still contain NaN are removed,
    and a ``heart_disease`` column (1 when ``num`` > 0, else 0) is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing the ``num`` target column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df`` (original index labels kept) with the extra
        ``heart_disease`` column.

    Raises
    ------
    KeyError
        If the ``num`` column is absent.
    """
    target_col = 'num'

    # Make a copy to avoid modifying original
    df_clean = df.copy()

    # Step 1: Clean column names first, so the drop below also matches
    # headers that carry stray leading/trailing whitespace
    df_clean.columns = df_clean.columns.str.strip()

    # Step 2: Drop high-missing columns
    columns_to_drop = ['slope', 'thal', 'ca']
    df_clean = df_clean.drop(columns=columns_to_drop, errors='ignore')

    # Step 3: Replace '?' with NaN for proper handling
    df_clean = df_clean.replace('?', np.nan)

    # Step 4: Convert numerical, categorical and target columns to numeric;
    # anything that cannot be parsed becomes NaN
    numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang']
    for col in numerical_cols + categorical_cols + [target_col]:
        if col in df_clean.columns:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Step 5: Impute missing FEATURE values using KNNImputer.
    # The target is kept out of the imputer so it neither steers the
    # neighbour search nor gets filled with a synthetic value; assigning back
    # into the frame also keeps the original row index.
    # NOTE(review): imputed categorical codes may come back non-integer if
    # the imputer averages neighbours - confirm whether rounding is wanted.
    feature_cols = [col for col in df_clean.columns if col != target_col]
    imputer = KNNImputer(n_neighbors=5)
    df_clean[feature_cols] = imputer.fit_transform(df_clean[feature_cols])

    # Step 6: Drop rows with any remaining missing values
    # (after imputation these are rows whose target is missing)
    initial_rows = len(df_clean)
    df_clean = df_clean.dropna()
    final_rows = len(df_clean)

    print(f"  Data cleaning completed:")
    print(f"  Rows removed due to missing values: {initial_rows - final_rows}")
    print(f"  Final dataset shape: {df_clean.shape}")

    # Step 7: Create binary target
    df_clean['heart_disease'] = (df_clean[target_col] > 0).astype(int)

    return df_clean


# Run the cleaning function on the raw frame
df_clean = clean_heart_disease_data(df)

# Report, column by column, whether the '?' placeholder is still present
print(" Verification - No more '?' values:")
for column_name in df_clean.columns:
    still_has_placeholder = (df_clean[column_name] == '?').any()
    if not still_has_placeholder:
        print(f"  {column_name}: Clean")
        continue
    print(f"  WARNING: Still found '?' in {column_name}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Create binary target (using the clean 'num' column): 1 when num > 0, else 0
df_clean['heart_disease'] = (df_clean['num'] > 0).astype(int)

print("  Target Distribution:")
print(df_clean['heart_disease'].value_counts())
# Accuracy obtained by always predicting the most frequent class
majority_share = df_clean['heart_disease'].value_counts(normalize=True).max()
print(f"Baseline accuracy: {majority_share:.3f}")

# Separate features and target ('num' is removed because the target is derived from it)
X = df_clean.drop(['num', 'heart_disease'], axis=1)
y = df_clean['heart_disease']

# Split the data, keeping the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Work on explicit copies: the scaled values are written back into these
# frames below, and assigning into a frame derived from another one can
# trigger pandas' chained-assignment warning.
X_train = X_train.copy()
X_test = X_test.copy()

print(f" Dataset Split:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Scale numerical features
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()

# Fit the scaler on the training rows only, then reuse its statistics for the test rows
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("  Feature scaling completed successfully!")

In [None]:
# Import all required models and utilities
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, \
    ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Candidate classifiers keyed by display name; insertion order is the order
# in which they are later cross-validated.
models = dict([
    ('Logistic Regression',
     LogisticRegression(class_weight='balanced', max_iter=1000, random_state=13)),
    ('Random Forest',
     RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=13)),
    ('Gradient Boosting',
     GradientBoostingClassifier(n_estimators=100, random_state=13)),
    ('AdaBoost',
     AdaBoostClassifier(n_estimators=100, random_state=13)),
    ('SVM',
     SVC(class_weight='balanced', kernel='linear', probability=True, random_state=13)),
    ('K-Nearest Neighbors',
     KNeighborsClassifier(n_neighbors=5)),
    ('Extra Trees',
     ExtraTreesClassifier(class_weight='balanced', n_estimators=100, random_state=13)),
])

In [None]:
# Estimators listed here are wrapped in a scaler+classifier pipeline;
# every other estimator is used as-is.
scaling_models = ['Logistic Regression', 'SVM', 'K-Nearest Neighbors']
model_pipelines = {
    label: (
        Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', estimator)
        ])
        if label in scaling_models
        else estimator
    )
    for label, estimator in models.items()
}

# Cross-validation scheme and metric shared by every candidate
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
scoring = make_scorer(accuracy_score)

# Score each candidate; a failing model is reported and recorded with zeros
# so the remaining candidates are still evaluated
cv_results = {}
print(" Running 5-Fold Cross-Validation...")
for label, estimator in model_pipelines.items():
    try:
        fold_scores = cross_val_score(estimator, X_train, y_train,
                                      cv=cv, scoring=scoring, n_jobs=-1)
        avg_score = fold_scores.mean()
        score_spread = fold_scores.std()
        cv_results[label] = {
            'mean_accuracy': avg_score,
            'std_accuracy': score_spread,
            'all_scores': fold_scores
        }
        print(f" {label:25s}: {avg_score:.4f} ± {score_spread:.4f}")
    except Exception as e:
        print(f" {label:25s}: Error - {str(e)}")
        cv_results[label] = {'mean_accuracy': 0, 'std_accuracy': 0, 'all_scores': []}

# Rank candidates from highest to lowest mean accuracy and print the table
print(" Cross-Validation Results (Sorted by Performance):")
print("=" * 60)
sorted_results = sorted(
    cv_results.items(),
    key=lambda entry: entry[1]['mean_accuracy'],
    reverse=True,
)

for label, summary in sorted_results:
    print(f"{label:25s}: {summary['mean_accuracy']:.4f} ± {summary['std_accuracy']:.4f}")

# The first entry of the ranking is the best model
best_model_name, best_summary = sorted_results[0]
best_cv_score = best_summary['mean_accuracy']
print(f" Best Model: {best_model_name} (CV Accuracy: {best_cv_score:.4f})")

In [None]:
# Horizontal bar chart of mean CV accuracy per model, in ranking order,
# with the fold standard deviation drawn as error bars
plt.figure(figsize=(14, 10))

model_names, mean_scores, std_scores = [], [], []
for label, summary in sorted_results:
    model_names.append(label)
    mean_scores.append(summary['mean_accuracy'])
    std_scores.append(summary['std_accuracy'])

y_positions = range(len(model_names))
colors = plt.cm.Set3(np.linspace(0, 1, len(model_names)))
bars = plt.barh(y_positions, mean_scores, xerr=std_scores,
                alpha=0.8, color=colors, edgecolor='black', height=0.7)

plt.yticks(y_positions, model_names)
plt.xlabel('Accuracy Score', fontsize=12)
plt.title('Model Performance Comparison(5-Fold Cross-Validation ± Standard Deviation)', fontsize=14, fontweight='bold')
plt.xlim(0, 1)
plt.grid(axis='x', alpha=0.3)

# Write "mean ± std" just to the right of each bar's end
for row, (avg, spread) in enumerate(zip(mean_scores, std_scores)):
    plt.text(avg + 0.01, row, f'{avg:.3f} ± {spread:.3f}',
             va='center', fontweight='bold', fontsize=10)

plt.tight_layout()
plt.show()

# Per-fold accuracies of the top-ranked model
print(f" Detailed CV Scores for {best_model_name}:")
best_scores = cv_results[best_model_name]['all_scores']
for fold_number, fold_score in enumerate(best_scores, start=1):
    print(f"  Fold {fold_number}: {fold_score:.4f}")

In [None]:
adaBoost = AdaBoostClassifier(n_estimators=200, random_state=13)

# Cross-validate this exact configuration so the CV figure reported below
# belongs to the model being tested. best_cv_score describes whichever model
# ranked first in the comparison (where AdaBoost ran with n_estimators=100),
# so it is not a valid reference for this estimator.
ada_cv_score = cross_val_score(adaBoost, X_train, y_train,
                               cv=cv, scoring=scoring, n_jobs=-1).mean()

adaBoost.fit(X_train, y_train)

# Evaluate on test set
y_pred = adaBoost.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
generalization_gap = abs(ada_cv_score - test_accuracy)

print(f"Final Performance Comparison:")
print(f"Cross-Validation Accuracy: {ada_cv_score:.4f}")
print(f"Test Set Accuracy: {test_accuracy:.4f}")
print(f"Generalization Gap: {generalization_gap:.4f}")

# A gap under 5 accuracy points is treated as acceptable
if generalization_gap < 0.05:
    print("Good generalization: CV and test performance are close!")
else:
    print("Potential overfitting: Large gap between CV and test performance")

print(" Detailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))

# Importance scores exposed by the fitted ensemble, one per training column
coef = adaBoost.feature_importances_

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': coef,
    'abs_importance': np.abs(coef)
}).sort_values('abs_importance', ascending=False)

print(f"AdaBoost features importance:\n {feature_importance}")

In [None]:
# Define low-importance features (based on feature_importances_)
drop_features = ['restecg', 'fbs']

# Drop those features from training and test sets
X_train_reduced = X_train.drop(columns=drop_features)
X_test_reduced = X_test.drop(columns=drop_features)

# Same AdaBoost configuration as before, now on the reduced feature set
ada_reduced = AdaBoostClassifier(n_estimators=200, random_state=13)

# Cross-validate this estimator on the reduced features so the CV figure
# reported below matches the model and inputs being tested. best_cv_score
# was computed on the full feature set for whichever model ranked first.
ada_reduced_cv_score = cross_val_score(ada_reduced, X_train_reduced, y_train,
                                       cv=cv, scoring=scoring, n_jobs=-1).mean()

ada_reduced.fit(X_train_reduced, y_train)

# Evaluate on test set
y_pred = ada_reduced.predict(X_test_reduced)
test_accuracy = accuracy_score(y_test, y_pred)
generalization_gap = abs(ada_reduced_cv_score - test_accuracy)

print(f"Final Performance Comparison:")
print(f"Cross-Validation Accuracy: {ada_reduced_cv_score:.4f}")
print(f"Test Set Accuracy: {test_accuracy:.4f}")
print(f"Generalization Gap: {generalization_gap:.4f}")

# A gap under 5 accuracy points is treated as acceptable
if generalization_gap < 0.05:
    print("Good generalization: CV and test performance are close!")
else:
    print("Potential overfitting: Large gap between CV and test performance")

print(" Detailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))

# Importance scores exposed by the fitted ensemble, one per remaining column
coef = ada_reduced.feature_importances_

feature_importance = pd.DataFrame({
    'feature': X_train_reduced.columns,
    'coefficient': coef,
    'abs_importance': np.abs(coef)
}).sort_values('abs_importance', ascending=False)

print(f"AdaBoost features importance:\n {feature_importance}")

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_curve, auc

# Score of the positive class (second probability column) for every test row
positive_class_proba = ada_reduced.predict_proba(X_test_reduced)
y_proba = positive_class_proba[:, 1]

# Area under the ROC curve computed from those scores
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC: {roc_auc:.4f}")

# Confusion matrix of the hard predictions, drawn as an annotated heatmap
y_pred = ada_reduced.predict(X_test_reduced)
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Precision-recall curve; its area is obtained by integrating precision over recall
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

plt.figure()
plt.plot(recall, precision, label=f'PR-AUC = {pr_auc:.4f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

In [None]:
# Model Saving
import pickle
from pathlib import Path

model_path = "../trained_models/heart_attack-ada_boost_model.pkl"

# Create the output directory when it is missing; open(..., 'wb') does not
# create parent directories and would otherwise raise FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Serialize the reduced-feature AdaBoost model to disk.
# Only unpickle this file from a trusted location: loading a pickle can run
# arbitrary code.
with open(model_path, 'wb') as f:
    pickle.dump(ada_reduced, f)