In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the processed data
data = pd.read_csv('heart_disease_processed.csv')

In [3]:
# Separate features and target
X = data.drop('target', axis=1)
y = data['target']

In [4]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

In [6]:
# Train and evaluate models
results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5)
    
    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std()
    }
    
    # Print results
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Cross-validation mean accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(f'confusion_matrix_{name.replace(" ", "_").lower()}.png')
    plt.close()


Logistic Regression:
Accuracy: 0.8195
Cross-validation mean accuracy: 0.8585 (+/- 0.0545)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.75      0.80       102
         1.0       0.78      0.89      0.83       103

    accuracy                           0.82       205
   macro avg       0.83      0.82      0.82       205
weighted avg       0.83      0.82      0.82       205


Decision Tree:
Accuracy: 0.9854
Cross-validation mean accuracy: 0.9912 (+/- 0.0143)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       102
         1.0       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205


Random Forest:
Accuracy: 0.9854
Cross-validation mean accuracy: 0.9971 (+/- 0.0117)

Classification Report:
             

In [7]:
# Compare model performances
results_df = pd.DataFrame(results).T
print("\nModel Comparison:")
print(results_df)


Model Comparison:
                     Accuracy   CV Mean    CV Std
Logistic Regression  0.819512  0.858537  0.027247
Decision Tree        0.985366  0.991220  0.007169
Random Forest        0.985366  0.997073  0.005854
SVM                  0.882927  0.923902  0.024988


In [8]:
# Visualize model comparison
plt.figure(figsize=(10, 6))
results_df[['Accuracy', 'CV Mean']].plot(kind='bar', yerr=results_df['CV Std'])
plt.title('Model Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()

<Figure size 1000x600 with 0 Axes>

In [9]:
# Identify the best model
best_model_name = results_df['CV Mean'].idxmax()
best_model = models[best_model_name]

In [10]:
print(f"\nBest performing model: {best_model_name}")
print(f"Cross-validation mean accuracy: {results_df.loc[best_model_name, 'CV Mean']:.4f}")


Best performing model: Random Forest
Cross-validation mean accuracy: 0.9971


In [13]:
# Feature importance for the best model (if applicable)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
    plt.title(f'Top 10 Important Features - {best_model_name}')
    plt.tight_layout()
    plt.savefig('feature_importance_best_model.png')
    plt.close()
    
    print("\nTop 5 important features:")
    print(feature_importance.head())
elif best_model_name == 'Logistic Regression':
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': abs(best_model.coef_[0])
    }).sort_values('importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
    plt.title(f'Top 10 Important Features - {best_model_name}')
    plt.tight_layout()
    plt.savefig('feature_importance_best_model.png')
    plt.close()
    
    print("\nTop 5 important features:")
    print(feature_importance.head())


Top 5 important features:
     feature  importance
7     cp_0.0    0.098913
4    oldpeak    0.095042
3    thalach    0.092834
21    ca_0.0    0.083042
28  thal_2.0    0.082802
