# Heart Disease Prediction — EDA + Machine Learning
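This notebook performs exploratory data analysis (EDA) on a heart disease dataset and compares several classifiers (logistic regression, random forest, k-nearest neighbours, and SVM) for predicting heart disease.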

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Apply seaborn's 'whitegrid' style to the plots drawn in this notebook.
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline

In [None]:
# Read the dataset from a CSV file in the current working directory.
df = pd.read_csv("heart.csv")

# Report the (rows, columns) shape, then preview the first five rows.
print("Shape:", df.shape)
df.head(n=5)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() (that would render a stray "None" under the report).
df.info()
display(df.describe())

# Drop exact duplicate rows, if any. Reassigning instead of mutating in place
# keeps the lineage explicit; the name `df` is preserved for later cells.
n_duplicates = int(df.duplicated().sum())
df = df.drop_duplicates()
print('Duplicate rows removed:', n_duplicates)

# Per-column count of missing values after de-duplication.
print('Missing values:\n', df.isnull().sum())

In [None]:
# Columns whose distributions are plotted as histograms with a KDE overlay.
cols = ['age','trestbps','chol','thalach','oldpeak']

# One panel per column on a 2x3 grid; only the panels actually needed are
# created, so the sixth grid slot stays empty.
fig = plt.figure(figsize=(15,10))
for position, column in enumerate(cols, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(column)
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows fall into each value of the target column 'num'.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='num', data=df, palette='Set2', ax=ax)
ax.set_title('Target distribution (0 = No disease, 1 = Disease)')
plt.show()

In [None]:
# Box plot of age, grouped by the value of the target column 'num'.
fig, ax = plt.subplots(figsize=(8,5))
sns.boxplot(x='num', y='age', data=df, ax=ax)
ax.set_title('Age vs Heart Disease')
plt.show()

In [None]:
# Pairwise correlation between the columns of df, drawn as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Separate the predictors from the target column 'num'.
X = df.drop('num', axis=1)
y = df['num']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# mean/variance of the held-out rows leak into the training features and make
# the test scores optimistic. test_size and random_state are unchanged, so the
# same rows land in each fold as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training fold only, then apply the
# same transformation to the test fold.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that references X_scaled still works: the full
# feature matrix transformed with the training-fold statistics.
X_scaled = scaler.transform(X)

In [None]:
# Candidate classifiers, all trained on the same scaled training fold.
# Every stochastic estimator is seeded so the cell is reproducible on re-run:
# SVC with probability=True runs an internal cross-validation to calibrate
# probabilities, which is randomised unless random_state is set.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

# Fit each model and record its accuracy on the held-out test fold.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(name, 'accuracy:', acc)

# Recap of all test accuracies in one place.
print('\nSummary of accuracies:')
for k,v in results.items():
    print(k, ':', v)

In [None]:
# Fit a seeded random forest on the training fold and read its importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
importances = rf.feature_importances_

# Feature labels: every column of df except the target 'num'.
feat_names = df.columns.drop('num')

# Pair each feature with its importance and rank from most to least important.
imp_df = (
    pd.DataFrame({'feature': feat_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)
imp_df

In [None]:
# Train the random forest used for the final report and predict the test fold.
best = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = best.predict(X_test)

# Print each evaluation metric under its heading, in a fixed order.
for heading, metric in (
    ('Accuracy:', accuracy_score),
    ('\nConfusion Matrix:\n', confusion_matrix),
    ('\nClassification Report:\n', classification_report),
):
    print(heading, metric(y_test, y_pred))

## Conclusion
- Performed EDA and compared multiple models.
- Random Forest provides feature importances and achieves competitive accuracy.

### Next steps
- Hyperparameter tuning
- Cross-validation
- Use the full UCI dataset for better results