# Task 5 — Decision Trees & Random Forests (Heart Disease)

**Objective:**
- Train Decision Tree & Random Forest models
- Visualize and interpret them
- Study overfitting (tree depth)
- Compare accuracy
- Evaluate with cross-validation

> **Dataset expected:** `heart.csv`, containing a `target` column with the class label. Place it in the same folder as this notebook.

In [None]:
import os, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Default size (inches) for figures that do not request their own.
plt.rcParams.update({'figure.figsize': (8, 5)})

In [None]:
# Locate heart.csv
# Prefer the file sitting next to the notebook. Only when it is absent fall
# back to a recursive search for look-alike names (e.g. heart_disease.csv).
# The fallback list is sorted because glob does not guarantee any ordering,
# which would otherwise make the chosen file depend on the filesystem.
candidates = sorted(
    p for p in glob.glob('**/heart*.csv', recursive=True) if os.path.isfile(p)
)
print("CSV candidates:", candidates)
if os.path.isfile('heart.csv'):
    csv_path = 'heart.csv'
elif candidates:
    csv_path = candidates[0]
else:
    raise FileNotFoundError("heart.csv not found. Please put it in this folder.")

df = pd.read_csv(csv_path)
# Make it visible which file was actually used when several candidates exist.
print("Loaded:", csv_path)
print("Shape:", df.shape)
df.head()

In [None]:
# Quick look at the raw table: column dtypes, missing values per column,
# and how many rows fall into each target class.
print(df.dtypes)
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)
class_counts = df['target'].value_counts()
print("\nTarget distribution:\n", class_counts)

# Separate the predictors from the label column (label cast to int).
X = df.drop(columns=['target'])
y = df['target'].astype(int)

In [None]:
# Columns handled as categories (imputed with the mode, then one-hot encoded).
# Names missing from X are dropped from the list; every remaining column of X
# is handled as numeric.
categorical_like = [
    col
    for col in ['cp', 'restecg', 'slope', 'thal', 'ca', 'sex', 'fbs', 'exang']
    if col in X.columns
]
numeric_cols = [col for col in X.columns if col not in categorical_like]

# Numeric branch: fill gaps with the column median.
numeric_branch = SimpleImputer(strategy='median')

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer(transformers=[
    ('num', numeric_branch, numeric_cols),
    ('cat', categorical_branch, categorical_like),
])

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
(X_train.shape, X_test.shape)

In [None]:
# Decision Tree baseline
# Unconstrained tree (no max_depth) on the preprocessed features, scored on
# the held-out test split.
dt = Pipeline([('prep', preprocess), ('clf', DecisionTreeClassifier(random_state=42))])
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
print("Decision Tree accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nReport:\n", classification_report(y_test, y_pred_dt, digits=4))
# ROC AUC is computed from the predicted probability of the second class.
print("ROC AUC:", roc_auc_score(y_test, dt.predict_proba(X_test)[:,1]))

# Visualize shallow tree
dt_small = Pipeline([('prep', preprocess), ('clf', DecisionTreeClassifier(max_depth=3, random_state=42))])
dt_small.fit(X_train, y_train)

# The classifier never sees the raw columns: it is trained on the
# ColumnTransformer output (numeric columns followed by the one-hot columns).
# The split labels must therefore come from the fitted transformer; passing
# only `numeric_cols` mislabels nodes or runs out of names.
# Converted to a plain list because some scikit-learn releases reject arrays here.
tree_feature_names = list(dt_small.named_steps['prep'].get_feature_names_out())

plt.figure(figsize=(14,8))
plot_tree(dt_small.named_steps['clf'], filled=True, feature_names=tree_feature_names, class_names=['No','Yes'])
plt.show()

In [None]:
# Overfitting check
# For every max_depth from 1 to 14, record the mean 5-fold CV accuracy on the
# training split and the accuracy of a tree refit on that whole split, then
# plot both curves against depth.
depths = range(1, 15)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_scores, val_scores = [], []

for depth in depths:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    pipe = Pipeline([('prep', preprocess), ('clf', tree)])

    # Cross-validated accuracy (cross_val_score fits clones of `pipe`).
    fold_accuracies = cross_val_score(pipe, X_train, y_train, cv=skf, scoring='accuracy')
    val_scores.append(fold_accuracies.mean())

    # Accuracy on the very data the model was fit on.
    pipe.fit(X_train, y_train)
    train_predictions = pipe.predict(X_train)
    train_scores.append(accuracy_score(y_train, train_predictions))

plt.plot(depths, train_scores, label='Train')
plt.plot(depths, val_scores, label='CV')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Random Forest
# 300-tree forest on the same preprocessed features, scored on the held-out
# test split.
rf = Pipeline([('prep', preprocess), ('clf', RandomForestClassifier(n_estimators=300, random_state=42))])
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("Random Forest accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nReport:\n", classification_report(y_test, y_pred_rf, digits=4))
# ROC AUC is computed from the predicted probability of the second class.
print("ROC AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))

# Feature importance
# The forest is trained on the ColumnTransformer output, so it reports one
# importance per transformed column (numeric columns plus one column per
# one-hot category). Pairing them with the raw `X.columns` gives arrays of
# different lengths; the names must come from the fitted transformer.
importances = rf.named_steps['clf'].feature_importances_
transformed_names = rf.named_steps['prep'].get_feature_names_out()
fi = pd.DataFrame({'feature': transformed_names, 'importance': importances})
fi.sort_values('importance', ascending=False).head(10).plot.barh(x='feature', y='importance')
plt.show()

In [None]:
# Cross-validation
# 5-fold stratified CV over the full dataset. Each model is wrapped in the
# same preprocessing pipeline used everywhere else in the notebook, so that
# (a) the scores are comparable with the hold-out results above and
# (b) imputation and one-hot encoding are re-fit inside every training fold
#     instead of being skipped (the bare estimators would otherwise see raw,
#     un-imputed, integer-coded columns).
# cross_val_score fits clones, so the shared `preprocess` object is not refit.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
dt_cv = Pipeline([('prep', preprocess), ('clf', DecisionTreeClassifier(max_depth=5, random_state=42))])
rf_cv = Pipeline([('prep', preprocess), ('clf', RandomForestClassifier(n_estimators=300, random_state=42))])
scores_dt = cross_val_score(dt_cv, X, y, cv=skf, scoring='accuracy')
scores_rf = cross_val_score(rf_cv, X, y, cv=skf, scoring='accuracy')

print("Decision Tree CV:", scores_dt.mean(), "+/-", scores_dt.std())
print("Random Forest CV:", scores_rf.mean(), "+/-", scores_rf.std())