In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# --- 0. Data Preparation ---
# Load data and split into features (X) and target (y)
df = pd.read_csv('heart.csv')
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
feature_names = X.columns.tolist()

# --- 1. Train a Decision Tree Classifier and visualize the tree ---
print("--- 1. Decision Tree Training and Visualization ---")
# Limit max_depth for a readable visualization
dt_classifier_viz = DecisionTreeClassifier(random_state=42, max_depth=3)
dt_classifier_viz.fit(X_train, y_train)
accuracy_dt_viz = dt_classifier_viz.score(X_test, y_test)
print(f"Decision Tree Classifier (max_depth=3) Test Accuracy: {accuracy_dt_viz:.4f}")

# Visualize the tree and save to 'decision_tree.png'
plt.figure(figsize=(20, 10))
plot_tree(dt_classifier_viz,
          feature_names=feature_names,
          class_names=['No Disease', 'Disease'],
          filled=True,
          rounded=True,
          fontsize=10)
plt.title("Decision Tree Classifier (max_depth=3)")
plt.savefig('decision_tree.png')
plt.close()


# --- 2. Analyze overfitting and control tree depth ---
print("\n--- 2. Overfitting Analysis and Depth Control ---")
depths = range(1, 16)
train_scores = []
test_scores = []

for depth in depths:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    train_scores.append(dt.score(X_train, y_train))
    test_scores.append(dt.score(X_test, y_test))

# Find the depth with the maximum test accuracy
optimal_depth = depths[np.argmax(test_scores)]
max_test_accuracy = np.max(test_scores)

# Plotting the results to analyze overfitting and saving to 'depth_analysis.png'
plt.figure(figsize=(10, 6))
plt.plot(depths, train_scores, label='Training Accuracy', marker='o')
plt.plot(depths, test_scores, label='Testing Accuracy', marker='o')
plt.plot(optimal_depth, max_test_accuracy, 'r*', markersize=15, label=f'Optimal Depth: {optimal_depth} ({max_test_accuracy:.4f})')
plt.title('Decision Tree Accuracy vs. Tree Depth')
plt.xlabel('Max Tree Depth')
plt.ylabel('Accuracy')
plt.xticks(depths)
plt.legend()
plt.grid(True)
plt.savefig('depth_analysis.png')
plt.close()

# Retrain with the optimal depth
dt_optimal = DecisionTreeClassifier(max_depth=optimal_depth, random_state=42)
dt_optimal.fit(X_train, y_train)
print(f"Optimal Tree Depth: {optimal_depth}")
print(f"Decision Tree Classifier (Optimal Depth={optimal_depth}) Test Accuracy: {max_test_accuracy:.4f}")


# --- 3. Train a Random Forest and compare accuracy ---
print("\n--- 3. Random Forest Training and Comparison ---")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

accuracy_rf = rf_classifier.score(X_test, y_test)
print(f"Random Forest Classifier Test Accuracy: {accuracy_rf:.4f}")


# --- 4. Interpret feature importances (from Random Forest) ---
print("\n--- 4. Interpret Feature Importances ---")
importances = rf_classifier.feature_importances_
sorted_indices = np.argsort(importances)[::-1]

# Plot feature importances and saving to 'feature_importances.png'
plt.figure(figsize=(12, 6))
plt.title("Feature Importances (Random Forest)")
plt.bar(range(X_train.shape[1]), importances[sorted_indices], align='center')
plt.xticks(range(X_train.shape[1]), np.array(feature_names)[sorted_indices], rotation=45, ha='right')
plt.tight_layout()
plt.savefig('feature_importances.png')
plt.close()


# --- 5. Evaluate using cross-validation ---
print("\n--- 5. Evaluate using Cross-Validation (5-Fold) ---")
# Evaluate both models using 5-fold cross-validation on the whole dataset (X, y)
cv_dt_scores = cross_val_score(dt_optimal, X, y, cv=5, scoring='accuracy')
cv_rf_scores = cross_val_score(rf_classifier, X, y, cv=5, scoring='accuracy')

print(f"Optimal Decision Tree CV Mean Accuracy: {cv_dt_scores.mean():.4f} (Std: {cv_dt_scores.std():.4f})")
print(f"Random Forest CV Mean Accuracy: {cv_rf_scores.mean():.4f} (Std: {cv_rf_scores.std():.4f})")

--- 1. Decision Tree Training and Visualization ---
Decision Tree Classifier (max_depth=3) Test Accuracy: 0.8052

--- 2. Overfitting Analysis and Depth Control ---
Optimal Tree Depth: 10
Decision Tree Classifier (Optimal Depth=10) Test Accuracy: 0.9708

--- 3. Random Forest Training and Comparison ---
Random Forest Classifier Test Accuracy: 0.9805

--- 4. Interpret Feature Importances ---

--- 5. Evaluate using Cross-Validation (5-Fold) ---
Optimal Decision Tree CV Mean Accuracy: 1.0000 (Std: 0.0000)
Random Forest CV Mean Accuracy: 0.9971 (Std: 0.0059)
