# Notebook 05: Tree and Ensemble Models

Understand decision trees, random forests, and gradient boosting.

## Learning Objectives
- Understand decision tree mechanics
- Explore tree hyperparameters
- Learn ensemble methods (Bagging, Boosting)
- Visualize feature importance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_moons, make_regression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor,
    AdaBoostClassifier, BaggingClassifier
)
from sklearn.metrics import accuracy_score, mean_squared_error

# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
# (The scikit-learn calls below additionally pass their own random_state=42.)
np.random.seed(42)
# Apply one shared matplotlib style sheet to every figure in the notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent matplotlib — confirm.
plt.style.use('seaborn-v0_8-whitegrid')

## Part 1: Decision Tree Fundamentals

In [None]:
# Toy 2-D dataset: 300 noisy "two moons" points, split 80/20 into train and test.
X, y = make_moons(n_samples=300, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a shallow (depth-3) tree; the next cell draws this same `tree` object.
tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Accuracy on the data the tree was fitted on versus the held-out split.
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

In [None]:
# Draw the fitted tree on an explicit axes instead of relying on pyplot's
# implicit "current figure" state.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree,
    feature_names=['X1', 'X2'],
    class_names=['Class 0', 'Class 1'],
    filled=True,  # colour the node boxes
    fontsize=10,
    ax=ax,
)
ax.set_title('Decision Tree Visualization')
plt.show()

In [None]:
# Decision boundary function
def plot_decision_boundary(model, X, y, ax, title, *, h=0.02, padding=0.5, cmap=None):
    """Draw a fitted classifier's decision regions over a 2-D dataset.

    The model is evaluated on a regular grid covering the data (plus a margin),
    the predicted labels are drawn as filled contours, and the points in ``X``
    are scattered on top, coloured by ``y``.

    Parameters
    ----------
    model : object
        Fitted estimator whose ``predict`` accepts an ``(n_points, 2)`` array
        and returns one label per row.
    X : array-like of shape (n_samples, 2)
        Points to overlay; also defines the extent of the grid.
    y : array-like of shape (n_samples,)
        Values used to colour the scattered points.
    ax : matplotlib axes
        Axes to draw on (``contourf``, ``scatter`` and ``set_title`` are used).
    title : str
        Title set on ``ax``.
    h : float, default 0.02
        Grid step in data units; smaller values give smoother region edges
        at the cost of more ``predict`` calls' worth of points.
    padding : float, default 0.5
        Margin added around the data range on every side.
    cmap : colormap or str, optional
        Colormap shared by the regions and the points. Defaults to
        ``plt.cm.RdYlBu``.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional with exactly two columns, or if ``h``
        is not positive.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != 2:
        raise ValueError(
            f"X must have shape (n_samples, 2) to be plotted, got {X.shape}"
        )
    if h <= 0:
        raise ValueError(f"h must be positive, got {h}")
    if cmap is None:
        # Resolved lazily so the default matches the original hard-coded choice.
        cmap = plt.cm.RdYlBu

    x_min, x_max = X[:, 0].min() - padding, X[:, 0].max() + padding
    y_min, y_max = X[:, 1].min() - padding, X[:, 1].max() + padding
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Predict one label per grid point, then fold the flat result back into
    # the grid's 2-D shape so it can be contoured.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.8, cmap=cmap)
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, edgecolors='black', s=30)
    ax.set_title(title)

# Sweep max_depth from a single split up to an unconstrained tree (None) and
# draw each model's decision regions in its own panel of a 2x3 grid.
depths = [1, 2, 3, 5, 10, None]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for panel, depth in zip(axes, depths):
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tree.fit(X_train, y_train)

    train_acc = tree.score(X_train, y_train)
    test_acc = tree.score(X_test, y_test)

    # Label shown in the panel title for the depth limit.
    depth_str = 'None' if not depth else str(depth)
    panel_title = f'max_depth={depth_str}\nTrain: {train_acc:.2f}, Test: {test_acc:.2f}'
    plot_decision_boundary(tree, X, y, panel, panel_title)

plt.tight_layout()
plt.show()

## Part 2: Tree Hyperparameters

In [None]:
# Harder synthetic problem: 1000 samples, 20 features of which 10 are informative.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_classes=2,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit one tree per min_samples_split value and record accuracy on both splits.
min_samples_values = [2, 5, 10, 20, 50, 100]
train_scores, test_scores = [], []

for min_samples in min_samples_values:
    tree = DecisionTreeClassifier(
        min_samples_split=min_samples, random_state=42
    ).fit(X_train, y_train)

    train_scores.append(tree.score(X_train, y_train))
    test_scores.append(tree.score(X_test, y_test))

# Train and test accuracy on the same axes, one marker per setting.
plt.figure(figsize=(10, 6))
for curve, line_format, curve_label in (
    (train_scores, 'bo-', 'Train'),
    (test_scores, 'ro-', 'Test'),
):
    plt.plot(min_samples_values, curve, line_format, label=curve_label)
plt.xlabel('min_samples_split')
plt.ylabel('Accuracy')
plt.title('Effect of min_samples_split on Tree Performance')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Exhaustive search over the three size controls of a decision tree
# (5 x 4 x 4 = 80 combinations), each scored by 5-fold cross-validated accuracy.
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
}

tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning combination, its cross-validated score, and its score on the
# held-out test split.
best_params = grid_search.best_params_
print("Best parameters:")
for param in best_params:
    value = best_params[param]
    print(f"  {param}: {value}")
print(f"\nBest CV score: {grid_search.best_score_:.4f}")
print(f"Test score: {grid_search.score(X_test, y_test):.4f}")

## Part 3: Random Forest

In [None]:
# Noisier moons data (500 points) to contrast one deep tree with a forest.
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Both models share the same depth limit; only the forest averages 100 trees.
tree = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
rf = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)

# Side-by-side decision regions, each titled with its held-out accuracy.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
tree_panel, forest_panel = axes

tree_title = f'Decision Tree\nTest Acc: {tree.score(X_test, y_test):.3f}'
forest_title = f'Random Forest (100 trees)\nTest Acc: {rf.score(X_test, y_test):.3f}'

plot_decision_boundary(tree, X, y, tree_panel, tree_title)
plot_decision_boundary(rf, X, y, forest_panel, forest_title)

plt.tight_layout()
plt.show()

In [None]:
# How forest size affects accuracy: refit on the 20-feature problem with a
# growing number of trees and track train, test and out-of-bag accuracy.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_classes=2,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

n_estimators_range = [1, 5, 10, 25, 50, 100, 200, 500]

# One (train, test, oob) row per forest size.
# NOTE(review): very small forests may leave some samples without an
# out-of-bag prediction, so expect a warning for the first few sizes — confirm.
score_rows = []
for n_est in n_estimators_range:
    rf = RandomForestClassifier(
        n_estimators=n_est, oob_score=True, random_state=42, n_jobs=-1
    )
    rf.fit(X_train, y_train)
    score_rows.append(
        (rf.score(X_train, y_train), rf.score(X_test, y_test), rf.oob_score_)
    )

# Transpose the rows into one list per curve.
train_scores, test_scores, oob_scores = map(list, zip(*score_rows))

plt.figure(figsize=(10, 6))
for curve, line_format, curve_label in (
    (train_scores, 'bo-', 'Train'),
    (test_scores, 'ro-', 'Test'),
    (oob_scores, 'go-', 'OOB'),
):
    plt.plot(n_estimators_range, curve, line_format, label=curve_label)
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.title('Random Forest: Effect of n_estimators')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Vary how many features each split may consider: two named heuristics plus
# explicit fractions of the feature count.
max_features_options = ['sqrt', 'log2', 0.3, 0.5, 0.7, 1.0]

results = []
for max_feat in max_features_options:
    rf = RandomForestClassifier(
        n_estimators=100, max_features=max_feat, random_state=42, n_jobs=-1
    ).fit(X_train, y_train)

    # Store the option as text so strings and floats share one column.
    row = dict(
        max_features=str(max_feat),
        train_acc=rf.score(X_train, y_train),
        test_acc=rf.score(X_test, y_test),
    )
    results.append(row)

# Tabulate one row per option.
df_results = pd.DataFrame(results)
print("Effect of max_features:")
print(df_results.to_string(index=False))

## Part 4: Feature Importance

In [None]:
# Train random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances and the feature indices ordered from most to
# least important (argsort is ascending, hence the reversal).
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot: one bar per feature, in ranked order, labelled with the original
# feature index.
bar_positions = range(len(importances))
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, [f'F{i}' for i in indices], rotation=45)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importances')
plt.tight_layout()
plt.show()

# Slice rather than index with range(10) so this still works (and simply
# lists fewer rows) if the data ever has fewer than ten features.
print("Top 10 most important features:")
for feature_idx in indices[:10]:
    print(f"  Feature {feature_idx}: {importances[feature_idx]:.4f}")

## Part 5: Gradient Boosting

In [None]:
# Compare three ensembles (Random Forest, Gradient Boosting, AdaBoost), each
# with 100 estimators, on the same train/test split.
models = dict([
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=42)),
])

print("Ensemble Methods Comparison:")
print("="*50)

for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    # 5-fold cross-validation on the training split only.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)

    # One report per model; the CV spread is shown as two standard deviations.
    print(
        f"\n{name}:",
        f"  Train Accuracy: {train_acc:.4f}",
        f"  Test Accuracy: {test_acc:.4f}",
        f"  CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})",
        sep="\n",
    )

In [None]:
# Gradient Boosting: how learning_rate and n_estimators trade off.
# Every (learning_rate, n_estimators) pair is fitted once and scored on the
# test split.
learning_rates = [0.01, 0.1, 0.5, 1.0]
n_estimators_range = [10, 50, 100, 200]

settings = [(lr, n_est) for lr in learning_rates for n_est in n_estimators_range]

results = []
for lr, n_est in settings:
    gb = GradientBoostingClassifier(
        learning_rate=lr, n_estimators=n_est, random_state=42
    ).fit(X_train, y_train)

    results.append({
        'learning_rate': lr,
        'n_estimators': n_est,
        'test_acc': gb.score(X_test, y_test),
    })

# Reshape the flat records into a learning_rate x n_estimators table and
# render it as an annotated heatmap.
df_results = pd.DataFrame(results)
pivot = df_results.pivot(index='learning_rate', columns='n_estimators', values='test_acc')

plt.figure(figsize=(10, 6))
sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlOrRd')
plt.title('Gradient Boosting: learning_rate vs n_estimators')
plt.show()

In [None]:
# Staged predictions - see how GB improves with more trees
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

# staged_predict yields the ensemble's predictions after each boosting stage,
# so scoring every yielded array gives accuracy as a function of tree count.
train_scores_staged = [
    accuracy_score(y_train, y_pred) for y_pred in gb.staged_predict(X_train)
]
test_scores_staged = [
    accuracy_score(y_test, y_pred) for y_pred in gb.staged_predict(X_test)
]

# Both curves come from the same fitted model, so they have one entry per
# stage and can share a 1-based "number of trees" axis.
tree_counts = range(1, len(train_scores_staged) + 1)

plt.figure(figsize=(10, 6))
plt.plot(tree_counts, train_scores_staged, 'b-', label='Train')
plt.plot(tree_counts, test_scores_staged, 'r-', label='Test')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.title('Gradient Boosting: Learning Progress')
plt.legend()
plt.grid(True)
plt.show()

## Part 6: Regression with Trees

In [None]:
# R² is imported here because the notebook's first cell does not provide it.
from sklearn.metrics import r2_score

# Synthetic regression task: 500 samples, 10 features (5 informative), noisy target.
X, y = make_regression(
    n_samples=500, n_features=10, n_informative=5, noise=20, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The regression counterparts of the classifiers used earlier.
models_reg = {
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

print("Regression Models Comparison:")
print("="*50)

for name in models_reg:
    model = models_reg[name]
    model.fit(X_train, y_train)

    # Score predictions on the fitted data and on the held-out data.
    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))

    print(
        f"\n{name}:",
        f"  Train R²: {train_r2:.4f}",
        f"  Test R²: {test_r2:.4f}",
        sep="\n",
    )

## Part 7: Bagging vs Boosting Visualization

In [None]:
# Decision regions of four models on the same noisy moons data.
X, y = make_moons(n_samples=300, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# BaggingClassifier is given no base estimator here, so it uses the library default.
models_compare = {
    'Single Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Bagging': BaggingClassifier(n_estimators=50, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=50, random_state=42),
}

# One panel per model in a 2x2 grid, titled with its held-out accuracy.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for panel, (name, model) in zip(axes, models_compare.items()):
    model.fit(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    panel_title = f'{name}\nTest Acc: {test_acc:.3f}'
    plot_decision_boundary(model, X, y, panel, panel_title)

plt.tight_layout()
plt.show()

## Part 8: Hyperparameter Tuning for Random Forest

In [None]:
# Grid search for Random Forest
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=10, n_classes=2, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3 x 4 x 3 x 2 = 72 combinations, each evaluated with 5-fold cross-validation.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Parallelise at the search level only. Asking for all cores in both the
# forest and the search nests one pool of workers inside another and
# oversubscribes the CPU; the search already keeps every core busy with
# independent fits. The fitted models and scores are unaffected by n_jobs.
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)

# Winning combination, its cross-validated accuracy, and the accuracy on the
# held-out test split.
print("\nBest Random Forest Parameters:")
for param, value in grid_search_rf.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest CV Score: {grid_search_rf.best_score_:.4f}")
print(f"Test Score: {grid_search_rf.score(X_test, y_test):.4f}")

## Summary

In this notebook, you learned:

### Decision Trees
- Tree structure and splitting criteria
- Key parameters: max_depth, min_samples_split, min_samples_leaf
- Prone to overfitting without constraints

### Ensemble Methods
- **Bagging**: Reduce variance, parallel training
- **Random Forest**: Bagging + feature randomization
- **Boosting**: Reduce bias, sequential training
- **Gradient Boosting**: Often the most accurate, but needs careful tuning

### Key Parameters
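- n_estimators: More trees generally help Random Forest (with diminishing returns); in Boosting, too many trees can overfit, so tune it together with learning_rate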
- max_features: Controls diversity in Random Forest
- learning_rate: Controls step size in Boosting

### Key Takeaways
- Ensembles almost always outperform single trees
- Random Forest is robust and easy to tune
- Gradient Boosting often achieves best performance
- Feature importance helps interpretability

### Next Steps
Continue to **Notebook 06** for SVM and kernel methods.