Random forests
---

In [None]:
from sklearn.datasets import make_circles

# Generate the two-class circles data set (seeded, so re-runs give the same points)
X, y = make_circles(
    n_samples=200,
    noise=0.3,
    factor=0.3,
    shuffle=True,
    random_state=0,
)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Square aspect ratio: one unit on the x-axis equals one unit on the y-axis
fig, ax = plt.subplots()
ax.set_aspect('equal', adjustable='box')

# Scatter each class in its own colour (class 1 first, then class 0)
class1_idx = (y == 1)
for cls_mask, cls_color, cls_label in (
        (class1_idx, 'C3', 'class 1'),
        (~class1_idx, 'C0', 'class 0')):
    ax.scatter(X[cls_mask, 0], X[cls_mask, 1], c=cls_color, label=cls_label)
ax.legend()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A forest with a single, depth-limited tree stands in for a decision tree
dt = RandomForestClassifier(
    max_depth=5,
    n_estimators=1,
    random_state=0,
)

# Train on the full data set (kept as the cell's last expression)
dt.fit(X, y)

In [None]:
import numpy as np

# Helper function
def decision_surface(ax, x1, x2, y, estimator):
    """Draw the data points and the estimator's decision surface on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives every artist (points, contours, legend).
    x1, x2 : array-like
        First and second feature of each data point; indexed with a
        boolean mask, so they must support NumPy-style boolean indexing.
    y : array-like
        Class labels. Points with ``y == 1`` are drawn as "class 1",
        all other points as "class 0".
    estimator : object
        Fitted model whose ``predict`` accepts an array with one
        ``(x1, x2)`` pair per row and returns one prediction per row.

    Notes
    -----
    All drawing goes through ``ax`` rather than the pyplot state machine,
    so the surface ends up on the axes that was passed in even when it is
    not pyplot's current axes. The figure is shown with ``plt.show()``
    before returning.
    """
    # Same scale for x- and y-axis
    ax.set_aspect('equal', adjustable='box')

    # Plot data points
    class1_idx = (y == 1)
    ax.scatter(x1[class1_idx], x2[class1_idx], color='C3', label='class 1')
    ax.scatter(x1[~class1_idx], x2[~class1_idx], color='C0', label='class 0')

    # Create a 500 x 500 grid spanning the axis limits left by the scatter plots
    xlim, ylim = ax.get_xlim(), ax.get_ylim()
    x_values = np.linspace(*xlim, num=500)
    y_values = np.linspace(*ylim, num=500)
    xx, yy = np.meshgrid(x_values, y_values)
    grid_points = np.c_[xx.flatten(), yy.flatten()]

    # Compute one prediction per grid point and restore the grid shape
    preds = estimator.predict(grid_points)
    zz = preds.reshape(xx.shape)

    # Draw decision boundary (the 0.5 level between predictions 0 and 1)
    ax.contour(xx, yy, zz, levels=[0.5], colors='gray')

    # Plot decision surface with level curves
    ax.contourf(xx, yy, zz, alpha=0.1, cmap=plt.cm.coolwarm)
    ax.legend()
    plt.show()

In [None]:
# Plot decision surface
fig, ax = plt.subplots()
decision_surface(ax, X[:, 0], X[:, 1], y, dt)

In [None]:
# Single-tree forest with no depth limit (max_depth=None), built and fitted in one step
dt_max_depth = RandomForestClassifier(
    max_depth=None,
    n_estimators=1,
    random_state=0,
).fit(X, y)

# Show its decision surface on a fresh figure
fig, ax = plt.subplots()
decision_surface(ax, X[:, 0], X[:, 1], y, dt_max_depth)

In [None]:
# Forest of 10 trees with no depth limit, built and fitted in one step
rf_10 = RandomForestClassifier(
    max_depth=None,
    n_estimators=10,
    random_state=0,
).fit(X, y)

# Show its decision surface on a fresh figure
fig, ax = plt.subplots()
decision_surface(ax, X[:, 0], X[:, 1], y, rf_10)

In [None]:
# Large forest of 200 trees with no depth limit, built and fitted in one step
rf_200 = RandomForestClassifier(
    max_depth=None,
    n_estimators=200,
    random_state=0,
).fit(X, y)

# Show its decision surface on a fresh figure
fig, ax = plt.subplots()
decision_surface(ax, X[:, 0], X[:, 1], y, rf_200)

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate (cv=10) the single max-depth tree and report its mean test score
dt_scores = cross_validate(dt_max_depth, X, y, cv=10)
dt_mean_score = np.mean(dt_scores['test_score'])
print('Decision tree - mean test {:.3f}'.format(dt_mean_score))
# Prints: 0.780

# Same evaluation for the 200-tree, max-depth random forest
rf_scores = cross_validate(rf_200, X, y, cv=10)
rf_mean_score = np.mean(rf_scores['test_score'])
print('Random forest - mean test {:.3f}'.format(rf_mean_score))
# Prints: 0.825

Algorithm behind random forests
---

```python
# Build each tree from the random forest
for tree in random forest:
    # Bagging
    X_subset = draw a bootstrap sample of the data points (random, with replacement)

    # Build each split recursively
    for split in tree:
        # "Randomized feature selection"
        idxs = draw a random subset of features

        # For each feature: find best "split condition"
        for i in idxs:
            # Conditions of the form: X_subset[:, i] < k (feature i compared to threshold k)
            split = Find k with minimum impurity

        # Split tree
        Use split with minimum impurity
```