# An Illustration of Gradient Boosting

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, plot_tree

%matplotlib inline

In [None]:
# Sample the target function: 201 evenly spaced points on [0, 20]
# and the sine of each point.
X = np.linspace(0, 20, 201)
y = np.sin(X)

# Plot the curve that the following cells try to approximate.
fig, ax = plt.subplots()
ax.plot(X, y);

### Step 1 

Here, we simply use the average of our target values as the first model of our data. Note that the "actual function" behind our data is a sine curve. We are going to approximate it with gradient boosting, using a decision tree regressor as the weak learner!

In [None]:
fig, ax = plt.subplots()
ax.plot(X, y)
# f0 is the constant baseline model: the mean of all targets.
f0 = y.mean()
# Draw the baseline as a horizontal line from x = 0 to x = 20.
ax.hlines(f0, 0, 20);

##### OK, so we have our extremely basic, extremely inaccurate model. Let's go ahead and build off of that.

In [None]:
# Display the value of the constant baseline model.
f0

In [None]:
# Residuals of the baseline: the part of y that the constant f0
# does not explain, one value per sample.

e0 = y - f0
e0

In [None]:
# Plot the baseline residuals e0 against X.

fig, ax = plt.subplots()
ax.scatter(X, e0);

### Step 2: fitting a "stump" to the residuals

In [None]:
# Reshape X into a single-column 2-D array for fitting.
data = X.reshape(-1, 1)
# A "stump": a decision tree limited to depth 1.
f1 = DecisionTreeRegressor(max_depth=1)

# Fit the stump to the residuals of the baseline (not to y itself),
# then add its predictions to the baseline to get the ensemble output.
f1.fit(data, e0)
ensemble_preds_1 = f1.predict(data) + f0

# Compare the two-term ensemble (red points) with the data (line).
fig, ax = plt.subplots()
ax.plot(X, y)
ax.scatter(X, ensemble_preds_1,
          c='r');

In [None]:
# The DecisionTreeRegressor is minimizing
# mean squared error. Since we're only
# splitting once, we're simply predicting
# the mean of each of the two groups formed
# by the split.
#
# Reproduce that search by hand: for every
# possible split position j, e0[:j] goes to
# the left group and e0[j:] to the right,
# and each group is predicted by its own
# mean. We score a split by the summed
# squared error of both groups; because the
# total number of points is fixed, this has
# the same minimizer as the mean squared
# error.

mses = []
# Try every split that leaves at least one
# point on each side.
for j in range(1, len(e0)):
    left, right = e0[:j], e0[j:]
    mse = np.sum((left - left.mean())**2)
    mse += np.sum((right - right.mean())**2)
    mses.append(mse)

# mses[0] belongs to j = 1, so add 1 to turn
# the list position into the split index j
# (the left group is then e0[:j]).
mses.index(min(mses)) + 1

In [None]:
# Display the error computed for each candidate split position.
mses

In [None]:
# Draw the fitted stump f1.
plot_tree(f1);

In [None]:
# Mean residual of the first 29 samples, i.e. the left group of a split
# at index 29.
# NOTE(review): 29 is hard-coded; presumably it is the split position
# found by the stump plotted above - confirm against the tree.
e0[:29].mean()

In [None]:
# Mean residual of the remaining samples (index 29 onward), i.e. the
# right group of the same split.
e0[29:].mean()

In [None]:
# Display the stump's own prediction for every sample (without f0 added).
f1.predict(data)

### Step 3: fitting another "stump" to the residuals of the previous model

In [None]:
# Residuals of the two-term ensemble f0 + f1: what is still unexplained.
e1 = y - (f0 + f1.predict(data))

In [None]:
# Fit a second stump to the residuals left by f0 + f1 and add it to
# the ensemble.
f2 = DecisionTreeRegressor(max_depth=1)
f2.fit(data, e1)
ensemble_preds_2 = f1.predict(data) + f2.predict(data) + f0

# First series: the residuals e1 that f2 was fitted to.
# Second series: the prediction of the whole ensemble f0 + f1 + f2
# (not the output of f2 alone).
fig, ax = plt.subplots()
ax.scatter(X, e1)
ax.scatter(X, ensemble_preds_2)
plt.title('fitting to residuals (y - (f0(x) + f1(x)))');

In [None]:
# Compare the three-term ensemble f0 + f1 + f2 (red points) with the
# data (line).
fig, ax = plt.subplots()
ax.plot(X, y)
ax.scatter(X, ensemble_preds_2,
          c='r')
plt.title('Model v. our data');

### Fitting yet another "stump"

In [None]:
# Residuals of the current ensemble f0 + f1 + f2.
e2 = y - (f2.predict(data) + f1.predict(data) + f0)
# Fit a third stump to those residuals.
f3 = DecisionTreeRegressor(max_depth=1)
f3.fit(data, e2)

# First series: the residuals e2 that f3 was fitted to.
# Second series: the prediction of the whole ensemble f0 + f1 + f2 + f3
# (not the output of f3 alone).
fig, ax = plt.subplots()
ax.scatter(X, e2)
ensemble_preds_3 = f3.predict(data) + f2.predict(data) + f1.predict(data) + f0
ax.scatter(X, ensemble_preds_3)
plt.title('fitting to residuals (y - (f0(x) + f1(x) + f2(x)))');

In [None]:
# Compare the four-term ensemble f0 + f1 + f2 + f3 (red points) with
# the data (line).
fig, ax = plt.subplots()
# Plot the stored targets `y` rather than recomputing np.sin(X), so the
# figure always shows the same data the stumps were fitted to.
ax.plot(X, y)
ax.scatter(X, ensemble_preds_3,
          c='r')
plt.title('Model v. our data');

### Another one

In [None]:
# Residuals of the current ensemble f0 + f1 + f2 + f3.
e3 = y - (f3.predict(data) + f2.predict(data) + f1.predict(data) + f0)
# Fit a fourth stump to those residuals.
f4 = DecisionTreeRegressor(max_depth=1)
f4.fit(data, e3)

# First series: the residuals e3 that f4 was fitted to.
# Second series: the prediction of the whole ensemble including f4
# (not the output of f4 alone).
fig, ax = plt.subplots()
ax.scatter(X, e3)
ensemble_preds_4 = f4.predict(data) + f3.predict(data) + f2.predict(data)\
    + f1.predict(data) + f0
ax.scatter(X, ensemble_preds_4);

In [None]:
# Compare the five-term ensemble f0 + f1 + ... + f4 (red points) with
# the data (line).
fig, ax = plt.subplots()
# Plot the stored targets `y` rather than recomputing np.sin(X), so the
# figure always shows the same data the stumps were fitted to.
ax.plot(X, y)
ax.scatter(X, ensemble_preds_4,
          c='r')
plt.title('Model v. our data');

### And another

In [None]:
# Residuals of the current ensemble f0 + f1 + ... + f4. Use the stored
# targets `y`, as the earlier steps do, instead of recomputing
# np.sin(X); the residuals must be measured against the data the
# stumps are actually fitted to.
e4 = y - (f4.predict(data) + f3.predict(data) + f2.predict(data)
          + f1.predict(data) + f0)
# Fit a fifth stump to those residuals.
f5 = DecisionTreeRegressor(max_depth=1)
f5.fit(data, e4)

# First series: the residuals e4 that f5 was fitted to.
# Second series: the prediction of the whole ensemble including f5
# (not the output of f5 alone).
fig, ax = plt.subplots()
ax.scatter(X, e4)
ensemble_preds_5 = f5.predict(data) + f4.predict(data) + f3.predict(data)\
    + f2.predict(data) + f1.predict(data) + f0
ax.scatter(X, ensemble_preds_5);

In [None]:
# Compare the six-term ensemble f0 + f1 + ... + f5 (red points) with
# the data (line).
fig, ax = plt.subplots()
# Plot the stored targets `y` rather than recomputing np.sin(X), so the
# figure always shows the same data the stumps were fitted to.
ax.plot(X, y)
ax.scatter(X, ensemble_preds_5,
          c='r')
plt.title('Model v. our data');

### Let's make a function already!

In [None]:
def simple_boosting_algorithm(X, y, n_learners, learner,
                              learning_rate, show_each_step=True):
    """Fit and plot a simple boosted ensemble for one-feature regression.

    The ensemble starts from the mean of ``y``. On each of ``n_learners``
    rounds, ``learner`` is refitted to the current residuals and its
    predictions, scaled by ``learning_rate``, are added to the running
    ensemble prediction. Everything is drawn on a new matplotlib figure;
    nothing is returned.

    params:
        X - 1-D array of feature values (reshaped to a single column
            before fitting)
        y - 1-D array of targets, same length as X
        n_learners - number of boosting rounds
        learner - regressor with ``fit``/``predict`` and a ``max_depth``
            attribute; the same object is refitted on every round
        learning_rate - factor applied to each round's predictions
            before they are added to the ensemble
        show_each_step - if True, will show with each additional learner;
            if False, only the final ensemble prediction is drawn
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # The learner expects a 2-D feature matrix; build it once instead of
    # reshaping twice on every round.
    features = X.reshape(-1, 1)

    # Round zero: an array of len(y) filled with the mean of y.
    f0 = y.mean()
    ensemble_predictions = np.full(len(y), fill_value=f0)

    fig, ax = plt.subplots(figsize=(20, 10))
    # Draw the target curve once. Re-plotting it on every round would only
    # stack identical lines and advance the colour cycle.
    ax.plot(X, y)

    for i in range(n_learners):
        # Fit the learner to whatever the ensemble still gets wrong,
        # then take a (shrunken) step in that direction.
        residuals = y - ensemble_predictions
        f = learner.fit(features, residuals)
        ensemble_predictions = (ensemble_predictions
                                + learning_rate * f.predict(features))
        # Intermediate rounds are drawn only on request; the final round
        # is always drawn below, so skip it here to avoid a duplicate.
        if show_each_step and i < n_learners - 1:
            ax.scatter(X, ensemble_predictions,
                       c='r')

    # Final ensemble prediction (the baseline mean if n_learners is 0).
    ax.scatter(X, ensemble_predictions,
               c='r')

    plt.title(f'With {n_learners} learners with a depth of '
              f'{learner.max_depth}'
              f' and a learning rate of {learning_rate}')
    
   

In [None]:
# One stump with a learning rate of 0.001: the plotted prediction is the
# mean of y plus 0.001 times the stump's output. Only the final
# ensemble is drawn.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=1,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.001,
                          show_each_step=False)

In [None]:
# 100 stumps with a learning rate of 0.01; only the final ensemble is
# drawn.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=100,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.01,
                          show_each_step=False)

In [None]:
# 10,000 stumps with a learning rate of 0.001; only the final ensemble
# is drawn.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=10000,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.001,
                          show_each_step=False)

In [None]:
# 100,000 stumps with a learning rate of 0.01; only the final ensemble
# is drawn.
# NOTE(review): this performs 100,000 sequential fits, so expect the
# cell to take noticeably longer than the others.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=100000,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.01,
                          show_each_step=False)

In [None]:
# 20 stumps with a learning rate of 0.1. With show_each_step=True the
# ensemble prediction after every round is overlaid on the same axes.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=20,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.1,
                          show_each_step=True)

In [None]:
# 60 stumps with a learning rate of 0.1, drawing the prediction after
# every round on the same axes.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=60,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.1,
                          show_each_step=True)

In [None]:
# 80 stumps with a learning rate of 0.1, drawing the prediction after
# every round on the same axes.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=80,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.1,
                          show_each_step=True)

In [None]:
# 200 stumps with a learning rate of 0.1, drawing the prediction after
# every round on the same axes.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=200,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.1,
                          show_each_step=True)

In [None]:
# 1,000 stumps with a smaller learning rate of 0.01, drawing the
# prediction after every round on the same axes.
simple_boosting_algorithm(X=X,
                          y=y,
                          n_learners=1000,
                          learner=DecisionTreeRegressor(max_depth=1),
                          learning_rate=0.01,
                          show_each_step=True)