Modify the Gradient Boosting scratch code in our lecture such that:
- Notice that we are still using max_depth = 1.  Try tuning min_samples_split and max_depth for the regression trees and see whether we can achieve a better MSE on the Boston housing data.
- Notice that we only wrote scratch code for gradient boosting regression. Add code so that it also works for binary classification, then load the breast cancer data from sklearn and check that it works.
- Further change the code so that it works for multiclass classification.  Load the digits data from sklearn and see that it works
- Put everything into a class.

In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report
from sklearn.datasets import load_digits
import numpy as np

In [2]:
# Load the Boston housing regression data and hold out 30% of it for testing.
# NOTE(review): load_boston was removed from recent scikit-learn releases
# (1.2+), so this cell needs an older version -- confirm the environment.
X, y = load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Print the shape of each split: train features/target, then test features/target.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(354, 13)
(354,)
(152, 13)
(152,)


In [3]:
class GradientBoosting:
    """Gradient boosting built on scikit-learn regression trees.

    A constant ``DummyRegressor`` (the mean of the target) gives the initial
    prediction; every following ``DecisionTreeRegressor`` is fitted to the
    residual ``y - current prediction`` and added with weight
    ``learning_rate``.

    With ``isClassification=True`` the labels are one-hot encoded, the summed
    outputs are passed through a softmax, and the trees are fitted to
    ``onehot - probabilities``.  Binary and multiclass problems share this
    code path.
    """

    def __init__(self, n_estimators=100, max_depth=2, min_sample_split=5, learning_rate=1,
                 isClassification=False) -> None:
        """Store the hyper-parameters and create the unfitted weak learners.

        Args:
            n_estimators: Number of regression trees to boost.
            max_depth: ``max_depth`` given to every tree.
            min_sample_split: ``min_samples_split`` given to every tree.
            learning_rate: Factor applied to each tree's prediction.
            isClassification: Treat ``y`` as class labels rather than a
                continuous target.
        """
        self.n_estimators = n_estimators
        self.tree_params = {'max_depth': max_depth, 'min_samples_split': min_sample_split}
        self.learning_rate = learning_rate
        self.isClassification = isClassification
        # Until fit() runs, ``models`` holds the unfitted trees; afterwards it
        # holds the trained ensemble ``[initial dummy model, tree_1, ...]``.
        self.models = self._new_trees()
        self._fitted = False

    def _new_trees(self):
        """Return ``n_estimators`` fresh, unfitted regression trees."""
        return [DecisionTreeRegressor(**self.tree_params) for _ in range(self.n_estimators)]

    def grad(self, y, h):
        """Return the residual ``y - h`` that the next tree is fitted to."""
        return y - h

    def fit(self, X, y):
        """Train the ensemble and return it.

        Args:
            X: Training features, passed unchanged to the scikit-learn
                estimators.
            y: Continuous targets, or class labels when ``isClassification``
                is set.

        Returns:
            The list ``[initial dummy model, tree_1, ..., tree_n]``; the same
            list is stored in ``self.models``.
        """
        if self.isClassification:
            y = self.oneHotEncoding(y)

        # After the first fit ``self.models`` starts with the dummy model, so
        # iterating it again would re-train that dummy as a weak learner.
        # Re-fitting therefore always starts from fresh trees.
        trees = self._new_trees() if getattr(self, '_fitted', False) else self.models

        # A constant mean prediction is the starting point of the ensemble.
        first_model = DummyRegressor(strategy='mean')
        first_model.fit(X, y)
        models_trained = [first_model]

        # Keep a running sum of the scaled tree outputs instead of asking all
        # previous trees to predict again on every boosting round.
        f0 = first_model.predict(X)
        boosting = 0
        for tree in trees:
            y_pred = f0 + boosting
            if self.isClassification:
                y_pred = self.softmax(y_pred)

            # The residual is the total error made by the ensemble so far.
            residual = self.grad(y, y_pred)
            tree.fit(X, residual)
            models_trained.append(tree)

            boosting = boosting + self.learning_rate * tree.predict(X)

        self.models = models_trained
        self._fitted = True
        return models_trained

    def oneHotEncoding(self, y):
        """One-hot encode class labels.

        Labels may be any values ``np.unique`` can sort (they need not be
        ``0..K-1``); column ``j`` corresponds to the ``j``-th sorted unique
        label.  The sorted labels are remembered in ``self.classes_`` so that
        ``predict`` can translate column indices back into labels.

        Args:
            y: Class labels, one per sample.

        Returns:
            Float array of shape ``(len(y), n_classes)`` with a single 1 per row.
        """
        classes, column = np.unique(np.ravel(y), return_inverse=True)
        self.classes_ = classes
        onehot = np.zeros((len(column), len(classes)))
        onehot[np.arange(len(column)), column] = 1
        return onehot

    def softmax(self, X):
        """Return the row-wise softmax of a 2-D score array.

        The row maximum is subtracted before exponentiating; this leaves the
        result unchanged mathematically but keeps ``np.exp`` from overflowing
        on large scores.
        """
        scores = np.asarray(X, dtype=float)
        shifted = scores - scores.max(axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / exp_scores.sum(axis=1, keepdims=True)

    def predict(self, X, models=None, return_argmax=False):
        """Predict with a trained ensemble.

        Args:
            X: Features to predict for.
            models: Ensemble as returned by ``fit`` (initial model first,
                trees after it).  When omitted, the ensemble stored by the
                last ``fit`` call is used.
            return_argmax: Classification only -- return the predicted class
                label instead of the class probabilities.

        Returns:
            Regression: the predicted values.  Classification: the softmax
            probabilities, or the predicted labels when ``return_argmax`` is
            true.

        Raises:
            RuntimeError: If ``models`` is omitted and ``fit`` has not been
                called yet.
        """
        if models is None:
            if not getattr(self, '_fitted', False):
                raise RuntimeError("predict() needs a fitted ensemble: call fit() first "
                                   "or pass the list of models explicitly")
            models = self.models

        f0 = models[0].predict(X)  # the initial constant model
        boosting = sum(self.learning_rate * model.predict(X) for model in models[1:])
        yhat = f0 + boosting
        if self.isClassification:
            yhat = self.softmax(yhat)
            if return_argmax:
                yhat = np.argmax(yhat, axis=1)
                # Translate column indices back to the labels seen by fit();
                # for labels 0..K-1 this is the identity.
                classes = getattr(self, 'classes_', None)
                if classes is not None:
                    yhat = classes[yhat]
        return yhat

    

In [4]:
# Baseline regression run: 200 depth-1 trees with a learning rate of 0.1.
model = GradientBoosting(
    n_estimators=200,
    max_depth=1,
    min_sample_split=5,
    learning_rate=0.1,
)
tree = model.fit(X_train, y_train)  # fit() returns the list of trained models
yhat = model.predict(X_test, tree)

# Score the held-out predictions.
mse = mean_squared_error(y_test, yhat)
print("Our MSE: ", mse)

Our MSE:  12.945557601580584


Try adjusting some of the hyperparameters (max_depth and min_samples_split).

In [5]:
# Same regression task with deeper trees (max_depth=3) and a larger
# minimum split size (10) than the baseline run.
model2 = GradientBoosting(
    n_estimators=200,
    max_depth=3,
    min_sample_split=10,
    learning_rate=0.1,
)
tree = model2.fit(X_train, y_train)  # fit() returns the list of trained models
yhat = model2.predict(X_test, tree)

# Score the held-out predictions.
mse = mean_squared_error(y_test, yhat)
print("Our MSE: ", mse)

Our MSE:  7.841209570276317


Test the model on binary classification using the breast cancer dataset.

In [6]:
# Load the breast cancer data (binary target) and hold out 30% for testing.
X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Print the shape of each split: train features/target, then test features/target.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

# Show which labels occur in the training target.
train_labels = np.unique(y_train)
print('Target class:', train_labels)

(398, 30)
(398,)
(171, 30)
(171,)
Target class: [0 1]


In [7]:
# Binary classification run: isClassification=True switches the model to
# one-hot targets and softmax outputs.
model3 = GradientBoosting(
    n_estimators=200,
    max_depth=3,
    min_sample_split=10,
    learning_rate=0.1,
    isClassification=True,
)
tree = model3.fit(X_train, y_train)  # fit() returns the list of trained models

# return_argmax=True yields the predicted class instead of probabilities.
yhat = model3.predict(X_test, tree, return_argmax=True)
report = classification_report(y_test, yhat)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95        63
           1       0.98      0.96      0.97       108

   micro avg       0.96      0.96      0.96       171
   macro avg       0.96      0.97      0.96       171
weighted avg       0.97      0.96      0.97       171



In [8]:
# Load the digits data (multiclass target) and hold out 30% for testing.
X, y = load_digits(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Print the shape of each split: train features/target, then test features/target.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

# Show which labels occur in the training target.
train_labels = np.unique(y_train)
print('Target class:', train_labels)

(1257, 64)
(1257,)
(540, 64)
(540,)
Target class: [0 1 2 3 4 5 6 7 8 9]


In [12]:
# Multiclass classification run on the digits data, using depth-4 trees.
model4 = GradientBoosting(
    n_estimators=200,
    max_depth=4,
    min_sample_split=10,
    learning_rate=0.1,
    isClassification=True,
)
tree = model4.fit(X_train, y_train)  # fit() returns the list of trained models

# return_argmax=True yields the predicted class instead of probabilities.
yhat = model4.predict(X_test, tree, return_argmax=True)
report = classification_report(y_test, yhat)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        53
           1       0.94      0.96      0.95        50
           2       1.00      0.96      0.98        47
           3       0.96      0.93      0.94        54
           4       0.97      0.98      0.98        60
           5       0.90      0.97      0.93        66
           6       0.98      0.96      0.97        53
           7       0.93      0.98      0.96        55
           8       0.89      0.93      0.91        43
           9       0.95      0.88      0.91        59

   micro avg       0.95      0.95      0.95       540
   macro avg       0.95      0.95      0.95       540
weighted avg       0.95      0.95      0.95       540

