### ===Task===

Modify the above scratch code such that:
- Notice that we are still using max_depth = 1. Try tweaking min_samples_split and max_depth for the regression and see whether we can achieve a better MSE on the Boston data.
- Notice that the scratch code only implements gradient boosting for regression. Add code so that it also works for binary classification. Load the breast cancer data from sklearn and check that it works.
- Further change the code so that it works for multiclass classification. Load the digits data from sklearn and check that it works.
- Put everything into a class.

In [1]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
import numpy as np


In [22]:
class GBoosting:
    """Gradient boosting built from scikit-learn regression trees.

    The ensemble is a constant baseline (``DummyRegressor`` with the median
    strategy) followed by ``num_stumps`` ``DecisionTreeRegressor`` models.
    Each tree is fitted on ``y - yhat`` of the ensemble built so far, and its
    prediction is added to the baseline scaled by ``learning_rate``.

    Parameters
    ----------
    num_stumps : int
        Number of boosted trees (the baseline model is not counted).
    learning_rate : float
        Factor applied to every tree's prediction before it is added.
    max_depth : int
        ``max_depth`` passed to every tree.
    min_samples_split : int
        ``min_samples_split`` passed to every tree.
    classification : bool
        When truthy, the summed raw scores are passed through a row-wise
        softmax, and ``predict`` returns the arg-max column index by default.
        Targets given to ``fit`` must then be one-hot encoded
        (shape ``(n_samples, n_classes)``).
    """

    def __init__(self, num_stumps=5, learning_rate=1, max_depth = 1, 
                 min_samples_split = 2,
                 classification=False):
        self.num_stumps = num_stumps
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.classification = classification

        # model_list[0] is the constant baseline; model_list[1:] are the
        # boosted regression trees, fitted in that order by fit().
        first_model = DummyRegressor(strategy='median')
        boosted_trees = [
            DecisionTreeRegressor(max_depth=self.max_depth,
                                  min_samples_split=self.min_samples_split)
            for _ in range(num_stumps)
        ]
        self.model_list = [first_model] + boosted_trees

    def grad(self, y, yhat):
        """Return the residual ``y - yhat`` that the next tree is fitted on."""
        return y - yhat

    def softmax(self, z):
        """Return the row-wise softmax of the 2-D score array ``z``.

        The row maximum is subtracted before exponentiating. This leaves the
        result mathematically unchanged but keeps ``np.exp`` from overflowing
        to ``inf`` (and the ratio from becoming ``nan``) for large scores.
        """
        z = np.asarray(z, dtype=float)
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def _to_output(self, raw_score, with_argmax):
        """Map summed raw scores to the value returned to the caller.

        Regression returns the raw scores unchanged. Classification returns
        softmax probabilities, or the arg-max column index per row when
        ``with_argmax`` is true.
        """
        if not self.classification:
            return raw_score
        proba = self.softmax(raw_score)
        if with_argmax:
            return np.argmax(proba, axis=1)
        return proba

    def fit(self, X_train, y_train):
        """Fit the baseline model, then every tree on the current residuals.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like
            Regression targets, or one-hot encoded class targets of shape
            ``(n_samples, n_classes)`` when ``classification`` is enabled.

        Raises
        ------
        ValueError
            If ``classification`` is enabled and ``y_train`` is not 2-D.
        """
        if self.classification and np.ndim(y_train) != 2:
            raise ValueError(
                "classification=True expects one-hot encoded targets of "
                "shape (n_samples, n_classes)"
            )

        #fit the first model
        self.model_list[0].fit(X_train, y_train)
        h0 = self.model_list[0].predict(X_train)

        # Running sum of the scaled tree predictions. Each fitted tree is
        # predicted once instead of once per remaining round; the summation
        # order is the same as in predict().
        boosting = 0
        for tree in self.model_list[1:]:
            yhat = self._to_output(h0 + boosting, with_argmax=False)

            #fit the next model with gradient
            tree.fit(X_train, self.grad(y_train, yhat))
            boosting = boosting + self.learning_rate * tree.predict(X_train)

    def predict(self, X, model_list=None, with_argmax=True):
        """Predict with the ensemble (or with a prefix of it).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        model_list : list, optional
            Models to use, baseline first. Defaults to ``self.model_list``.
        with_argmax : bool
            Classification only: return class indices when true, softmax
            probabilities when false. Ignored for regression.
        """
        if model_list is None:
            model_list = self.model_list

        h0 = model_list[0].predict(X)  #first use the dummy model
        boosting = sum(self.learning_rate * model.predict(X)
                       for model in model_list[1:])
        return self._to_output(h0 + boosting, with_argmax)

In [23]:
# Regression: compare two tree configurations on the Boston housing data
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# (message printed with the score, tree settings) for each experiment
experiments = [
    ("MSE for model 1: ", {"max_depth": 1, "min_samples_split": 2}),
    ("MSE for model 2: ", {"max_depth": 3, "min_samples_split": 3}),
]

fitted_models = []
for message, tree_settings in experiments:
    booster = GBoosting(num_stumps=200, learning_rate=0.1, **tree_settings)
    booster.fit(X_train, y_train)
    yhat = booster.predict(X_test)

    #print metrics
    print(message, mean_squared_error(y_test, yhat))
    fitted_models.append(booster)

# keep the fitted boosters available under their original names
model1, model2 = fitted_models



MSE for model 1:  12.94555760162601
MSE for model 2:  8.287219801101964


We can see that the MSE decreases when max_depth is increased from 1 to 3 and min_samples_split from 2 to 3.

In [24]:
# Binary classification on the breast cancer data

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=52
)

# One-hot encode the training labels: column c is 1.0 where y_train == c.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

model = GBoosting(
    num_stumps=200,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2,
    classification=True,
)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)

# report accuracy on the held-out split
print("Binary classification accuracy: ", accuracy_score(y_test, yhat))

Binary classification accuracy:  0.9736842105263158


In [25]:
# Multiclass classification on the digits data

from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# One-hot encode the training labels: column c is 1.0 where y_train == c.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

model = GBoosting(
    num_stumps=200,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2,
    classification=True,
)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)

# report accuracy on the held-out split
print("Multiclass classifcation accuracy: ", accuracy_score(y_test, yhat))

Multiclass classifcation accuracy:  0.9333333333333333
