# Gradient Boosting
(by Tevfik Aytekin)

In [77]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
import xgboost as xgb

def kaggle_score(y_true,y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred));
#def kaggle_score(y_true,y_pred):
#    return np.sqrt(mean_squared_error(np.log(y_true), np.log(y_pred)));
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [4]:
house_train = pd.read_csv("../datasets/house_prices/train.csv")


### DecisionTreeRegressor
Run DecisionTreeRegressor on House Prices.

In [5]:
X = house_train.loc[:,'MSSubClass':'SaleCondition']
y = house_train.loc[:,'SalePrice']
X = pd.get_dummies(X)
mae, kaggle, mape = [], [], []
for i in range(1,10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_test.mean())

    regr = DecisionTreeRegressor()
    regr.fit(X_train, y_train)
    
    test_predictions = regr.predict(X_test)
    mae.append(mean_absolute_error(y_test, test_predictions))
    mape.append(mean_absolute_percentage_error(y_test, test_predictions))
    kaggle.append(kaggle_score(y_test, test_predictions))

print("Test MAE:", np.mean(mae))
print("Test MAPE:", np.mean(mape))
print("Test Kaggle:", np.mean(kaggle))


Test MAE: 28521.569254185695
Test MAPE: 15.568382942404535
Test Kaggle: 0.21400016507169195


### GradientBoostingRegressor
Run sklearn's GradientBoostingRegressor on House Prices dataset

In [6]:
X = house_train.loc[:,'MSSubClass':'SaleCondition']
y = house_train.loc[:,'SalePrice']
X = pd.get_dummies(X)
mae, kaggle, mape = [], [], []
for i in range(1,10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_test.mean())

    regr = GradientBoostingRegressor()
    regr.fit(X_train, y_train)
    
    test_predictions = regr.predict(X_test)
    mae.append(mean_absolute_error(y_test, test_predictions))
    mape.append(mean_absolute_percentage_error(y_test, test_predictions))
    kaggle.append(kaggle_score(y_test, test_predictions))

print("Test MAE:", np.mean(mae))
print("Test MAPE:", np.mean(mape))
print("Test Kaggle:", np.mean(kaggle))


Test MAE: 16451.61231456698
Test MAPE: 9.25024550760628
Test Kaggle: 0.1293182192154727


### GradientBoostingRegressor from scratch
Let us write GradientBoostingRegressor from scratch

In [78]:
class MyGradientBoostingRegressor:
    
    def __init__(self, n_estimators = 10, shrinkage = 0.1):
        self.models = []
        self.n_estimators = n_estimators
        self.shrinkage = shrinkage
    def calc_grads(self, model, X, y):    
        preds = self.shrinkage * model.predict(X)
        grads = y - preds
        return grads
    def predict(self, X):
        preds = np.zeros(X.shape[0])
        for m in self.models:
            preds += self.shrinkage * m.predict(X)
        return preds
        
    def fit(self, X, y):
        for i in range(self.n_estimators):
            model = DecisionTreeRegressor(max_depth=2);
            if (i == 0):
                model.fit(X, y)
                grads = self.calc_grads(model, X, y)
            else:
                model.fit(X, grads)
                grads = self.calc_grads(model, X, grads)
            self.models.append(model)
            
            

### GradientBoostingRegressor from scratch
Now let us run our version of GradientBoostingRegressor on the same dataset

In [79]:
X = house_train.loc[:,'MSSubClass':'SaleCondition']
y = house_train.loc[:,'SalePrice']
X = pd.get_dummies(X)
mae, kaggle, mape = [], [], []
for i in range(1,10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_test.mean())

    regr = MyGradientBoostingRegressor(n_estimators=100, shrinkage=0.1)
    regr.fit(X_train, y_train)
    
    test_predictions = regr.predict(X_test)
    mae.append(mean_absolute_error(y_test, test_predictions))
    mape.append(mean_absolute_percentage_error(y_test, test_predictions))
    kaggle.append(kaggle_score(y_test, test_predictions))


print("Test MAE:", np.mean(mae))
print("Test MAPE:", np.mean(mape))
print("Test Kaggle:", np.mean(kaggle))

Test MAE: 16634.622608431117
Test MAPE: 9.683086170975127
Test Kaggle: 0.1301372397573366


### Gradient Boosting Algorithm (Gradient Descent in the Function Space)

Gradient boosting makes gradient descent on the prediction values. If we use L2 loss then the loss function becomes:

$$ \sum_{i=1}^n(y_i -\hat{y}_i)^2$$ 

Note that in the above equation we know the actual $y$ values. Suppose that we start at some random fixed value (say 0) for all $\hat{y}_i$'s. Be careful to note that here we are treating $\hat{y}_i$'s as variables. In order to minimize the above loss function which direction should we go for each $\hat{y}_i$. In other words, should we decrease or increase the values of $\hat{y}_i$'s. We know that we can answer this question by looking at the gradient. That is, the update rule should be like this:

$$ \hat{y}_i = \hat{y}_i - \alpha \frac{\partial \sum_{i=1}^n(y_i -\hat{y}_i)^2 }{\partial \hat{y}_i }$$ 
which is equal to
$$ \hat{y}_i = \hat{y}_i + \alpha(y_i -\hat{y}_i)$$ 

where $\alpha$ is the learning rate (or shrinkage). The somewhat counter intuitive thing we will do now is to learn the amount of update (that is, $ \alpha(y_i -\hat{y}_i)$ with a base learner.

This treatment is a bit confusing. For further reading, [here](https://explained.ai/gradient-boosting/) is a nice exposition of gradient boosting.


In [80]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

class LeastSquares:
    @staticmethod
    def gradient(y, p):
        return -(y - p)
    def loss(self, y, p):
        return 0.5 * np.power((y - p), 2)


class My2GradientBoostingRegressor:

    def __init__(self, shrinkage=0.1, loss=LeastSquares(), n_estimators=100):
        self.shrinkage = shrinkage
        self.loss = loss
        self.n_estimators = n_estimators
        self.models = []

    def predict(self, X):
        if (len(self.models)>0):
            return -sum(m.predict(X) for m in self.models)
        else:
            return 0
        
    def fit(self, X, y):
        for m in range(self.n_estimators):
            preds = self.predict(X)
            gradients = self.loss.gradient(y, preds)
            tree = DecisionTreeRegressor(max_depth=2)
            tree.fit(X, self.shrinkage*gradients)
            self.models.append(tree)

In [81]:
from sklearn.metrics import mean_absolute_error
X = house_train.loc[:,'MSSubClass':'SaleCondition']
y = house_train.loc[:,'SalePrice']
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

reg1 = DecisionTreeRegressor()
reg2 = GradientBoostingRegressor()
reg3 = My2GradientBoostingRegressor(shrinkage=0.2)

reg1.fit(X_train, y_train)
reg2.fit(X_train, y_train)
reg3.fit(X_train, y_train)

print(mean_absolute_percentage_error(y_test, reg1.predict(X_test)))
print(mean_absolute_percentage_error(y_test, reg2.predict(X_test)))
print(mean_absolute_percentage_error(y_test, reg3.predict(X_test)))

13.989232929640268
8.470903278093923
8.642362455248106


### Gradient Boosting Classifier

In [82]:
# Bank Marketing Dataset from
# https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

bank = pd.read_csv("../datasets/bank/bank-full.csv", delimiter = ";")
# print first 5 examples
bank.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [17]:
bank.y.value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [18]:
bank_majority = bank[bank.y=="no"]
bank_minority = bank[bank.y=="yes"]
 
# downsample
bank_majority_downsampled = resample(bank_majority, 
                                 replace=False,    
                                 n_samples=5289) 
 
bank_balanced = pd.concat([bank_minority, bank_majority_downsampled])
bank_balanced.y.value_counts()

yes    5289
no     5289
Name: y, dtype: int64

In [89]:

class CrossEntropy:

    def loss(self, y, p):
        return - y * np.log(p) - (1 - y) * np.log(1 - p)
    
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))    

    def gradient(self, y, p):
        #return y - p
        return (y / p) - (1 - y) / (1 - p)

class MyGradientBoostingClassifier:

    def __init__(self, shrinkage=0.1, loss=CrossEntropy(), n_estimators=100):
        self.shrinkage = shrinkage
        self.loss = loss
        self.n_estimators = n_estimators
        self.models = []
        
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))    

    def cutoff(self, x):
        if (x > 0.5):
            return 1
        else:
            return 0
        
    def predict(self, X):
        return [self.cutoff(y) for y in self.predict_raw(X)]
        
    def predict_raw(self, X):
        if (len(self.models)>0):
            return self.sigmoid(sum(m.predict(X) for m in self.models))
        else:
            return 0.5
        
     
    def fit(self, X, y):
        for m in range(self.n_estimators):
            preds = self.predict_raw(X)
            #print(preds)
            gradients = self.loss.gradient(y, preds)
            #print(gradients)
            tree = DecisionTreeRegressor(max_depth=2)
            tree.fit(X, self.shrinkage*gradients)
            self.models.append(tree)

In [90]:
X = bank_balanced.loc[:,'age':'poutcome']
y = bank_balanced.loc[:,'y']
y = y.replace(["yes","no"],[1,0])
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())
 
clf1 = DecisionTreeClassifier()
clf2 = GradientBoostingClassifier()
clf3 = MyGradientBoostingClassifier()

clf1.fit(X_train, y_train);
clf2.fit(X_train, y_train);
clf3.fit(X_train, y_train);

y_pred1 = clf1.predict(X_test)  
y_pred2 = clf2.predict(X_test) 
y_pred3 = clf3.predict(X_test) 

print(classification_report(y_test,y_pred1))
print(classification_report(y_test,y_pred2))
print(classification_report(y_test,y_pred3))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80       542
           1       0.79      0.78      0.79       516

    accuracy                           0.79      1058
   macro avg       0.79      0.79      0.79      1058
weighted avg       0.79      0.79      0.79      1058

              precision    recall  f1-score   support

           0       0.88      0.85      0.86       542
           1       0.85      0.88      0.86       516

    accuracy                           0.86      1058
   macro avg       0.86      0.86      0.86      1058
weighted avg       0.86      0.86      0.86      1058

              precision    recall  f1-score   support

           0       0.84      0.83      0.84       542
           1       0.82      0.84      0.83       516

    accuracy                           0.83      1058
   macro avg       0.83      0.83      0.83      1058
weighted avg       0.83      0.83      0.83      1058



### XGBoost Regression

In [91]:

X = house_train.loc[:,'MSSubClass':'SaleCondition']
y = house_train.loc[:,'SalePrice']
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

reg1 = DecisionTreeRegressor()
reg2 = GradientBoostingRegressor()
reg3 = My2GradientBoostingRegressor()
reg4 = xgb.XGBRegressor()


reg1.fit(X_train, y_train)
reg2.fit(X_train, y_train)
reg3.fit(X_train, y_train)
reg4.fit(X_train, y_train)

print(mean_absolute_percentage_error(y_test, reg1.predict(X_test)))
print(mean_absolute_percentage_error(y_test, reg2.predict(X_test)))
print(mean_absolute_percentage_error(y_test, reg3.predict(X_test)))
print(mean_absolute_percentage_error(y_test, reg4.predict(X_test)))

16.14965159573065
9.265885655738185
9.937858315046913
8.516827668811349


XGBoost can handle missing values.

In [86]:
X = house_train.loc[:,'MSSubClass':'SaleCondition']
y = house_train.loc[:,'SalePrice']
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

reg4 = xgb.XGBRegressor()
reg4.fit(X_train, y_train)
print(mean_absolute_percentage_error(y_test, reg4.predict(X_test)))

10.231988732557712


### XGBoost Classification

In [87]:
X = bank_balanced.loc[:,'age':'poutcome']
y = bank_balanced.loc[:,'y']
y = y.replace(["yes","no"],[1,0])
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())
 
clf1 = DecisionTreeClassifier()
clf2 = GradientBoostingClassifier()
clf3 = MyGradientBoostingClassifier()
clf4 = xgb.XGBClassifier()

clf1.fit(X_train, y_train);
clf2.fit(X_train, y_train);
clf3.fit(X_train, y_train);
clf4.fit(X_train, y_train);

y_pred1 = clf1.predict(X_test)  
y_pred2 = clf2.predict(X_test) 
y_pred3 = clf3.predict(X_test)
y_pred4 = clf4.predict(X_test) 

print(classification_report(y_test,y_pred1))
print(classification_report(y_test,y_pred2))
print(classification_report(y_test,y_pred3))
print(classification_report(y_test,y_pred4))



              precision    recall  f1-score   support

           0       0.80      0.78      0.79       538
           1       0.78      0.80      0.79       520

    accuracy                           0.79      1058
   macro avg       0.79      0.79      0.79      1058
weighted avg       0.79      0.79      0.79      1058

              precision    recall  f1-score   support

           0       0.87      0.80      0.84       538
           1       0.81      0.88      0.84       520

    accuracy                           0.84      1058
   macro avg       0.84      0.84      0.84      1058
weighted avg       0.84      0.84      0.84      1058

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       538
           1       0.78      0.78      0.78       520

    accuracy                           0.79      1058
   macro avg       0.79      0.79      0.79      1058
weighted avg       0.79      0.79      0.79      1058

              preci

In [88]:
X = bank_balanced.loc[:,'age':'poutcome']
y = bank_balanced.loc[:,'y']
y = y.replace(["yes","no"],[1,0])
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

clf4 = xgb.XGBClassifier()
clf4.fit(X_train, y_train);
y_pred4 = clf4.predict(X_test) 
print(classification_report(y_test,y_pred4))





              precision    recall  f1-score   support

           0       0.89      0.85      0.87       522
           1       0.86      0.90      0.88       536

    accuracy                           0.87      1058
   macro avg       0.87      0.87      0.87      1058
weighted avg       0.87      0.87      0.87      1058

