In [212]:
# https://explained.ai/gradient-boosting/L2-loss.html

In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random

In [33]:
import seaborn as sns
tips = sns.load_dataset('tips')

In [34]:
tips.shape

(244, 7)

In [54]:
def convert_col_one_hot_series(series, drop_first=False, method='get_dummies'):
    one_hot = pd.get_dummies(series, prefix=(str(series.name)), drop_first=drop_first)
    return one_hot

def replace_col_with_one_hot(df, col, drop_column=False, drop_first=False, method='get_dummies'):

    # Get one-hot encoding
    one_hot = convert_col_one_hot_series(df[col])

    if drop_column:
        df = df.drop(col,axis = 1)
    
    # Join the encoded df
    df = df.join(one_hot)

    return df

def MSE(y, yHat):
    return ((y-yHat)**2).sum()

In [36]:
print tips['sex'].value_counts()
print tips['smoker'].value_counts()
print tips['day'].value_counts()
print tips['time'].value_counts()

Male      157
Female     87
Name: sex, dtype: int64
No     151
Yes     93
Name: smoker, dtype: int64
Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64
Dinner    176
Lunch      68
Name: time, dtype: int64


In [37]:
tips = replace_col_with_one_hot(tips, 'sex', True)
tips = replace_col_with_one_hot(tips, 'smoker', True)
tips = replace_col_with_one_hot(tips, 'day', True)
tips = replace_col_with_one_hot(tips, 'time', True)

In [38]:
# Test Train Split
Y = tips['tip']
X = tips.loc[:, ~tips.columns.isin(['tip'])]
X_train, X_valid, Y_train, Y_valid, idx1, idx2 = train_test_split(
            X, 
            Y, 
            np.arange(X.shape[0]), 
            test_size = 0.2, 
            random_state = 111
        )

### Basic Regression Tree

In [56]:
# Basic Regression Tree
from sklearn import tree
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X_train, Y_train)

preds = clf.predict(X_valid)
print 'MSE:'
print MSE(Y_valid, preds)

MSE:
64.05310000000001


In [70]:
print X_train.head(3)
print Y_train.head(3)
# X_train.append(Y_train)
ll = pd.concat([X_train, Y_train], axis=1)

    total_bill  size  sex_Male  sex_Female  smoker_Yes  smoker_No  day_Thur  \
6         8.77     2         1           0           0          1         0   
44       30.40     4         1           0           0          1         0   
3        23.68     2         1           0           0          1         0   

    day_Fri  day_Sat  day_Sun  time_Lunch  time_Dinner  
6         0        0        1           0            1  
44        0        0        1           0            1  
3         0        0        1           0            1  
6     2.00
44    5.60
3     3.31
Name: tip, dtype: float64


In [79]:
# Y_train.name
ll.sample(n=ll.shape[0], replace=True)

yy = ll['tip']
xx = ll.loc[:, ~ll.columns.isin(['tip'])]



### Basic Random Forest

In [157]:
class RF_Regressor(object):
    def __init__(self, numLearners=10):
        self.learners = []
        self.numLearners = numLearners
    
    def fit(self, X_train, Y_train, RANDOMIZE=True):
        
        Xtemp = pd.concat([X_train, Y_train], axis=1)
        
        # Train learners till numLearners and store them in self.learners
        for i in range(self.numLearners):
            tempModel = {}        
            
            # Decorrelate trees by sampling with replacement rows (bootstraping) + sampling columns
            # Sample rows with replacement
            if RANDOMIZE:
                Xtemp = Xtemp.sample(n=Xtemp.shape[0], replace=True)
            else:
                Xtemp = Xtemp
            tempX = Xtemp.loc[:, ~Xtemp.columns.isin([Y_train.name])]
            tempY = Xtemp[Y_train.name]
            # Sample columns
            if RANDOMIZE:
                randomCols = random.sample(tempX.columns, random.randint(len(tempX.columns)/2, len(tempX.columns)))
            else:
                randomCols = tempX.columns
                
            xTempRandomized = tempX[randomCols]
            
            tempModel['cols'] = randomCols
            tempModel['model'] = tree.DecisionTreeRegressor()

            # Train model
            tempModel['model'] = tempModel['model'].fit(xTempRandomized, tempY)
            self.learners.append(tempModel)

    def predict(self, X_valid, Y_valid):
        
        mse = 0
        # Predict from each trained model
        for model in self.learners:
            preds = model['model'].predict(X_valid[model['cols']])
            tempMSE = MSE(Y_valid, preds)
#             print tempMSE
            mse += tempMSE
        return mse/float(len(self.learners))

In [158]:
rf = RF_Regressor()
rf.fit(X_train, Y_train, True)

print 'MSE:'
print rf.predict(X_valid, Y_valid)

# RF usually better for generalization purposes when the data is such that normal trees suffer from overfitting
# Here, its a simple situation and decision trees are relatively stable, hence agg through RF not providing much value prop

MSE:
109.77722528726682


### Basic Gradient Boosting

In [253]:
class GBT_Regressor(object):
    def __init__(self, numLearners=10):
        self.learners = []
        self.numLearners = numLearners
        self.alpha = .66
    
    def fit(self, X_train, Y_train, RANDOMIZE=False):
        
        self.learners.append({'yPreds':pd.Series([0]*Y_train.shape[0])})
        newY = Y_train.reset_index(drop=True)
        
        # Train learners till numLearners and store them in self.learners
        for i in range(1, self.numLearners):
#             newY = Y_train.reset_index(drop=True) - self.learners[i-1]['yPreds']
            newY = newY - self.alpha*self.learners[i-1]['yPreds']
            tempModel = {}
            
#             tempModel['cols'] = randomCols
            tempModel['model'] = tree.DecisionTreeRegressor()

            # Train model
            tempModel['model'] = tempModel['model'].fit(X_train, newY)
            tempModel['yPreds'] = tempModel['model'].predict(X_train)
            
            self.learners.append(tempModel)

    def predict(self, X_valid, Y_valid):
        
        yPred = pd.Series([0.0]*Y_valid.shape[0]).values
        # Predict from each trained model
        for i in range(1, len(self.learners)):
            model = self.learners[i]
            preds = model['model'].predict(X_valid)
            yPred += self.alpha*preds
        return MSE(Y_valid, yPred)
#         return mse/float(len(self.learners))

In [258]:
gbt = GBT_Regressor(50)
gbt.fit(X_train, Y_train, True)

# print 'MSE:'
print gbt.predict(X_valid, Y_valid)



66.903430240241
