# Boilerplate

Following the schema at https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html

In [1]:
import numpy as np
import xgboost as xgb
from typing import Tuple
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def msle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    '''Mean squared log error metric.

    Parameters
    ----------
    predt
        Predicted values. The array is left unmodified.
    dtrain
        Object whose ``get_label()`` returns the target values.

    Returns
    -------
    The metric name ``'MSLE'`` and the mean of
    ``(log1p(label) - log1p(prediction)) ** 2``.
    '''
    y = dtrain.get_label()
    # log1p is only finite for arguments strictly greater than -1, so every
    # prediction at or below -1 (including exactly -1) is replaced by a value
    # just above it. np.where builds a new array instead of overwriting the
    # caller's predictions.
    clipped = np.where(predt <= -1, -1 + 1e-6, predt)
    elements = np.power(np.log1p(y) - np.log1p(clipped), 2)
    return 'MSLE', float(np.mean(elements))

def evaluate(hessian):
    '''Train a booster with the squared log error objective and a given hessian.

    The gradient is fixed; only the second-order term is supplied by the
    caller, so different hessian formulas can be compared on the same data.

    Parameters
    ----------
    hessian
        Callable ``hessian(predt, dtrain) -> np.ndarray`` returning the
        per-sample second-order term used by the objective.

    Returns
    -------
    The ``evals_result`` dictionary filled in by ``xgb.train``, holding the
    ``MSLE`` metric history for the ``'dtrain'`` and ``'dtest'`` sets.

    Notes
    -----
    Reads the module-level ``dtrain`` and ``dtest`` matrices and the
    module-level ``msle`` metric, so those must be defined before calling.
    '''
    def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
        '''Compute the gradient of the squared log error.'''
        y = dtrain.get_label()
        return (np.log1p(predt) - np.log1p(y)) / (predt + 1)

    def squared_log(predt: np.ndarray,
                    dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
        '''Squared Log Error objective. A simplified version for RMSLE used as
        objective function.
        '''
        # Predictions at or below -1 (including exactly -1) would make
        # log1p non-finite and the gradient divide by zero, so they are
        # moved just above -1. A new array is built rather than overwriting
        # the array handed in by the caller.
        predt = np.where(predt <= -1, -1 + 1e-6, predt)
        grad = gradient(predt, dtrain)
        hess = hessian(predt, dtrain)
        return grad, hess

    results = {}

    xgb.train({'tree_method': 'hist', 'seed': 1994,
               'disable_default_eval_metric': 1},
              dtrain=dtrain,
              num_boost_round=10,
              obj=squared_log,
              feval=msle,
              evals=[(dtrain, 'dtrain'), (dtest, 'dtest')],
              evals_result=results)
    return results

In [3]:
# Load the house-sales table; the 'price' column is the regression target.
X = pd.read_csv('data/housesalesprediction/kc_house_data.csv')
y = X['price'].to_numpy(copy=True)

# Remove the target and the columns not used as features (the last ones are
# dropped just to keep training faster).
X = X.drop(columns=['id', 'date', 'price', 'zipcode', 'lat', 'long',
                    'sqft_living15', 'sqft_lot15'])

# Hold out 20% of the rows for evaluation, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Wrap both splits, with their labels, for xgboost.
dtest = xgb.DMatrix(X_test, label=y_test)
dtrain = xgb.DMatrix(X_train, label=y_train)

# Approximation using Taylor expansion

In [4]:
def hessian_taylor(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Second-order term of the squared log error.

    Evaluates ``(log1p(y) - log1p(predt) + 1) / (predt + 1) ** 2`` element-wise,
    where ``y`` is taken from ``dtrain.get_label()``.
    '''
    labels = dtrain.get_label()
    log_gap = np.log1p(labels) - np.log1p(predt)
    shifted = predt + 1
    return (log_gap + 1) / shifted ** 2

In [5]:
taylor = evaluate(hessian_taylor)

[0]	dtrain-MSLE:153.897	dtest-MSLE:154.235
[1]	dtrain-MSLE:147.884	dtest-MSLE:148.216
[2]	dtrain-MSLE:141.999	dtest-MSLE:142.324
[3]	dtrain-MSLE:136.242	dtest-MSLE:136.561
[4]	dtrain-MSLE:130.612	dtest-MSLE:130.925
[5]	dtrain-MSLE:125.11	dtest-MSLE:125.416
[6]	dtrain-MSLE:119.735	dtest-MSLE:120.034
[7]	dtrain-MSLE:114.487	dtest-MSLE:114.78
[8]	dtrain-MSLE:109.367	dtest-MSLE:109.654
[9]	dtrain-MSLE:104.374	dtest-MSLE:104.655


# Quadratic Approximation 1

In [6]:
def hessian_approx1(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Approximate hessian for squared log error (first variant).

    Evaluates ``(log1p(predt) - log1p(y)) / ((predt + 1) * (predt - y))``
    element-wise, where ``y`` comes from ``dtrain.get_label()``.

    Where a prediction equals its label the expression is 0/0; there the
    analytic limit ``1 / (predt + 1) ** 2`` is returned instead of NaN.
    '''
    y = dtrain.get_label()
    gap = predt - y
    exact_hit = gap == 0
    # Replace zero gaps with a dummy 1 so the division below never sees 0/0;
    # those positions are overwritten with the limit value afterwards.
    safe_gap = np.where(exact_hit, 1.0, gap)
    ratio = (np.log1p(predt) - np.log1p(y)) / ((predt + 1) * safe_gap)
    # As predt -> y, (log1p(predt) - log1p(y)) / (predt - y) -> 1 / (y + 1).
    limit = 1.0 / np.power(predt + 1, 2)
    return np.where(exact_hit, limit, ratio)

In [7]:
approx1 = evaluate(hessian_approx1)

[0]	dtrain-MSLE:7.35528	dtest-MSLE:7.4356
[1]	dtrain-MSLE:7.35521	dtest-MSLE:7.43553
[2]	dtrain-MSLE:7.35514	dtest-MSLE:7.43546
[3]	dtrain-MSLE:7.35507	dtest-MSLE:7.43539
[4]	dtrain-MSLE:7.355	dtest-MSLE:7.43532
[5]	dtrain-MSLE:7.35493	dtest-MSLE:7.43525
[6]	dtrain-MSLE:7.35486	dtest-MSLE:7.43518
[7]	dtrain-MSLE:7.35479	dtest-MSLE:7.43511
[8]	dtrain-MSLE:7.35472	dtest-MSLE:7.43504
[9]	dtrain-MSLE:7.35465	dtest-MSLE:7.43497


# Quadratic Approximation 2

In [8]:
def hessian_approx2(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Approximate hessian for squared log error (second variant).

    Evaluates ``(log1p(predt) - log1p(y)) ** 2 / (predt - y) ** 2``
    element-wise, where ``y`` comes from ``dtrain.get_label()``.

    Where a prediction equals its label the expression is 0/0; there the
    analytic limit ``1 / (predt + 1) ** 2`` is returned instead of NaN.
    '''
    y = dtrain.get_label()
    gap = predt - y
    exact_hit = gap == 0
    # Replace zero gaps with a dummy 1 so the division below never sees 0/0;
    # those positions are overwritten with the limit value afterwards.
    safe_gap = np.where(exact_hit, 1.0, gap)
    ratio = (np.power(np.log1p(predt) - np.log1p(y), 2) /
             np.power(safe_gap, 2))
    # As predt -> y, (log1p(predt) - log1p(y)) / (predt - y) -> 1 / (y + 1),
    # so the squared quotient tends to 1 / (predt + 1) ** 2.
    limit = 1.0 / np.power(predt + 1, 2)
    return np.where(exact_hit, limit, ratio)

In [9]:
approx2 = evaluate(hessian_approx2)

[0]	dtrain-MSLE:5.84439	dtest-MSLE:5.91674
[1]	dtrain-MSLE:5.84437	dtest-MSLE:5.91671
[2]	dtrain-MSLE:5.84434	dtest-MSLE:5.91668
[3]	dtrain-MSLE:5.8443	dtest-MSLE:5.91664
[4]	dtrain-MSLE:5.84427	dtest-MSLE:5.91661
[5]	dtrain-MSLE:5.84424	dtest-MSLE:5.91659
[6]	dtrain-MSLE:5.84421	dtest-MSLE:5.91655
[7]	dtrain-MSLE:5.84418	dtest-MSLE:5.91652
[8]	dtrain-MSLE:5.84415	dtest-MSLE:5.9165
[9]	dtrain-MSLE:5.84412	dtest-MSLE:5.91646
