# Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [4]:
PATH = "data/demand-forecasting/"

# Helper Functions

In [5]:
# custom evaluation metric
def SMAPE(y_pred, dtrain):
    y_true = dtrain.get_label()
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return 'SMAPE', np.nanmean(diff)

In [6]:
def smape2(y_pred, y_true):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.nanmean(diff)

In [7]:
def print_scores(pred,label):
    rmse = np.sqrt(mean_squared_error(pred,label))
    mae = mean_absolute_error(pred,label)
    smape_score = smape2(pred,label)
    
    print('RMSE\t\t ' + str(rmse))
#     print('MAE\t' + str(mae))
    print('SMAPE\t\t' + str(smape_score))

In [8]:
def plot_learning_curve(results,trn,val,metric):
    train_errors = results[trn][metric]
    validation_errors = results[val][metric]
    df = pd.DataFrame([train_errors, validation_errors]).T
    df.columns = ['Training', 'Validation']
    df.index.name = 'Boosting Round'
    ax = df.plot(title="XGBoost learning curves",figsize=(12,5))
    ax.set_ylabel(metric)

# Pre-Process Data

In [9]:
train = pd.read_csv(f'{PATH}train.csv', parse_dates=['date'])
test = pd.read_csv(f'{PATH}test.csv', parse_dates=['date'], index_col='id')

In [10]:
train['y'] = train['date'].dt.year
train['m'] = train['date'].dt.month
train['d'] = train['date'].dt.day
train['dw'] = train['date'].dt.dayofweek
train['dy'] = train['date'].dt.dayofyear
train.drop('date', axis=1, inplace=True)

sales = train.pop('sales')

In [11]:
test['y'] = test['date'].dt.year
test['m'] = test['date'].dt.month
test['d'] = test['date'].dt.day
test['dw'] = test['date'].dt.dayofweek
test['dy'] = test['date'].dt.dayofyear
test.drop('date', axis=1, inplace=True)

# Split Training-Validation Data

In [12]:
train.shape, sales.shape, test.shape

((913000, 7), (913000,), (45000, 7))

In [13]:
# closest same period (diff year) as test set
X_val = train.loc[(train.y==2017) & ((train.m==10) | (train.m==11) | (train.m==12))].copy() 
y_val = sales[X_val.index].copy()

X_train = train.drop(X_val.index).copy()
y_train = sales.drop(X_val.index).copy()

In [14]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((867000, 7), (867000,), (46000, 7), (46000,))

In [15]:
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)
print()




# XGBoost

In [16]:
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_val = xgb.DMatrix(data=X_val, label=y_val)
DM_test = xgb.DMatrix(data=test)

In [17]:
evals_result = {}
watchlist = [(DM_train, "training"), (DM_val, "validation")]

In [18]:
params_native = {
    'objective': 'reg:linear', 
    'booster':'gbtree',        
    'silent': 1,               
    'eta': 0.03,                 
    'gamma': 0,                 
    'max-depth': 3,             
    'min_child_weight': 1,      
    'max_delta_step': 0,        
    'subsample': 0.9,             
    'colsample_bytree': 0.7,      
    'colsample_bylevel': 0.9,     
    'lambda': 0.9,                
    'alpha': 0,                 
    'scale_pos_weight': 1,      
    'base_score': 0.5,          
    'eval_metric':'rmse',      
    'seed': 42                 
}

## Train with partial data

In [19]:
%%time
xgb_native = xgb.train(params=params_native, 
                            dtrain=DM_train,
                            num_boost_round=100_000,
                            evals=watchlist,
                            early_stopping_rounds=20,
                            evals_result=evals_result,
                            verbose_eval=100)

# feval=SMAPE,
# maximize=False,

[0]	training-rmse:57.6926	validation-rmse:59.8999
Multiple eval metrics have been passed: 'validation-rmse' will be used for early stopping.

Will train until validation-rmse hasn't improved in 20 rounds.
[100]	training-rmse:17.7973	validation-rmse:18.623
[200]	training-rmse:12.9664	validation-rmse:13.4753
[300]	training-rmse:10.9464	validation-rmse:11.3497
[400]	training-rmse:9.80133	validation-rmse:10.1634
[500]	training-rmse:9.06775	validation-rmse:9.39154
[600]	training-rmse:8.56907	validation-rmse:8.89444
[700]	training-rmse:8.29426	validation-rmse:8.61039
[800]	training-rmse:8.08337	validation-rmse:8.39634
[900]	training-rmse:7.94674	validation-rmse:8.26906
[1000]	training-rmse:7.81299	validation-rmse:8.14415
[1100]	training-rmse:7.73667	validation-rmse:8.07449
[1200]	training-rmse:7.67638	validation-rmse:8.02228
[1300]	training-rmse:7.61857	validation-rmse:7.96974
[1400]	training-rmse:7.56813	validation-rmse:7.92546
[1500]	training-rmse:7.53372	validation-rmse:7.89954
[1600]	tra

In [21]:
print("Best iteration\t{}".format(xgb_native.best_iteration))
print("Best tree limit\t{}".format(xgb_native.best_ntree_limit))
print("Best RMSE score\t{}".format(xgb_native.best_score))

print("\nValidation score")
pred_val = xgb_native.predict(DM_val)
print_scores(pred_val, y_val)

print("\nTraining score")
pred_train = xgb_native.predict(DM_train)
print_scores(pred_train, y_train)

Best iteration	2276
Best tree limit	2277
Best RMSE score	7.742908

Validation score
RMSE		 7.742944643378778
SMAP		12.60422749642427

Training score
RMSE		 7.349925806207827
SMAP		12.780502659751425


## Loop with range of values

In [None]:
# default baseline reference, do not edit
params_native_default = {
    'objective': 'reg:linear',  # 'reg:linear'
    'booster':'gbtree',         # 'gbtree'
    'silent': 1,                # 0   
    'eta': 0.1,                 # 0.3 alias learning_rate
    'gamma': 0,                 # 0
    'max-depth': 3,             # 6
    'min_child_weight': 1,      # 1
    'max_delta_step': 0,        # 0
    'subsample': 1,             # 1
    'colsample_bytree': 1,      # 1
    'colsample_bylevel': 1,     # 1
    'lambda': 1,                # 1  
    'alpha': 0,                 # 0
    'scale_pos_weight': 1,      # 1
    'base_score': 0.5,          # 0.5
    'eval_metric':'rmse',       # 'rmse' for regression
    'seed': 42                  # 0 
}

In [None]:
tuning_dict = { 
    'eta':[0.3,0.05],
    'max-depth':[4,5,6,7],
    'subsample':[0.9,0.8,0.7],
    'colsample_bytree':[0.9,0.8,0.7],
    'lambda': [0.9,0.8,0.7],              
    'alpha': [0.1,0.2],
    'gamma': [0.1,0.2],
    'colsample_bylevel': [0.9,0.8,0.7,0.6]
}

In [None]:
i=1

for key,val_list in tuning_dict.items():
    for val in val_list:
        
        print(f"[[ Iteration {i} - Tuning '{key}' to {val} ]]")
        i+=1
        
        params_native_copy = params_native_default.copy()
        params_native_copy[key] = val

        xgb_native_loop = xgb.train(params=params_native_copy, 
                                    dtrain=DM_train,
                                    num_boost_round=100_000,
                                    evals=watchlist,
                                    early_stopping_rounds=20,
                                    evals_result=evals_result,
                                    verbose_eval=100)

        print("\n")
        print("Best iteration\t{}".format(xgb_native_loop.best_iteration))
        print("Best tree limit\t{}".format(xgb_native_loop.best_ntree_limit))
        print("Best RMSE score\t{}".format(xgb_native_loop.best_score))

        print("\nValidation score")
        pred_val = xgb_native_loop.predict(DM_val)
        print_scores(pred_val, y_val)

        print("\nTraining score")
        pred_train = xgb_native_loop.predict(DM_train)
        print_scores(pred_train, y_train)
        
        print("\n\n")

## Predict

In [None]:
y_pred = xgb_native.predict(DM_test, ntree_limit = xgb_native.best_ntree_limit)

# Train with full data based on tuned parameters

In [None]:
%%time
full_data = xgb.DMatrix(data=train, label=sales)
xgb_native_full_data = xgb.train(params=params_native, 
                                dtrain=full_data,
                                num_boost_round=1501,
                                evals=[(full_data, "training")],
                                verbose_eval=100)

## Predict

In [None]:
y_pred = xgb_native_full_data.predict(DM_test)

# Submit

In [None]:
submission = pd.read_csv(f'{PATH}test.csv', index_col='id')

In [None]:
submission['sales'] = y_pred

In [None]:
csv_fn = f'{PATH}tmp/XGB_v3_partial.csv'

In [None]:
submission[['sales']].to_csv(csv_fn)

__KAGGLE SCORE: __
- 14.27247 (with val)
- 14.25877 (full with 1500 boost)

#### Dealing with high variance
If model is too complex try:
- using less features (ie. feature selection),
- using more training samples (ie. artificially generated),
- increasing regularization (add penalties for extra complexity)

In XGBoost you can try to:
- reduce depth of each tree (`max_depth`),
- increase `min_child_weight` parameter,
- increase `gamma` parameter,
- add more randomness using `subsample`, `colsample_bytree` parameters,
- increase `lambda` and `alpha` regularization parameters

#### Dealing with high bias
If model is too simple:
- add more features (ie. better feature engineering),
- more sophisticated model
- decrease regularization

In XGBoost you can do it by:
- increase depth of each tree (`max_depth`),
- decrease `min_child_weight` parameter,
- decrease `gamma` parameter,
- decrease `lambda` and `alpha` regularization parameters

Let's try to tweak a parameters a little bit. We are going to add some randomness - each tree we will use 70% randomly chosen samples and 60% randomly chosen features. This should help to reduce a variance. To decrease the bias (bigger accuracy) try adding an extra level to each tree.

#### Dealing with high variance
If model is too complex try:
- using less features (ie. feature selection),
- using more training samples (ie. artificially generated),
- increasing regularization (add penalties for extra complexity)

In XGBoost you can try to:
- reduce depth of each tree (`max_depth`),
- increase `min_child_weight` parameter,
- increase `gamma` parameter,
- add more randomness using `subsample`, `colsample_bytree` parameters,
- increase `lambda` and `alpha` regularization parameters

#### Dealing with high bias
If model is too simple:
- add more features (ie. better feature engineering),
- more sophisticated model
- decrease regularization

In XGBoost you can do it by:
- increase depth of each tree (`max_depth`),
- decrease `min_child_weight` parameter,
- decrease `gamma` parameter,
- decrease `lambda` and `alpha` regularization parameters

Let's try to tweak a parameters a little bit. We are going to add some randomness - each tree we will use 70% randomly chosen samples and 60% randomly chosen features. This should help to reduce a variance. To decrease the bias (bigger accuracy) try adding an extra level to each tree.