# Training with XGBoost

XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable.

In [None]:
import os
import logging
import json

import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from data import DATA_DIR
from lama.util.decorators import enable_logging

# Root logger (getLogger() called without a name); the training functions
# below emit their per-fold debug messages through it.
logger = logging.getLogger()

# Directory the pre-processed feature CSVs and the per-model prediction CSVs
# are read from and written to.
OUT_DIR = os.path.join(DATA_DIR, "pre")
# Directory that receives the model dumps and the submission files.
RESULT_DIR = os.path.join(DATA_DIR, "result")

In [None]:
# Load the pre-aggregated train/test tables.
train = pd.read_csv(os.path.join(OUT_DIR, "train_groupby.csv"))
test = pd.read_csv(os.path.join(OUT_DIR, "test_groupby.csv"))

# Every column except the label and the row identifier is a model feature.
features = list(train.columns)
for non_feature in ("target", "card_id"):
    features.remove(non_feature)

In [None]:
# Parameter guide:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python
param_init = {
    # Squared-error regression. 'reg:linear' is the deprecated name of the
    # same objective.
    'objective': 'reg:squarederror',
    # Step-size shrinkage, analogous to the learning rate in GBM.
    'eta': 0.3,
    # Minimum sum of instance weights (hessian) needed in a child; similar
    # to, but not the same as, a minimum-samples-per-leaf setting in GBM.
    'min_child_weight': 0.8,
    # L2 regularization
    'lambda': 0.2,
    # L1 regularization
    'alpha': 0.5,
    'eval_metric': 'rmse',
    'seed': 2022,
    # The three keys below are not booster parameters: perform_xgb_kfold
    # reads them from this dict and hands them to xgb.train() as keyword
    # arguments.
    'early_stopping_rounds': 30,
    'num_boost_round': 1000,
    'verbose_eval': 50,
}


def dump(model, filename):
    """Write a JSON-format dump of ``model`` to ``RESULT_DIR/filename``.

    Args:
        model: Object exposing ``dump_model(fout, dump_format=...)``; here
            the ``xgb.Booster`` trained for one fold.
        filename: File name (not a path) created inside ``RESULT_DIR``.
    """
    # The result directory may not exist on a fresh checkout; create it
    # instead of failing after a fold has already been trained.
    os.makedirs(RESULT_DIR, exist_ok=True)
    # Explicit encoding so the dump does not depend on the platform default.
    with open(os.path.join(RESULT_DIR, filename), 'w', encoding='utf-8') as file:
        model.dump_model(file, dump_format="json")


@enable_logging("perform_xgb_kfold.log")
def perform_xgb_kfold(train, test, features, target, params, n_split=5, random_state=22):
    """Train one XGBoost booster per shuffled K-fold split.

    Args:
        train: DataFrame containing the ``features`` columns and the
            ``target`` column.
        test: DataFrame containing the ``features`` columns to predict on.
        features: List of feature column names.
        target: Name of the label column in ``train``.
        params: XGBoost parameter dict. The keys ``num_boost_round``,
            ``early_stopping_rounds`` and ``verbose_eval`` are taken out and
            passed to ``xgb.train`` as keyword arguments; when absent, the
            ``xgb.train`` defaults (10, no early stopping, verbose) apply.
            All remaining keys are passed on as booster parameters.
        n_split: Number of folds.
        random_state: Seed for the shuffled ``KFold`` splitter.

    Returns:
        Tuple ``(prediction_test, predictions)``:

        * ``prediction_test`` -- the SUM over all folds of the test-set
          predictions; the caller divides by ``n_split`` to average.
        * ``predictions`` -- out-of-fold predictions for ``train``, aligned
          with the row order of ``train``.
    """
    # These keys steer xgb.train() itself and are not booster parameters,
    # so keep them out of the dict handed to the booster.
    train_control_keys = ('num_boost_round', 'early_stopping_rounds', 'verbose_eval')
    booster_params = {key: value for key, value in params.items()
                      if key not in train_control_keys}
    num_boost_round = params.get('num_boost_round', 10)
    early_stopping_rounds = params.get('early_stopping_rounds')
    verbose_eval = params.get('verbose_eval', True)

    kf = KFold(n_splits=n_split, random_state=random_state, shuffle=True)
    train_x = train[features]
    train_y = train[target]
    # The test matrix is identical for every fold, so build it only once.
    test_xgb = xgb.DMatrix(test[features].values)

    prediction_test = 0
    eval_results = {}
    predictions = np.zeros(train.shape[0])
    for i, (train_index, validation_index) in enumerate(kf.split(train_x)):
        # KFold yields positional indices, hence .iloc: .loc would select by
        # label and silently pick wrong rows on a non-default index.
        validation_y = train_y.iloc[validation_index].values
        train_xgb = xgb.DMatrix(train_x.iloc[train_index].values,
                                train_y.iloc[train_index].values)
        validation_xgb = xgb.DMatrix(train_x.iloc[validation_index].values,
                                     validation_y)

        bst: xgb.Booster = xgb.train(booster_params, train_xgb,
                                     num_boost_round=num_boost_round,
                                     early_stopping_rounds=early_stopping_rounds,
                                     evals_result=eval_results,
                                     evals=[(train_xgb, 'train'), (validation_xgb, 'eval')],
                                     verbose_eval=verbose_eval)
        logger.debug(f'evaluate results in round {i}: {eval_results}')

        prediction_test += bst.predict(test_xgb)
        validation_pre = bst.predict(validation_xgb)
        # RMSE of this fold on its held-out rows.
        score = np.sqrt(mean_squared_error(validation_y, validation_pre))
        logger.debug(f'CV Score in {i} round: {score}')

        predictions[validation_index] = validation_pre
        dump(bst, f'XGBoost_{i}.json')
    return prediction_test, predictions


In [None]:
# Number of CV folds. perform_xgb_kfold returns the test predictions summed
# over the folds, so this value is needed again to average them.
n_split = 5
prediction_test, predictions = perform_xgb_kfold(
    train, test, features, 'target', param_init, n_split=n_split,
)


In [None]:
# perform_xgb_kfold returns the SUM of the per-fold test predictions.
# Average once and use the averaged values for both the submission and the
# stacking feature, so that both are on the same scale as the targets.
prediction_test_mean = prediction_test / n_split

test['target'] = prediction_test_mean
test[['card_id', 'target']].to_csv(os.path.join(RESULT_DIR, "submission_xgboost_kfold.csv"), index=False)


# Store the out-of-fold train predictions as stacking feature 'target_4'.
prediction_csv = pd.read_csv(os.path.join(OUT_DIR, "prediction_train.csv"))
prediction_csv['target_4'] = predictions
# write also actual target
prediction_csv['target'] = train['target']
prediction_csv.to_csv(os.path.join(OUT_DIR, "prediction_train.csv"), index=False)

# Store the averaged test predictions as stacking feature 'target_4'.
# The un-averaged fold sum must not be written here: it would be n_split
# times larger than the train feature of the same name.
prediction_csv = pd.read_csv(os.path.join(OUT_DIR, "prediction_test.csv"))
prediction_csv['target_4'] = prediction_test_mean
prediction_csv.to_csv(os.path.join(OUT_DIR, "prediction_test.csv"), index=False)

In [None]:
import matplotlib.pyplot as plt

def plot_train_validation(train, validation, target):
    """Draw the ``target`` column of ``train`` and ``validation`` in one figure.

    Args:
        train: Table indexable by ``target`` (here the training DataFrame).
        validation: Sequence of values plotted as the second curve.
        target: Column name; also used as the y-axis label.
    """
    plt.figure(figsize=(14, 4))
    curves = (
        (train[target], 'train'),
        (validation, 'validation'),
    )
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)
    plt.xlabel("numbers")
    plt.ylabel(target)
    plt.legend()
    plt.show()
# Plot the actual training targets against the out-of-fold XGBoost predictions.
plot_train_validation(train, predictions, 'target')

# Stacking with previous model

We will implement a simple stacking model with only one layer.
We know that many AutoML frameworks, such as `autogluon`, already provide stacking out of the box,
 but using one of them would be less fun.

In [85]:
# Step one: read the per-model predictions collected for the train and test sets.
prediction_train = pd.read_csv(os.path.join(OUT_DIR, "prediction_train.csv"))
prediction_test = pd.read_csv(os.path.join(OUT_DIR, "prediction_test.csv"))

def get_prediction_as_stack(prediction, columns=('target_1', 'target_2', 'target_3', 'target_4')):
    """Return the base-model prediction columns as a 2-D array.

    Args:
        prediction: DataFrame holding one column per base model.
        columns: Names of the columns to stack, in output column order.
            Defaults to the four base-model columns used in this notebook.

    Returns:
        Array with one row per row of ``prediction`` and one column per
        entry of ``columns``.

    Raises:
        KeyError: If any of ``columns`` is missing from ``prediction``.
    """
    # list(...) so that a tuple is treated as a list of column labels and
    # not as a single (multi-level) key.
    return prediction[list(columns)].values

# Stack the base-model predictions into the meta-model's input matrices.
train_hstack, test_hstack = (
    get_prediction_as_stack(frame) for frame in (prediction_train, prediction_test)
)



In [86]:
# step two write our own kfold-stacking
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RepeatedKFold

n_splits, n_repeats, random_state = 5, 2, 2022


@enable_logging("perform_stacking_kfold_with_gradient.log")
def perform_stacking_kfold_with_gradient(train_hstack, test_hstack, y, n_splits=n_splits, n_repeats=n_repeats,
                                         random_state=random_state):
    """Fit an MLP meta-model on repeated K-fold splits and average its test predictions.

    Args:
        train_hstack: 2-D array of base-model predictions for the training rows.
        test_hstack: 2-D array of base-model predictions for the test rows.
        y: Training targets (array-like or pandas Series), one per row of
            ``train_hstack``.
        n_splits: Number of folds per repetition.
        n_repeats: Number of repetitions of the K-fold split.
        random_state: Seed for the fold splitter and for each MLP, so that
            repeated runs give the same result.

    Returns:
        Array with one prediction per row of ``test_hstack``: the mean over
        all ``n_splits * n_repeats`` fitted models.
    """
    # Fold indices are positional. Converting to an array avoids label-based
    # lookup when y is a pandas Series with a non-default index.
    y = np.asarray(y)
    rf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    prediction_test = np.zeros(test_hstack.shape[0])
    # n_repeats * n_splits folds in total
    for i, (train_index, validation_index) in enumerate(rf.split(train_hstack)):
        logger.debug(f"Fold in {i}")
        train_x, train_y = train_hstack[train_index], y[train_index]
        val_x, val_y = train_hstack[validation_index], y[validation_index]
        # NOTE: per the scikit-learn documentation, learning_rate is only
        # used with solver='sgd'; the solver is left at its default here.
        clf = MLPRegressor(activation='relu',
                           learning_rate='adaptive',
                           early_stopping=False,
                           random_state=random_state)
        clf.fit(train_x, train_y)
        # For scikit-learn regressors, score() is R^2 on the held-out fold.
        score = clf.score(val_x, val_y)
        logger.debug(f"Score in Fold {i} is {score}")
        prediction_test += clf.predict(test_hstack)
    prediction_test = prediction_test / (n_splits * n_repeats)
    return prediction_test

In [87]:
# Step three: fit the stacking meta-model and write its submission file.
prediction_test = perform_stacking_kfold_with_gradient(
    train_hstack,
    test_hstack,
    prediction_train['target'],
)

test['target'] = prediction_test
submission_path = os.path.join(RESULT_DIR, "submission_stacking_kfold.csv")
test[['card_id', 'target']].to_csv(submission_path, index=False)