# Training with LightGBM

LightGBM is a gradient boosting framework that uses tree-based learning algorithms. It has the following advantages over traditional decision-tree models (e.g. random forest):



- Faster training speed and higher efficiency.
- Lower memory usage.
- Better accuracy.
- Support of parallel, distributed, and GPU learning.
- Capable of handling large-scale data.

We will first implement a naive LightGBM model and then apply the techniques listed above, hoping that it performs better than the random forest.

In [33]:
import os
import logging
import json

import lightgbm as lgb
import numpy as np
import pandas as pd


from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from data import DATA_DIR
from lama.util.decorators import enable_logging

# Root logger: getLogger() is called without a name.
logger = logging.getLogger()

# Sub-directories of DATA_DIR: "pre" is where the input CSVs are read from,
# "result" is where model dumps and the submission file are written.
OUT_DIR, RESULT_DIR = (
    os.path.join(DATA_DIR, sub_dir) for sub_dir in ("pre", "result")
)


In [34]:
# Load the train/test CSV files from OUT_DIR.
train_path = os.path.join(OUT_DIR, "train_groupby.csv")
test_path = os.path.join(OUT_DIR, "test_groupby.csv")
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Every column of `train` except the label and the card identifier is used
# as a model feature. `remove` raises ValueError if either column is missing.
label = 'target'
features = train.columns.tolist()
for non_feature in ("target", "card_id"):
    features.remove(non_feature)

In [35]:
# Parameter reference: https://lightgbm.readthedocs.io/en/latest/Parameters.html
# The values below were picked arbitrarily as a starting point.
param_init = dict(
    num_leaves=31,
    learning_rate=0.3,
    min_child_samples=20,
    bagging_seed=2022,
    bagging_fraction=0.7,
    # bagging only takes effect when bagging_freq is non-zero as well
    bagging_freq=1,
    metric='rmse',
    lambda_l1=0.2,
    lambda_l2=1.0,
    objective='regression',
)

ESR = 30    # early-stopping rounds
NBR = 1000  # number of boosting rounds
VBE = 50    # period passed to the evaluation-logging callback


In [41]:
# Number of folds; the submission cell divides the summed test predictions by it.
n_split = 5

def dump(model, filename):
    """Serialize ``model`` as JSON into ``RESULT_DIR/filename``.

    Args:
        model: A JSON-serializable object (e.g. the dict returned by
            ``Booster.dump_model``).
        filename: Name of the file to create inside ``RESULT_DIR``.

    Raises:
        TypeError: If ``model`` contains values ``json`` cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    # Create the result directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(RESULT_DIR, exist_ok=True)
    with open(os.path.join(RESULT_DIR, filename), 'w', encoding='utf-8') as file:
        # Stream to the file rather than building the whole JSON string in
        # memory first; model dumps can be large.
        json.dump(model, file)

@enable_logging("perform_lgb_kfold.log")
def perform_lgb_kfold(train, test, features, label, n_splits=5, random_state=22):
    """Train one LightGBM model per K-fold split and collect its predictions.

    Uses the module-level ``param_init``, ``ESR``, ``NBR`` and ``VBE`` settings
    and writes each fold's model to ``lightGBM_<fold>.json`` via ``dump``.

    Args:
        train: DataFrame holding the ``features`` columns and the ``label`` column.
        test: DataFrame holding the ``features`` columns.
        features: List of column names used as model inputs.
        label: Name of the target column in ``train``.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed for the shuffled ``KFold`` split.

    Returns:
        A tuple ``(prediction_test, predictions, cv_score)``:
        ``prediction_test`` is the SUM (not the mean) of the per-fold test
        predictions, so callers divide by the number of folds;
        ``predictions`` holds the out-of-fold prediction for every row of
        ``train``; ``cv_score`` is the list of per-fold validation RMSEs.
    """
    # NOTE(review): callers receive whatever `enable_logging` hands back;
    # presumably it forwards this function's return value -- confirm in
    # lama.util.decorators if the caller ever sees None.
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    prediction_test = 0
    cv_score = []
    # Populated by the lgb.record_evaluation callback during training.
    eval_results = {}
    predictions = np.zeros(train.shape[0])
    logger.info("Start Loop")
    for i, (train_index, validation_index) in enumerate(kf.split(train[features])):
        # KFold yields positional indices, so select rows with .iloc; .loc is
        # only equivalent when `train` has a default RangeIndex.
        fold_train = train.iloc[train_index]
        fold_valid = train.iloc[validation_index]

        train_lgb = lgb.Dataset(fold_train[features], fold_train[label])
        validation_lgb = lgb.Dataset(fold_valid[features], fold_valid[label])
        bst = lgb.train(param_init, train_lgb,
                        valid_sets=[train_lgb, validation_lgb],
                        valid_names=['train', 'valid'],
                        callbacks=[lgb.early_stopping(ESR),
                                   lgb.log_evaluation(VBE),
                                   lgb.record_evaluation(eval_results)],
                        num_boost_round=NBR)
        logger.debug(f"evalute results in {i} round: {eval_results}")

        prediction_test += bst.predict(test[features])
        validation_pre = bst.predict(fold_valid[features])
        # Store the out-of-fold predictions; previously `predictions` was
        # returned as all zeros because it was never written to.
        predictions[validation_index] = validation_pre
        score = np.sqrt(mean_squared_error(fold_valid[label].values, validation_pre))

        logger.debug(f'CV Score in {i} round: {score}')

        cv_score.append(score)
        logger.debug(f"validation rows in {i} round: {len(validation_index)}, "
                     f"predictions: {len(validation_pre)}")
        # dump_model's first positional argument is `num_iteration`, not a
        # fold id: passing `i` truncated the dumped model to its first `i`
        # iterations. With no argument the best iteration is dumped.
        dump(bst.dump_model(), f"lightGBM_{i}.json")
    return prediction_test, predictions, cv_score

In [42]:
# Pass the module-level fold count explicitly so the divisor used for averaging
# below always matches the number of folds that were actually trained.
prediction_test, predictions, cv_score = perform_lgb_kfold(
    train, test, features, label, n_splits=n_split)
# perform_lgb_kfold returns the sum of the per-fold test predictions; average it.
test['target'] = prediction_test / n_split
submission_path = os.path.join(RESULT_DIR, "submission_light_gbm_kfold.csv")
test[['card_id', 'target']].to_csv(submission_path, index=False)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22838
[LightGBM] [Info] Number of data points in the train set: 159768, number of used features: 148
[LightGBM] [Info] Start training from score -0.028512
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[19]	train's rmse: 1.57653	valid's rmse: 1.60552


TypeError: cannot unpack non-iterable NoneType object