- Reference
  - https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
- Data
  - https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification

In [1]:
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# Use the C-accelerated cPickle where it exists (Python 2); otherwise fall back
# to the standard-library pickle module under the same name.
try:
    import cPickle as pickle
except ImportError:
    # Only a missing module should trigger the fallback. Catching BaseException
    # would also swallow KeyboardInterrupt / SystemExit raised during the import.
    import pickle

In [3]:
print('Loading data...')

# Train/test tables: tab-separated, no header row, so columns are numbered from 0.
df_train, df_test = (
    pd.read_csv(path, header=None, sep='\t')
    for path in ('data/binary.train', 'data/binary.test')
)
# Weight files: read without a header and keep their first column as a Series.
W_train, W_test = (
    pd.read_csv(path, header=None)[0]
    for path in ('data/binary.train.weight', 'data/binary.test.weight')
)

Loading data...


In [4]:
# Column 0 is used as the target; every remaining column is kept as a feature.
y_train, y_test = df_train[0], df_test[0]
X_train, X_test = (frame.drop(columns=0) for frame in (df_train, df_test))

In [9]:
# Sanity check of the training frame: print (rows, columns), then display the first three rows.
print(df_train.shape)
df_train.head(3)

(7000, 29)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,0.869,-0.635,0.226,0.327,-0.69,0.754,-0.249,-1.092,0.0,...,-0.01,-0.046,3.102,1.354,0.98,0.978,0.92,0.722,0.989,0.877
1,1,0.908,0.329,0.359,1.498,-0.313,1.096,-0.558,-1.588,2.173,...,-1.139,-0.001,0.0,0.302,0.833,0.986,0.978,0.78,0.992,0.798
2,1,0.799,1.471,-1.636,0.454,0.426,1.105,1.282,1.382,0.0,...,1.129,0.9,0.0,0.91,1.108,0.986,0.951,0.803,0.866,0.78


In [10]:
# Sanity check of the test frame: print (rows, columns), then display the first three rows.
print(df_test.shape)
df_test.head(3)

(500, 29)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,0.644,0.247,-0.447,0.862,0.374,0.854,-1.126,-0.79,2.173,...,-0.19,-0.744,3.102,0.958,1.061,0.98,0.875,0.581,0.905,0.796
1,0,0.385,1.8,1.037,1.044,0.349,1.502,-0.966,1.734,0.0,...,-0.44,0.638,3.102,0.695,0.909,0.981,0.803,0.813,1.149,1.116
2,0,1.214,-0.166,0.004,0.505,1.434,0.628,-1.174,-1.23,1.087,...,-1.383,1.355,0.0,0.848,0.911,1.043,0.931,1.058,0.744,0.696


In [11]:
# Sanity check of the training weights: print the Series shape, then display the first three values.
print(W_train.shape)
W_train.head(3)

(7000,)


0    1.2
1    1.1
2    1.0
Name: 0, dtype: float64

In [12]:
# Sanity check of the test weights: print the Series shape, then display the first three values.
print(W_test.shape)
W_test.head(3)

(500,)


0    1.2
1    1.1
2    1.0
Name: 0, dtype: float64

In [15]:
# Row count and column count of the training feature matrix.
num_train = X_train.shape[0]
num_feature = X_train.shape[1]

In [16]:
# Wrap features, labels and per-row weights into LightGBM Dataset objects.
# free_raw_data=False keeps the raw data around so the Datasets can be re-used.
lgb_train = lgb.Dataset(
    data=X_train,
    label=y_train,
    weight=W_train,
    free_raw_data=False,
)
# The evaluation Dataset is tied to the training Dataset through `reference`.
lgb_eval = lgb.Dataset(
    data=X_test,
    label=y_test,
    reference=lgb_train,
    weight=W_test,
    free_raw_data=False,
)

In [17]:
# Training configuration passed to every lgb.train() call in this notebook.
params = {
    # task definition
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    # tree size and step size
    'num_leaves': 31,
    'learning_rate': 0.05,
    # feature / row subsampling settings
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # logging level
    'verbose': 0
}

# One synthetic name per feature column: feature_0, feature_1, ...
feature_name = ['feature_' + suffix for suffix in map(str, range(num_feature))]

In [19]:
# Install a global filter that ignores every Python warning from this point on.
# NOTE(review): this blanket filter also hides warnings from all later cells (e.g. deprecations);
# consider narrowing it with category=/module= once the noisy warning is identified,
# and moving the import to the imports cell at the top of the notebook.
import warnings
warnings.filterwarnings('ignore')

In [20]:
print('Starting training...')

# First 10 boosting rounds, evaluated on the training data itself.
# The generated feature names are attached here and column 21 is declared categorical.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,
    feature_name=feature_name,
    categorical_feature=[21],
)

print('Finished first 10 rounds...')
# Read one name back from the training Dataset to show which names it now carries.
print('7th feature name is:', lgb_train.feature_name[6])

Starting training...
[1]	training's binary_logloss: 0.680295
[2]	training's binary_logloss: 0.672016
[3]	training's binary_logloss: 0.664438
[4]	training's binary_logloss: 0.655529
[5]	training's binary_logloss: 0.647367
[6]	training's binary_logloss: 0.64078
[7]	training's binary_logloss: 0.635005
[8]	training's binary_logloss: 0.628445
[9]	training's binary_logloss: 0.622414
[10]	training's binary_logloss: 0.616798
Finished first 10 rounds...
7th feature name is: feature_6


In [21]:
print('Saving model...')

# Write the trained model to a text file; later cells reload it with
# lgb.Booster(model_file=...) and pass the same path as init_model.
# save_model() is the cell's last expression, so the notebook echoes the object it returns.
gbm.save_model('data/advanced_example_model.txt')

Saving model...


<lightgbm.basic.Booster at 0x124bd08d0>

In [22]:
print('Dumping model to JSON...')

# Export the model as a Python structure that the json module can serialise.
model_json = gbm.dump_model()

# Persist it as 4-space indented JSON next to the text model.
with open('data/advanced_example_model.json', 'w+') as f:
    f.write(json.dumps(model_json, indent=4))

Dumping model to JSON...


In [23]:
# Print the feature names stored in the Booster, followed by the importance values
# (wrapped in list() so they print as a flat Python list).
print('Feature names:', gbm.feature_name())
print('Feature importances:', list(gbm.feature_importance()))

Feature names: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27']
Feature importances: [8, 4, 0, 19, 8, 36, 3, 0, 2, 10, 5, 1, 0, 9, 5, 3, 0, 2, 2, 5, 1, 0, 35, 3, 28, 45, 31, 35]


In [24]:
print('Loading model to predict...')

# Rebuild a Booster from the saved text model and predict on the test features.
bst = lgb.Booster(model_file='data/advanced_example_model.txt')
y_pred = bst.predict(X_test)

# Root of the mean squared error between the test labels and the predictions.
loaded_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("The rmse of loaded model's prediction is:", loaded_rmse)

Loading model to predict...
The rmse of loaded model's prediction is: 0.4618158981559113


In [25]:
print('Dumping and loading model with pickle...')

pickle_path = 'data/advanced_example_model.pkl'

# Round-trip the in-memory Booster through pickle.
with open(pickle_path, 'wb') as fout:
    pickle.dump(gbm, fout)

# The file being unpickled is the one written just above.
with open(pickle_path, 'rb') as fin:
    pkl_bst = pickle.load(fin)

# Predict with num_iteration=7, i.e. fewer iterations than the 10 that were trained.
y_pred = pkl_bst.predict(X_test, num_iteration=7)

pickled_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("The rmse of pickled model's prediction is:", pickled_rmse)

Dumping and loading model with pickle...
The rmse of pickled model's prediction is: 0.4698928558113315


In [26]:
# Continue training for 10 more rounds on top of the model saved to disk.
# init_model can be given either as a model file name (as here) or as a Booster.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10,
    init_model='data/advanced_example_model.txt',
    valid_sets=lgb_eval,
)

print('Finished 10 - 20 rounds with model file...')

[11]	valid_0's binary_logloss: 0.613921
[12]	valid_0's binary_logloss: 0.610303
[13]	valid_0's binary_logloss: 0.606235
[14]	valid_0's binary_logloss: 0.601755
[15]	valid_0's binary_logloss: 0.597769
[16]	valid_0's binary_logloss: 0.594371
[17]	valid_0's binary_logloss: 0.590591
[18]	valid_0's binary_logloss: 0.587501
[19]	valid_0's binary_logloss: 0.584381
[20]	valid_0's binary_logloss: 0.581715
Finished 10 - 20 rounds with model file...


In [27]:
# Continue from the in-memory Booster while changing a parameter during training:
# reset_parameter receives one bagging_fraction value per round
# (0.7 for the first five rounds, 0.6 for the remaining five).
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

# The model already held 20 rounds, so this run adds iterations 21-30;
# the message previously claimed "30 - 40".
print('Finished 20 - 30 rounds with changing bagging_fraction...')

[21]	valid_0's binary_logloss: 0.578808
[22]	valid_0's binary_logloss: 0.576947
[23]	valid_0's binary_logloss: 0.574259
[24]	valid_0's binary_logloss: 0.571787
[25]	valid_0's binary_logloss: 0.569801
[26]	valid_0's binary_logloss: 0.568328
[27]	valid_0's binary_logloss: 0.565944
[28]	valid_0's binary_logloss: 0.564784
[29]	valid_0's binary_logloss: 0.562965
[30]	valid_0's binary_logloss: 0.562213
Finished 30 - 40 rounds with changing bagging_fraction...


In [28]:
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    """Self-defined log-likelihood objective.

    Args:
        preds: raw scores (before the logistic transformation).
        train_data: object exposing ``get_label()`` with the true labels.

    Returns:
        ``(grad, hess)``: first and second derivatives of the loss with
        respect to the raw scores.
    """
    # NOTE(review): any sample weights held by train_data are not used here;
    # confirm whether that is intended for a weighted Dataset.
    y_true = train_data.get_label()
    # Map raw scores to probabilities with the logistic function.
    prob = 1. / (1. + np.exp(-preds))
    return prob - y_true, prob * (1. - prob)


# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# binary error
# NOTE: when you use a customized loss function, the default prediction value is the margin
# This may make built-in evaluation metrics calculate wrong results
# For example, with log likelihood loss, the prediction is the score before the logistic transformation
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
    """Self-defined eval metric: share of misclassified rows.

    Args:
        preds: raw scores (before the logistic transformation).
        train_data: object exposing ``get_label()`` with the true labels.

    Returns:
        ``('error', value, False)``; the final ``False`` marks the metric
        as lower-is-better.
    """
    y_true = train_data.get_label()
    # Convert raw scores to probabilities, then threshold at 0.5.
    predicted_positive = 1. / (1. + np.exp(-preds)) > 0.5
    mismatches = y_true != predicted_positive
    return 'error', np.mean(mismatches), False

In [29]:
# Continue training with the self-defined objective (fobj) and eval metric (feval).
# params still requests binary_logloss, so that built-in metric is logged as well;
# with a custom objective the predictions are raw margins, so only the custom
# 'error' metric is meaningful in this run's log.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)

# This run adds iterations 31-40; the message previously claimed "40 - 50".
print('Finished 30 - 40 rounds with self-defined objective function and eval metric...')

[31]	valid_0's binary_logloss: 5.05771	valid_0's error: 0.262
[32]	valid_0's binary_logloss: 5.05289	valid_0's error: 0.26
[33]	valid_0's binary_logloss: 5.1432	valid_0's error: 0.262
[34]	valid_0's binary_logloss: 5.13396	valid_0's error: 0.258
[35]	valid_0's binary_logloss: 5.06775	valid_0's error: 0.254
[36]	valid_0's binary_logloss: 5.12443	valid_0's error: 0.258
[37]	valid_0's binary_logloss: 5.11177	valid_0's error: 0.256
[38]	valid_0's binary_logloss: 5.16895	valid_0's error: 0.256
[39]	valid_0's binary_logloss: 5.22741	valid_0's error: 0.256
[40]	valid_0's binary_logloss: 5.40895	valid_0's error: 0.262
Finished 40 - 50 rounds with self-defined objective function and eval metric...


In [30]:
# another self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# accuracy
# NOTE: when you use a customized loss function, the default prediction value is the margin
# This may make built-in evaluation metrics calculate wrong results
# For example, with log likelihood loss, the prediction is the score before the logistic transformation
# Keep this in mind when you use the customization
def accuracy(preds, train_data):
    """Self-defined eval metric: share of correctly classified rows.

    Args:
        preds: raw scores (before the logistic transformation).
        train_data: object exposing ``get_label()`` with the true labels.

    Returns:
        ``('accuracy', value, True)``; the final ``True`` marks the metric
        as higher-is-better.
    """
    y_true = train_data.get_label()
    # Convert raw scores to probabilities, then threshold at 0.5.
    predicted_positive = 1. / (1. + np.exp(-preds)) > 0.5
    matches = y_true == predicted_positive
    return 'accuracy', np.mean(matches), True

In [31]:
# Continue training with the self-defined objective and two self-defined metrics:
# feval returns a list of (name, value, is_higher_better) tuples, one per metric.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=lambda preds, train_data: [binary_error(preds, train_data),
                                                  accuracy(preds, train_data)],
                valid_sets=lgb_eval)

# This run adds iterations 41-50; the message previously claimed "50 - 60".
print('Finished 40 - 50 rounds with self-defined objective function '
      'and multiple self-defined eval metrics...')

[41]	valid_0's binary_logloss: 5.39975	valid_0's error: 0.258	valid_0's accuracy: 0.742
[42]	valid_0's binary_logloss: 5.39204	valid_0's error: 0.256	valid_0's accuracy: 0.744
[43]	valid_0's binary_logloss: 5.14792	valid_0's error: 0.248	valid_0's accuracy: 0.752
[44]	valid_0's binary_logloss: 5.32308	valid_0's error: 0.252	valid_0's accuracy: 0.748
[45]	valid_0's binary_logloss: 5.3273	valid_0's error: 0.25	valid_0's accuracy: 0.75
[46]	valid_0's binary_logloss: 5.31388	valid_0's error: 0.248	valid_0's accuracy: 0.752
[47]	valid_0's binary_logloss: 5.3156	valid_0's error: 0.25	valid_0's accuracy: 0.75
[48]	valid_0's binary_logloss: 5.31168	valid_0's error: 0.242	valid_0's accuracy: 0.758
[49]	valid_0's binary_logloss: 5.36828	valid_0's error: 0.244	valid_0's accuracy: 0.756
[50]	valid_0's binary_logloss: 5.30726	valid_0's error: 0.242	valid_0's accuracy: 0.758
Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...


In [32]:
# Marks the start of the callback demo; the lgb.train() call that follows passes no init_model.
print('Starting a new training job...')


# callback
def reset_metrics():
    """Build a training callback that adds an extra validation set mid-run.

    Returns:
        A callable taking the callback environment ``env``. It carries the
        attributes ``before_iteration = True`` and ``order = 0``.
        NOTE(review): these look like the attributes LightGBM uses to schedule
        callbacks - confirm against the callback documentation.
    """
    def callback(env):
        # Act only on the call made five iterations after training began.
        if env.iteration - env.begin_iteration != 5:
            return
        # Build the extra Dataset only when it is needed; previously a new
        # Dataset object was created on every iteration and then discarded.
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        print('Add a new valid dataset at iteration 5...')
        env.model.add_valid(lgb_eval_new, 'new_valid')
    callback.before_iteration = True
    callback.order = 0
    return callback

Starting a new training job...


In [33]:
# Train 10 rounds with the custom callback installed; evaluation starts on the
# training data only, and the callback registers 'new_valid' part-way through.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,
    callbacks=[reset_metrics()],
)

print('Finished first 10 rounds with callback function...')

[1]	training's binary_logloss: 0.519128
[2]	training's binary_logloss: 0.517287
[3]	training's binary_logloss: 0.515305
[4]	training's binary_logloss: 0.513326
[5]	training's binary_logloss: 0.511376
Add a new valid dataset at iteration 5...
[6]	training's binary_logloss: 0.509532	new_valid's binary_logloss: 0.676463
[7]	training's binary_logloss: 0.507538	new_valid's binary_logloss: 0.67376
[8]	training's binary_logloss: 0.505539	new_valid's binary_logloss: 0.671616
[9]	training's binary_logloss: 0.503921	new_valid's binary_logloss: 0.669828
[10]	training's binary_logloss: 0.502099	new_valid's binary_logloss: 0.66739
Finished first 10 rounds with callback function...
