## Global Settings and Import

In [35]:
import sys, os
sys.path.append("../../")
import numpy as np
import lightgbm as lgb
import papermill as pm
import pandas as pd
import category_encoders as ce
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss

import import_ipynb
import lightgbm_utils as lgb_utils
#import reco_utils.dataset.criteo as criteo

print("System version: {}".format(sys.version))
print("LightGBM version: {}".format(lgb.__version__))

System version: 3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]
LightGBM version: 3.0.0


## Parameter Setting

In [36]:

MAX_LEAF = 64
MIN_DATA = 20
NUM_OF_TREES = 100
TREE_LEARNING_RATE = 0.15
EARLY_STOPPING_ROUNDS = 20
METRIC = "auc"
SIZE = "sample"

In [37]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'num_class': 1,
    'objective': "binary",
    'metric': METRIC,
    'num_leaves': MAX_LEAF,
    'min_data': MIN_DATA,
    'boost_from_average': True,
    #set it according to your cpu cores.
    'num_threads': 20,
    'feature_fraction': 0.8,
    'learning_rate': TREE_LEARNING_RATE,
}

## Import Data

In [38]:
all_data = pd.read_csv("final_snack_data.csv")
all_data.head()

Unnamed: 0,user_id,Item_id,Rating,Label,I1,I2,I3,I4,I5,I6,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,50,7,3,0,1.0,1,5.0,0.0,1382.0,4.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,39,2,2,0,2.0,0,44.0,1.0,102.0,8.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,13,9,2,0,2.0,0,1.0,14.0,767.0,89.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,40,7,2,0,,893,,,4392.0,,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,41,13,2,0,3.0,-1,,0.0,2.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [39]:
nume_cols = ["I" + str(i) for i in range(1, 14)]
cate_cols = ["C" + str(i) for i in range(1, 27)]
label_col = "Label"

In [40]:
all_data.head()

Unnamed: 0,user_id,Item_id,Rating,Label,I1,I2,I3,I4,I5,I6,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,50,7,3,0,1.0,1,5.0,0.0,1382.0,4.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,39,2,2,0,2.0,0,44.0,1.0,102.0,8.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,13,9,2,0,2.0,0,1.0,14.0,767.0,89.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,40,7,2,0,,893,,,4392.0,,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,41,13,2,0,3.0,-1,,0.0,2.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [41]:
# split data to 3 sets    
length = len(all_data)
train_data = all_data.loc[:0.8*length-1]
valid_data = all_data.loc[0.8*length:0.9*length-1]
test_data = all_data.loc[0.9*length:]

## Basic Usage
### Ordinal Encoding

In [42]:
ord_encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)

def encode_csv(df, encoder, label_col, typ='fit'):
    if typ == 'fit':
        df = encoder.fit_transform(df)
    else:
        df = encoder.transform(df)
    y = df[label_col].values
    del df[label_col]
    return df, y

train_x, train_y = encode_csv(train_data, ord_encoder, label_col)
valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, 'transform')
test_x, test_y = encode_csv(test_data, ord_encoder, label_col, 'transform')

print('Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\n'
      .format(trn_x_shape=train_x.shape,
              trn_y_shape=train_y.shape,
              vld_x_shape=valid_x.shape,
              vld_y_shape=valid_y.shape,
              tst_x_shape=test_x.shape,
              tst_y_shape=test_y.shape,))
train_x.head()

Train Data Shape: X: (8000, 42); Y: (8000,).
Valid Data Shape: X: (1000, 42); Y: (1000,).
Test Data Shape: X: (1000, 42); Y: (1000,).



Unnamed: 0,user_id,Item_id,Rating,I1,I2,I3,I4,I5,I6,I7,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,50,7,3,1.0,1,5.0,0.0,1382.0,4.0,15.0,...,1,1,1,1,1,1,1,1,1,1
1,39,2,2,2.0,0,44.0,1.0,102.0,8.0,2.0,...,2,2,1,2,2,1,1,2,1,2
2,13,9,2,2.0,0,1.0,14.0,767.0,89.0,4.0,...,3,3,2,3,3,2,1,3,2,3
3,40,7,2,,893,,,4392.0,,0.0,...,4,4,2,3,4,1,1,4,2,3
4,41,13,2,3.0,-1,,0.0,2.0,0.0,3.0,...,4,5,2,3,5,1,2,5,2,3


## Create Model

In [43]:
lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params, categorical_feature=cate_cols)
lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)
lgb_test = lgb.Dataset(test_x, test_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)
lgb_model = lgb.train(params,
                      lgb_train,
                      num_boost_round=NUM_OF_TREES,
                      early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                      valid_sets=lgb_valid,
                      categorical_feature=cate_cols)

[LightGBM] [Info] Number of positive: 1728, number of negative: 6272
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8368
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216000 -> initscore=-1.289131
[LightGBM] [Info] Start training from score -1.289131
[1]	valid_0's auc: 0.654173
Training until validation scores don't improve for 20 rounds
[2]	valid_0's auc: 0.654942
[3]	valid_0's auc: 0.664734
[4]	valid_0's auc: 0.664914
[5]	valid_0's auc: 0.674381
[6]	valid_0's auc: 0.678155
[7]	valid_0's auc: 0.678046
[8]	valid_0's auc: 0.682877
[9]	valid_0's auc: 0.681081
[10]	valid_0's auc: 0.682442
[11]	valid_0's auc: 0.68438
[12]	valid_0's auc: 0.686821
[13]	valid_0's auc: 0.687427




[14]	valid_0's auc: 0.687587
[15]	valid_0's auc: 0.686624
[16]	valid_0's auc: 0.686572
[17]	valid_0's auc: 0.685349
[18]	valid_0's auc: 0.687676
[19]	valid_0's auc: 0.687356
[20]	valid_0's auc: 0.685658
[21]	valid_0's auc: 0.686264
[22]	valid_0's auc: 0.686452
[23]	valid_0's auc: 0.686715
[24]	valid_0's auc: 0.688922
[25]	valid_0's auc: 0.688842
[26]	valid_0's auc: 0.68835
[27]	valid_0's auc: 0.691123
[28]	valid_0's auc: 0.691214
[29]	valid_0's auc: 0.690203
[30]	valid_0's auc: 0.689442
[31]	valid_0's auc: 0.688133
[32]	valid_0's auc: 0.689591
[33]	valid_0's auc: 0.690334
[34]	valid_0's auc: 0.691169
[35]	valid_0's auc: 0.691072
[36]	valid_0's auc: 0.689408
[37]	valid_0's auc: 0.689288
[38]	valid_0's auc: 0.690443
[39]	valid_0's auc: 0.689842
[40]	valid_0's auc: 0.689848
[41]	valid_0's auc: 0.689345
[42]	valid_0's auc: 0.688945
[43]	valid_0's auc: 0.688619
[44]	valid_0's auc: 0.688642
[45]	valid_0's auc: 0.687933
[46]	valid_0's auc: 0.687596
[47]	valid_0's auc: 0.688785
[48]	valid_0's

In [44]:
test_preds = lgb_model.predict(test_x)
auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))
logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)
res_basic = {"auc": auc, "logloss": logloss}
print(res_basic)
#pm.record("res_basic", res_basic)

{'auc': 0.7391941641668939, 'logloss': 0.4764193594328149}


## Optimized Usage
### Label-encoding and Binary-encoding

In [45]:
label_col = 'Label'
num_encoder = lgb_utils.NumEncoder(cate_cols, nume_cols, label_col)
train_x, train_y = num_encoder.fit_transform(train_data)
valid_x, valid_y = num_encoder.transform(valid_data)
test_x, test_y = num_encoder.transform(test_data)
del num_encoder
print('Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\n'
      .format(trn_x_shape=train_x.shape,
              trn_y_shape=train_y.shape,
              vld_x_shape=valid_x.shape,
              vld_y_shape=valid_y.shape,
              tst_x_shape=test_x.shape,
              tst_y_shape=test_y.shape,))

2020-11-16 01:24:07,739 [INFO] Filtering and fillna features
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 14.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 686.85it/s]
2020-11-16 01:24:09,524 [INFO] Ordinal encoding cate features
2020-11-16 01:24:09,823 [INFO] Target encoding cate features
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 27.45it/s]
2020-11-16 01:24:10,774 [INFO] Start manual binary encoding
100%|██████████████████████████████████████████████████████████████████████████████████| 65/65 [00:03<00:00, 17.63it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 15.49it/s]
2020-11-16 01:24:16,259 [INFO] Filtering and fillna features
100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [

Train Data Shape: X: (8000, 209); Y: (8000, 1).
Valid Data Shape: X: (1000, 209); Y: (1000, 1).
Test Data Shape: X: (1000, 209); Y: (1000, 1).



## Training and Evaluation

In [46]:
lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params)
lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train)
lgb_model = lgb.train(params,
                      lgb_train,
                      num_boost_round=NUM_OF_TREES,
                      early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                      valid_sets=lgb_valid)

[LightGBM] [Info] Number of positive: 1728, number of negative: 6272
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15280
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 207
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216000 -> initscore=-1.289131
[LightGBM] [Info] Start training from score -1.289131
[1]	valid_0's auc: 0.643574
Training until validation scores don't improve for 20 rounds
[2]	valid_0's auc: 0.6782
[3]	valid_0's auc: 0.679549
[4]	valid_0's auc: 0.68496
[5]	valid_0's auc: 0.680404
[6]	valid_0's auc: 0.688502
[7]	valid_0's auc: 0.692261
[8]	valid_0's auc: 0.693164
[9]	valid_0's auc: 0.692964
[10]	valid_0's auc: 0.698521
[11]	valid_0's auc: 0.699304
[12]	valid_0's auc: 0.697717
[13]	valid_0's auc: 0.695222
[14]	valid_0's auc: 0.695228
[15]	valid_0's auc: 0.693513
[16]	valid_0's auc: 0.695311
[17]	valid_0's auc: 0.693178
[18]	

In [47]:
test_preds = lgb_model.predict(test_x)
auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))
logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)
res_optim = {"auc": auc, "logloss": logloss}
print(res_optim)
#pm.record("res_optim", res_optim)

{'auc': 0.7110859240069084, 'logloss': 0.4842206442402837}


## Model saving and loading

In [48]:
with TemporaryDirectory() as tmp:
    save_file = os.path.join(tmp, r'finished.model')
    lgb_model.save_model(save_file)
    loaded_model = lgb.Booster(model_file=save_file)

# eval the performance again
test_preds = loaded_model.predict(test_x)

auc = roc_auc_score(np.asarray(test_y.reshape(-1)), np.asarray(test_preds))
logloss = log_loss(np.asarray(test_y.reshape(-1)), np.asarray(test_preds), eps=1e-12)
print({"auc": auc, "logloss": logloss})

{'auc': 0.7110859240069084, 'logloss': 0.4842206442402837}
