This Jupyter notebook trains LightGBM models with the native Python training API (`lgb.Dataset` and `lgb.train`); see https://lightgbm.readthedocs.io/en/latest/Python-API.html#training-api

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import time

# train_val

In [2]:
# Read the train/validation feature and label tables for the `train_val`
# split. TransactionID is removed from every table right after loading, so the
# frames handed to LightGBM in the next cell no longer carry that column.
X_train = pd.read_csv('../data/train_val/X_train.csv').drop(columns=['TransactionID'])
X_val = pd.read_csv('../data/train_val/X_val.csv').drop(columns=['TransactionID'])
y_train = pd.read_csv('../data/train_val/y_train.csv').drop(columns=['TransactionID'])
y_val = pd.read_csv('../data/train_val/y_val.csv').drop(columns=['TransactionID'])

In [3]:
# Wrap the frames in LightGBM Dataset objects. The validation set is tied to
# the training set through `reference`.
train_data = lgb.Dataset(
    X_train,
    label=y_train,
)
val_data = lgb.Dataset(
    X_val,
    label=y_val,
    reference=train_data,
)

In [4]:
# Training parameters: binary objective, AUC as the evaluation metric,
# and 31 leaves per tree.
param = dict(
    num_leaves=31,
    objective='binary',
    metric='auc',
)

In [5]:
# Train the booster on `train_data`, evaluating on `val_data` after each
# boosting round, and report the wall-clock time of the training call.
start = time.time()

num_round = 500  # upper bound on boosting rounds; early stopping may end sooner

# Early stopping is requested through a callback rather than the
# `early_stopping_rounds` keyword: that keyword of `lgb.train` was deprecated
# in LightGBM 3.3 and removed in 4.0, where passing it raises a TypeError.
# The callback stops training once the validation metric has not improved for
# 20 consecutive rounds.
# NOTE: on LightGBM >= 4.0, per-round metric lines are only printed if
# `lgb.log_evaluation()` is also added to `callbacks`.
bst = lgb.train(
    param,
    train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

end = time.time()
print(f"Runtime of the program is {end - start}")

[LightGBM] [Info] Number of positive: 15515, number of negative: 427390
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23108
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035030 -> initscore=-3.315890
[LightGBM] [Info] Start training from score -3.315890
[1]	valid_0's auc: 0.787765
Training until validation scores don't improve for 20 rounds
[2]	valid_0's auc: 0.834649
[3]	valid_0's auc: 0.837784
[4]	valid_0's auc: 0.841588
[5]	valid_0's auc: 0.845685
[6]	valid_0's auc: 0.848301
[7]	valid_0's auc: 0.849182
[8]	valid_0's auc: 0.852232
[9]	valid_0's auc: 0.853845
[10]	valid_0's auc: 0.85495
[11]	valid_0's auc: 0.858606
[12]	valid_0's auc: 0.860523
[13]	valid_0's auc: 0.86094
[14]	valid_0's auc: 0.862067
[15]	valid_0's auc: 0.865101
[16]	valid_0's auc: 0.866178
[17]	valid_0's auc: 0.867287

# train_val_uid

In [9]:
# Read the train/validation feature and label tables for the `train_val_uid`
# split. TransactionID is removed from every table right after loading, so the
# frames handed to LightGBM in the next cell no longer carry that column.
X_train_uid = pd.read_csv('../data/train_val_uid/X_train.csv').drop(columns=['TransactionID'])
X_val_uid = pd.read_csv('../data/train_val_uid/X_val.csv').drop(columns=['TransactionID'])
y_train_uid = pd.read_csv('../data/train_val_uid/y_train.csv').drop(columns=['TransactionID'])
y_val_uid = pd.read_csv('../data/train_val_uid/y_val.csv').drop(columns=['TransactionID'])

In [10]:
# Wrap the uid-split frames in LightGBM Dataset objects. The validation set is
# tied to the training set through `reference`.
train_data_uid = lgb.Dataset(
    X_train_uid,
    label=y_train_uid,
)
val_data_uid = lgb.Dataset(
    X_val_uid,
    label=y_val_uid,
    reference=train_data_uid,
)

In [11]:
# Training parameters for the uid run: binary objective, AUC as the
# evaluation metric, and 31 leaves per tree.
param = dict(
    num_leaves=31,
    objective='binary',
    metric='auc',
)

In [14]:
# Train the booster on `train_data_uid`, evaluating on `val_data_uid` after
# each boosting round, and report the wall-clock time of the training call.
start = time.time()

num_round = 500  # upper bound on boosting rounds; early stopping may end sooner

# Early stopping is requested through a callback rather than the
# `early_stopping_rounds` keyword: that keyword of `lgb.train` was deprecated
# in LightGBM 3.3 and removed in 4.0, where passing it raises a TypeError.
# The callback stops training once the validation metric has not improved for
# 20 consecutive rounds.
# NOTE: on LightGBM >= 4.0, per-round metric lines are only printed if
# `lgb.log_evaluation()` is also added to `callbacks`.
bst_uid = lgb.train(
    param,
    train_data_uid,
    num_boost_round=num_round,
    valid_sets=[val_data_uid],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

end = time.time()
print(f"Runtime of the program is {end - start}")

[LightGBM] [Info] Number of positive: 15518, number of negative: 427387
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23116
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035037 -> initscore=-3.315689
[LightGBM] [Info] Start training from score -3.315689
[1]	valid_0's auc: 0.786708
Training until validation scores don't improve for 20 rounds
[2]	valid_0's auc: 0.83328
[3]	valid_0's auc: 0.836397
[4]	valid_0's auc: 0.839735
[5]	valid_0's auc: 0.841831
[6]	valid_0's auc: 0.847224
[7]	valid_0's auc: 0.850672
[8]	valid_0's auc: 0.854141
[9]	valid_0's auc: 0.855172
[10]	valid_0's auc: 0.854999
[11]	valid_0's auc: 0.856813
[12]	valid_0's auc: 0.859255
[13]	valid_0's auc: 0.861535
[14]	valid_0's auc: 0.861421
[15]	valid_0's auc: 0.865274
[16]	valid_0's auc: 0.866998
[17]	valid_0's auc: 0.86862