In [1]:
import numpy as np
import pandas as pd
import model
from typing import Union

import lightgbm as lgb

In [2]:
data = pd.read_parquet('../aai_banca_ls_v3/feature/temp_2021_09_30_12_55_22/part-00000-c1c51e7f-d1f5-4197-babd-9451ffe596b9-c000.snappy.parquet')

# Convert AGE from string to float
# AGE contains None, consequence of forgeting converting to pandas NA when generating features
data['AGE'] = data['AGE'].fillna(np.nan).astype({'AGE': float})

# Convert GENDER to integer, in order to be able to convert to categorical later in model
data['GENDER'] = data['GENDER'].astype('category').cat.codes

# Convert CITY using target encoding
city_label = (
    data
    .groupby('CITY')
    .agg({'LABEL': np.mean})
    .reset_index()
)
city_label_dict = dict(zip(city_label['CITY'], city_label['LABEL']))
data['CITY'] = data['CITY'].replace(city_label_dict)
data = data.sample(frac=0.1)

print(data.shape)
data.head()

(13127, 615)


Unnamed: 0,CUSID,REF_TIME,LABEL_TIME,LABEL,KIND,TD_MAX_TD_AVG_BAL_6M,TD_MEAN_TD_AVG_BAL_6M,TD_MAX_NO_AR_TD_6M,TD_MAX_TD_AVG_BAL_3M,TD_MEAN_TD_AVG_BAL_3M,...,PRODUCT_BOND_6M,PRODUCT_HOME_EQUITY_6M,PRODUCT_CASA_6M,PRODUCT_CC_6M,PRODUCT_TD_6M,PRODUCT_LD_MORTGAGE_PROJECT_6M,PRODUCT_LD_MORTGAGE_NORMAL_6M,PRODUCT_LD_AUTO_6M,PRODUCT_LD_PASSBOOK_6M,PRODUCT_LD_HOUSEHOLD_6M
3270,34107182,20200731,202009,0,train,,,,,,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124820,36057829,20200531,202007,0,train,,,,,,...,,,,,,,,,,
94618,26262720,20210228,202104,0,train,8571429.0,8571429.0,1.0,8571429.0,8571429.0,...,0.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
119518,32582022,20200930,202011,0,test,,,,,,...,0.0,0.0,6.0,6.0,0.0,0.0,6.0,0.0,0.0,0.0
30077,30949403,20210131,202103,0,train,,,,,,...,,,,,,,,,,


In [3]:
all_cols = data.columns
label_col = 'LABEL'
feat_cols = [col for col in all_cols if col not in ['CUSID', 'REF_TIME', 'LABEL_TIME', 'LABEL', 'KIND']]

In [4]:
X_train, X_val, X_test, y_train, y_val, y_test = model.train_val_test_split(data, val_size=1, test_size=1, label_col='LABEL', feat_cols=feat_cols, label_time_col='LABEL_TIME')
train, val, test = model.convert_to_lgb_data(X_train, X_val, X_test, y_train, y_val, y_test)

In [5]:
params_dict = {
    # tree booster
    'learning_rate':[0.1, 0.2],
    # 'eta': [0.01, 0.05, 0.10],
    # 'subsample': [0.3, 0.5, 0.7],
    'max_leaves': [64, 256, 1024],
    # 'max_depth': [4, 16, 32],
    # 'max_bin': [64, 256, 1024],
    # 'lambda': [1, 10, 100],
    # 'alpha': [0, 10, 100],

    # learning task
    'objective': ['binary'],
    'metric': ['auc'],
    'verbose': [-1]
}

In [7]:
grid_search = model.grid_search_lgb(train=train, val=val, params_dict=params_dict)

There are 6 hyperparameter sets.
Finishing    1/6             ---> Remaining 00:00:19
Finishing    2/6             ---> Remaining 00:00:28
Finishing    3/6             ---> Remaining 00:00:33
Finishing    4/6             ---> Remaining 00:00:19
Finishing    5/6             ---> Remaining 00:00:09
Finishing    6/6             ---> Remaining 00:00:00
Done


In [None]:
random.shuffle(list1)

In [8]:
grid_search

Unnamed: 0,learning_rate,max_leaves,objective,metric,verbose,metric_val_last,metric_val_max,metric_val_max_index,gap_at_val_max
0,0.1,64,binary,auc,-1,0.640783,0.670509,0,0.166083
1,0.1,256,binary,auc,-1,0.609884,0.683976,20,0.309629
2,0.1,1024,binary,auc,-1,0.621128,0.669902,35,0.324569
3,0.2,64,binary,auc,-1,0.56839,0.734652,2,0.185186
4,0.2,256,binary,auc,-1,0.645151,0.693845,11,0.29967
5,0.2,1024,binary,auc,-1,0.672005,0.709051,14,0.285067


In [30]:
params={'learning_rate': 0.2, 'max_leaves': 1024, 'objective': 'binary', 'metric': 'auc'}

In [32]:
evals_result

{'train': OrderedDict([('auc',
               [0.9205950243788019,
                0.9515667056808684,
                0.9695284170497268,
                0.9782565761344807,
                0.9831135397988541,
                0.9864338063308904,
                0.9887733347241133,
                0.990455507793089,
                0.9916322876627139,
                0.9927643260704792])]),
 'val': OrderedDict([('auc',
               [0.5482488069238858,
                0.5978726846234733,
                0.5862654695462266,
                0.595729191943703,
                0.6313192590795115,
                0.6567985116881015,
                0.653158618458303,
                0.648386314001456,
                0.6711154250586427,
                0.6565558521394483])])}

In [38]:
evals_result = {}
booster = lgb.train(
    params=params,
    train_set=train,
    num_boost_round=10,
    valid_sets=[train, val],
    valid_names=['train', 'val'],
    verbose_eval=1,
    evals_result=evals_result
)

[LightGBM] [Info] Number of positive: 341, number of negative: 11077
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 73303
[LightGBM] [Info] Number of data points in the train set: 11418, number of used features: 554
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029865 -> initscore=-3.480744
[LightGBM] [Info] Start training from score -3.480744
[1]	train's auc: 0.920595	val's auc: 0.548249
[2]	train's auc: 0.951567	val's auc: 0.597873
[3]	train's auc: 0.969528	val's auc: 0.586265
[4]	train's auc: 0.978257	val's auc: 0.595729
[5]	train's auc: 0.983114	val's auc: 0.631319
[6]	train's auc: 0.986434	val's auc: 0.656799
[7]	train's auc: 0.988773	val's auc: 0.653159
[8]	train's auc: 0.990456	val's auc: 0.648386
[9]	train's auc: 0.991632	val's auc: 0.671115
[10]	train's auc: 0.992764	val's auc: 0.656556


In [18]:
a = model.train(train, params={'learning_rate': 0.2, 'max_leaves': 1024, 'objective': 'binary', 'metric': 'auc'}, num_boost_round=14)

[LightGBM] [Info] Number of positive: 341, number of negative: 11077
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73303
[LightGBM] [Info] Number of data points in the train set: 11418, number of used features: 554
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029865 -> initscore=-3.480744
[LightGBM] [Info] Start training from score -3.480744


In [9]:
booster = model.get_best_model(train=train, params_dict=params_dict, grid_search=grid_search, criteria='metric_val_max')

In [59]:
params1={
    'max_leaves': 64,
    'objective': 'binary',
    'metric': 'auc',
    'verbose': -1
}
params2={
    'max_leaves': 60,
    'objective': 'abc',
    'metric': 'zxv',
    'verbose': -2
}

In [72]:
pd.DataFrame([params1, params2]).iloc[0].to_dict()

{'max_leaves': 64, 'objective': 'binary', 'metric': 'auc', 'verbose': -1}

In [57]:
params.update(params)

In [58]:
params

{'max_leaves': 64, 'objective': 'binary', 'metric': 'auc', 'verbose': -1}

In [47]:
evals_result = {}
booster = lgb.train(
    params=params,
    train_set=train,
    num_boost_round=100,
    valid_sets=[train, val],
    valid_names=['train', 'val'],
    early_stopping_rounds=10,
    verbose_eval=False,
    evals_result=evals_result
)

In [49]:
evals_result['a'] = 9

In [50]:
evals_result

{'train': OrderedDict([('auc',
               [0.8147678621822373,
                0.8768427888516215,
                0.8918853152963816,
                0.9089427978438376,
                0.9173831718060076,
                0.9303648740713982,
                0.9401516186118661,
                0.9460012236669947,
                0.9595053555514496,
                0.9647809617939652,
                0.9713892041059444,
                0.9735026340455669,
                0.9770898026973021,
                0.9804007263222898,
                0.9827919561387165,
                0.9839390467680758,
                0.9847330568637171,
                0.985643810284452,
                0.9863162829339917])]),
 'val': OrderedDict([('auc',
               [0.42549947423764456,
                0.5032071503680337,
                0.46514195583596213,
                0.47534174553102,
                0.531335436382755,
                0.5105152471083071,
                0.4943743427970557,
  

In [51]:
params.update(evals_result)

In [52]:
params

{'max_leaves': 64,
 'objective': 'binary',
 'metric': 'auc',
 'verbose': -1,
 'train': OrderedDict([('auc',
               [0.8147678621822373,
                0.8768427888516215,
                0.8918853152963816,
                0.9089427978438376,
                0.9173831718060076,
                0.9303648740713982,
                0.9401516186118661,
                0.9460012236669947,
                0.9595053555514496,
                0.9647809617939652,
                0.9713892041059444,
                0.9735026340455669,
                0.9770898026973021,
                0.9804007263222898,
                0.9827919561387165,
                0.9839390467680758,
                0.9847330568637171,
                0.985643810284452,
                0.9863162829339917])]),
 'val': OrderedDict([('auc',
               [0.42549947423764456,
                0.5032071503680337,
                0.46514195583596213,
                0.47534174553102,
                0.53133543638275

In [48]:
evals_result

{'train': OrderedDict([('auc',
               [0.8147678621822373,
                0.8768427888516215,
                0.8918853152963816,
                0.9089427978438376,
                0.9173831718060076,
                0.9303648740713982,
                0.9401516186118661,
                0.9460012236669947,
                0.9595053555514496,
                0.9647809617939652,
                0.9713892041059444,
                0.9735026340455669,
                0.9770898026973021,
                0.9804007263222898,
                0.9827919561387165,
                0.9839390467680758,
                0.9847330568637171,
                0.985643810284452,
                0.9863162829339917])]),
 'val': OrderedDict([('auc',
               [0.42549947423764456,
                0.5032071503680337,
                0.46514195583596213,
                0.47534174553102,
                0.531335436382755,
                0.5105152471083071,
                0.4943743427970557,
  

In [39]:
metric = list(evals_result['train'].keys())[-1]
metric_train = evals_result['train'][metric]
metric_val = evals_result['val'][metric]
metric_gap = [x - y for x, y in zip(metric_train, metric_val)]

evals_result['metric_val_last'] = metric_val[-1]
evals_result['metric_val_max'] = max(metric_val)
evals_result['metric_val_max_index'] = metric_val.index(metric_val_max)
evals_result['gap_at_val_max'] = metric_gap[metric_val_max_index]



overfit_max_metric = [
    metric_val_last, metric_val_max, metric_val_max_index, gap_at_val_max]
grid_search_i = list(params.values()) + overfit_max_metric

# append loop result
grid_search.append(grid_search_i)

Unnamed: 0,0,gap_at_val_max,learning_rate,max_leaves,metric,metric_val_last,metric_val_max,metric_val_max_index,objective,verbose
0,,0.408621,0.1,64.0,auc,0.500946,0.58633,89.0,binary,-1.0
1,,0.361739,0.1,256.0,auc,0.461935,0.617666,4.0,binary,-1.0
2,,0.331715,0.1,1024.0,auc,0.45878,0.650999,5.0,binary,-1.0
3,,0.364791,0.2,64.0,auc,0.505783,0.6204,11.0,binary,-1.0
4,,0.463145,0.2,256.0,auc,0.407571,0.532492,59.0,binary,-1.0
5,,0.481728,0.2,1024.0,auc,0.387592,0.490116,2.0,binary,-1.0
0,64,,,,,,,,,
1,binary,,,,,,,,,
2,auc,,,,,,,,,
3,-1,,,,,,,,,


In [23]:
evals_result.keys()

dict_keys(['train', 'val'])