In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import lightgbm as lgb
from sklearn import metrics
import gc

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Root of the Kaggle input mount. Not referenced again in the visible cells;
# dataset paths below are spelled out in full.
root_path = '/kaggle/input/'
import os
# Optional: uncomment to list every file available under root_path.
#for dirname, _, filenames in os.walk(root_path):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))
# Experiment selector tested by the cells below. Values compared against in
# this notebook: 'train', 'baseline_refit', 'another_refit', 'RBM', 'SIR',
# 'raw_test'. Any value other than 'train' makes the model-loading cell read a
# saved booster from disk. The current value matches none of the experiment
# names, so none of the mode-guarded experiment loops execute.
mode = 'another_refit_test'
# Any results you write to the current directory are saved as output.

In [2]:
import ctypes
from lightgbm import libpath as lbp
find_lib_path = lbp.find_lib_path

def _load_lib():
    """Open the LightGBM shared library through ctypes.

    Returns
    -------
    lib : ctypes.CDLL or None
        Handle to the first library reported by ``find_lib_path()``, with
        ``LGBM_GetLastError`` configured to return a C string; ``None`` when
        no library path was found.
    """
    candidates = find_lib_path()
    if len(candidates) < 1:
        return None
    handle = ctypes.cdll.LoadLibrary(candidates[0])
    # Without an explicit restype ctypes would treat the returned pointer as an int.
    handle.LGBM_GetLastError.restype = ctypes.c_char_p
    return handle

# Module-level handle to the LightGBM shared library, loaded once when this
# cell runs. It is None when find_lib_path() reported no library.
_LIB = _load_lib()

def _safe_call(ret):
    """Check the return value from a LightGBM C API call.

    Parameters
    ----------
    ret : int
        The return value from C API calls. Any non-zero value is treated as
        a failure.

    Raises
    ------
    lightgbm.basic.LightGBMError
        If ``ret`` is non-zero. The message is the text reported by
        ``LGBM_GetLastError``.
    """
    if ret != 0:
        # The notebook never imported LightGBMError or decode_string, so the
        # original error path died with NameError. Use the exception class from
        # the already-imported lightgbm package and decode the C string directly.
        raw_message = _LIB.LGBM_GetLastError()
        # c_char_p results arrive as bytes (or None for a NULL pointer).
        message = (raw_message or b'').decode('utf-8')
        raise lgb.basic.LightGBMError(message)

def set_leaf_output(booster_obj, tree_id, leaf_id, value):
    """Set the output of a leaf in a booster, in place.

    Parameters
    ----------
    booster_obj : object
        Booster whose native handle (``booster_obj.handle``) is passed to the
        C API; the underlying model is modified directly.
    tree_id : int
        The index of the tree.
    leaf_id : int
        The index of the leaf in the tree.
    value : float
        The new output to store in that leaf.

    Returns
    -------
    result : float
        The value that was written to the leaf.

    Raises
    ------
    Whatever ``_safe_call`` raises when the C call reports a failure.
    """
    # LGBM_BoosterSetLeafValue takes exactly (handle, tree_idx, leaf_idx, val).
    # The previous version also passed a byref() output pointer, which the C
    # function does not accept and never filled in, so it always returned 0.0.
    _safe_call(_LIB.LGBM_BoosterSetLeafValue(
        booster_obj.handle,
        ctypes.c_int(tree_id),
        ctypes.c_int(leaf_id),
        ctypes.c_double(value)))
    return float(value)

In [3]:
# Row counts of the raw data files.
raw_tr_rows_cnt = 45840617  # rows in training set
raw_ts_rows_cnt = 6042135   # rows in testing set

# Column names: I1..I13 and C1..C26 (the C columns are passed to LightGBM as
# categorical features further down).
I_col_names = list(map('I{}'.format, range(1, 14)))
C_col_names = list(map('C{}'.format, range(1, 27)))

In [4]:
# Consecutive chunk files train_50_60.csv ... train_90_100.csv; the experiment
# loops below pair each file with the one that follows it.
data_list = [
    '/kaggle/input/dataset-generator-10/train_{}_{}.csv'.format(lower, lower + 10)
    for lower in (50, 60, 70, 80, 90)
]

In [5]:
# Parameters handed to lgb.train by every training cell.
param = dict(
    num_leaves=128,
    num_trees=64,
    objective='binary',
    metrics=['auc', 'binary_logloss'],
    learning_rate=0.3,
)
# Model file stem built from the key hyper-parameters, e.g. "<value><key>_<value><key>_...".
name = '_'.join(
    '{}{}'.format(param[key], key)
    for key in ('num_leaves', 'num_trees', 'learning_rate')
)

In [6]:
# Train the base booster from scratch; only runs when mode == 'train'.
if mode == 'train':
    # Training Dataset is built directly from the CSV path. The C* columns are
    # declared categorical and the raw data is kept (free_raw_data=False).
    lgb_train = lgb.Dataset('/kaggle/input/dataset-generator/train_0_50.csv',
                            feature_name=I_col_names + C_col_names, 
                            categorical_feature=C_col_names,
                            free_raw_data=False)
    # One validation Dataset per chunk file after the first entry of data_list,
    # each tied to the training Dataset through `reference`.
    lgb_test = [lgb.Dataset(tp,
                            reference=lgb_train) for tp in data_list[1:]]


    gbm=lgb.train(param, lgb_train, valid_sets = lgb_test)
    # this param performs best.
    # Persist the model as text under ./<name>.lgbm so later runs can reload it.
    gbm_str = gbm.model_to_string()
    with open('./{}.lgbm'.format(name),'wt') as f:
        f.write(gbm_str)

    # Release memory: drop the training Dataset, then pop the validation
    # Datasets one at a time (always index 0, since the list shrinks on each
    # pass), removing each one's `data` attribute first.
    del lgb_train
    for _ in range(len(lgb_test)):
        del lgb_test[0].data
        del lgb_test[0]
    gc.collect()
    # Ask the booster to drop its own dataset references, then collect again.
    gbm.free_dataset()
    gc.collect()

In [7]:
if mode != 'train':
    # Nothing was trained in this run, so restore the booster from the saved
    # model file whose stem is `name`.
    gbm = lgb.Booster(model_file="".join(["/kaggle/input/train-lgbm1/", name, ".lgbm"]))
    """
    Or you could simply achieve this by
    lgbm_file = "/kaggle/input/train-lgbm/128num_leaves_32num_trees_0.9learning_rate.lgbm"
    lgbm_str = ""
    with open(lgbm_file,'r') as f:
        lgbm_str = f.read()
    gbm = lgb.Booster(model_str=lgbm_str)
    """

In [8]:
# This code block is for some API test
# gbm_json = gbm.dump_model()
# print(len(gbm_json['tree_info']))

In [9]:
#### Baseline: update the booster chunk by chunk with Booster.refit
temp_gbm = gbm
if mode == "baseline_refit":
    for ind, (train_data, eval_data) in enumerate(zip(data_list[:-1], data_list[1:])):
        # Report only the file names of the two consecutive chunks used this round.
        print('use', train_data.rsplit('/', 1)[-1], 'for training,',
              eval_data.rsplit('/', 1)[-1], 'for evaluation')

        # Refit on the training chunk; each round continues from the booster
        # produced by the previous round.
        nx = pd.read_csv(train_data, names=['label'] + I_col_names + C_col_names)
        temp_gbm = temp_gbm.refit(nx.drop(columns=['label']), nx['label'], decay_rate=0.9)
        del nx
        gc.collect()

        # Score the refitted booster on the chunk that follows.
        gt = pd.read_csv(eval_data, names=['label'] + I_col_names + C_col_names)
        pred = temp_gbm.predict(gt.drop(columns=['label']))
        for score_fn, tag in ((metrics.log_loss, 'log_loss'),
                              (metrics.roc_auc_score, 'auc')):
            print(score_fn(gt['label'], pred), tag)
        del gt
        gc.collect()

In [10]:
# Ensemble experiment: keep the base booster, train one extra booster per
# chunk, and score a weighted average of all of them on the following chunk.
gbm_list = [gbm]
if mode == "another_refit":
    for ind, (train_data, eval_data) in enumerate(zip(data_list[:-1],data_list[1:])):
        print('use',train_data.split('/')[-1],'for training,', eval_data.split('/')[-1], 'for evaluation')

        # Train a fresh booster on this chunk, validating on the next one.
        new_train = lgb.Dataset(train_data, feature_name=I_col_names + C_col_names, categorical_feature=C_col_names, free_raw_data=False)
        new_val = lgb.Dataset(eval_data, reference=new_train)
        gbm_new = lgb.train(param, new_train, valid_sets=new_val)
        gbm_list.append(gbm_new)
        gc.collect()

        gt = pd.read_csv(eval_data,
                        names=['label']+I_col_names + C_col_names)
        # Build the feature frame once instead of re-dropping the label for every model.
        features = gt.drop(['label'],axis = 1)

        # Weighted sum: 0.5 for the base booster, 0.1 for every per-chunk booster.
        pred = 0.5 * gbm_list[0].predict(features)
        for gbm_item in gbm_list[1:]:
            pred = pred + 0.1 * gbm_item.predict(features)

        # Normalise by the total weight actually applied. gbm_new has already
        # been appended, so there are len(gbm_list) - 1 (== ind + 1) extra
        # boosters. The previous divisor 0.5 + 0.1 * ind was one booster short,
        # which let the blended "probabilities" exceed 1.
        pred = pred / (0.5 + 0.1 * (len(gbm_list) - 1))

        print(metrics.log_loss(gt['label'], pred),'log_loss')
        print(metrics.roc_auc_score(gt['label'], pred),'auc')
        del gt, features
        gc.collect()

In [11]:
# "RBM" ensemble experiment: like the plain ensemble, but boosters trained in
# earlier rounds are weighted alpha * 0.1 while the newest one is weighted 0.1.
gbm_list = [gbm]
alpha = 2.5
if mode == "RBM":
    for ind, (train_data, eval_data) in enumerate(zip(data_list[:-1],data_list[1:])):
        print('use',train_data.split('/')[-1],'for training,', eval_data.split('/')[-1], 'for evaluation')

        # Train a fresh booster on this chunk, validating on the next one.
        new_train = lgb.Dataset(train_data, feature_name=I_col_names + C_col_names, categorical_feature=C_col_names, free_raw_data=False)
        new_val = lgb.Dataset(eval_data, reference=new_train)
        gbm_new = lgb.train(param, new_train, valid_sets=new_val)

        gt = pd.read_csv(eval_data,
                        names=['label']+I_col_names + C_col_names)
        # Build the feature frame once instead of re-dropping the label for every model.
        features = gt.drop(['label'],axis = 1)

        # Boosters from earlier rounds (gbm_new is not in the list yet).
        n_previous = len(gbm_list) - 1

        # Weighted sum: 0.5 base, alpha * 0.1 per earlier booster, 0.1 for the newest.
        pred = 0.5 * gbm_list[0].predict(features)
        for gbm_item in gbm_list[1:]:
            pred = pred + alpha * 0.1 * gbm_item.predict(features)
        pred = pred + 0.1 * gbm_new.predict(features)

        gbm_list.append(gbm_new)
        gc.collect()

        # Normalise by the total weight actually applied. The previous divisor
        # used (ind - 1) earlier boosters although ind of them were summed, so
        # the first round divided a total weight of 0.6 by 0.35.
        pred = pred / (0.5 + 0.1 * alpha * n_previous + 0.1)

        print(metrics.log_loss(gt['label'], pred),'log_loss')
        print(metrics.roc_auc_score(gt['label'], pred),'auc')
        del gt, features
        gc.collect()

use train_50_60.csv for training, train_60_70.csv for evaluation


New categorical_feature is ['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C2', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's auc: 0.741819	valid_0's binary_logloss: 0.534711
[2]	valid_0's auc: 0.750773	valid_0's binary_logloss: 0.515609
[3]	valid_0's auc: 0.757307	valid_0's binary_logloss: 0.503652
[4]	valid_0's auc: 0.761909	valid_0's binary_logloss: 0.495659
[5]	valid_0's auc: 0.766147	valid_0's binary_logloss: 0.489593
[6]	valid_0's auc: 0.76908	valid_0's binary_logloss: 0.485385
[7]	valid_0's auc: 0.771969	valid_0's binary_logloss: 0.481881
[8]	valid_0's auc: 0.774313	valid_0's binary_logloss: 0.479136
[9]	valid_0's auc: 0.777105	valid_0's binary_logloss: 0.476342
[10]	valid_0's auc: 0.779307	valid_0's binary_logloss: 0.474206
[11]	valid_0's auc: 0.780915	valid_0's binary_logloss: 0.47253
[12]	valid_0's auc: 0.783147	valid_0's binary_logloss: 0.470589
[13]	valid_0's auc: 0.784379	valid_0's binary_logloss: 0.469395
[14]	valid_0's auc: 0.785222	valid_0's binary_logloss: 0.468492
[15]	valid_0's auc: 0.786669	valid_0's binary_logloss: 0.467244
[16]	valid_0's auc: 0.787639	valid_0's binary_loglo

New categorical_feature is ['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C2', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's auc: 0.736112	valid_0's binary_logloss: 0.537492
[2]	valid_0's auc: 0.747742	valid_0's binary_logloss: 0.518303
[3]	valid_0's auc: 0.753784	valid_0's binary_logloss: 0.506506
[4]	valid_0's auc: 0.758455	valid_0's binary_logloss: 0.498555
[5]	valid_0's auc: 0.762201	valid_0's binary_logloss: 0.492777
[6]	valid_0's auc: 0.766365	valid_0's binary_logloss: 0.487938
[7]	valid_0's auc: 0.76883	valid_0's binary_logloss: 0.484706
[8]	valid_0's auc: 0.771468	valid_0's binary_logloss: 0.481753
[9]	valid_0's auc: 0.77339	valid_0's binary_logloss: 0.479543
[10]	valid_0's auc: 0.775071	valid_0's binary_logloss: 0.477716
[11]	valid_0's auc: 0.777695	valid_0's binary_logloss: 0.475406
[12]	valid_0's auc: 0.779141	valid_0's binary_logloss: 0.473969
[13]	valid_0's auc: 0.780811	valid_0's binary_logloss: 0.472579
[14]	valid_0's auc: 0.782006	valid_0's binary_logloss: 0.471429
[15]	valid_0's auc: 0.782836	valid_0's binary_logloss: 0.470603
[16]	valid_0's auc: 0.783476	valid_0's binary_loglo

New categorical_feature is ['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C2', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's auc: 0.731928	valid_0's binary_logloss: 0.531414
[2]	valid_0's auc: 0.744826	valid_0's binary_logloss: 0.512783
[3]	valid_0's auc: 0.750625	valid_0's binary_logloss: 0.501545
[4]	valid_0's auc: 0.755402	valid_0's binary_logloss: 0.493799
[5]	valid_0's auc: 0.758848	valid_0's binary_logloss: 0.488295
[6]	valid_0's auc: 0.76238	valid_0's binary_logloss: 0.483848
[7]	valid_0's auc: 0.766115	valid_0's binary_logloss: 0.48002
[8]	valid_0's auc: 0.768381	valid_0's binary_logloss: 0.477282
[9]	valid_0's auc: 0.771352	valid_0's binary_logloss: 0.47433
[10]	valid_0's auc: 0.773079	valid_0's binary_logloss: 0.472499
[11]	valid_0's auc: 0.776109	valid_0's binary_logloss: 0.469997
[12]	valid_0's auc: 0.777152	valid_0's binary_logloss: 0.468822
[13]	valid_0's auc: 0.778719	valid_0's binary_logloss: 0.467504
[14]	valid_0's auc: 0.779568	valid_0's binary_logloss: 0.46658
[15]	valid_0's auc: 0.781205	valid_0's binary_logloss: 0.465163
[16]	valid_0's auc: 0.781896	valid_0's binary_logloss

New categorical_feature is ['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C2', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's auc: 0.733739	valid_0's binary_logloss: 0.535278
[2]	valid_0's auc: 0.74665	valid_0's binary_logloss: 0.516193
[3]	valid_0's auc: 0.751573	valid_0's binary_logloss: 0.505209
[4]	valid_0's auc: 0.755737	valid_0's binary_logloss: 0.497775
[5]	valid_0's auc: 0.759991	valid_0's binary_logloss: 0.491808
[6]	valid_0's auc: 0.762858	valid_0's binary_logloss: 0.487618
[7]	valid_0's auc: 0.765807	valid_0's binary_logloss: 0.484133
[8]	valid_0's auc: 0.767729	valid_0's binary_logloss: 0.481716
[9]	valid_0's auc: 0.770738	valid_0's binary_logloss: 0.478853
[10]	valid_0's auc: 0.772903	valid_0's binary_logloss: 0.47679
[11]	valid_0's auc: 0.775703	valid_0's binary_logloss: 0.474357
[12]	valid_0's auc: 0.776846	valid_0's binary_logloss: 0.47316
[13]	valid_0's auc: 0.778794	valid_0's binary_logloss: 0.471506
[14]	valid_0's auc: 0.779691	valid_0's binary_logloss: 0.470593
[15]	valid_0's auc: 0.780805	valid_0's binary_logloss: 0.469626
[16]	valid_0's auc: 0.781805	valid_0's binary_loglos

In [12]:
##### refit network test
# "SIR" experiment: for each pair of consecutive chunks, refit the booster on
# the training chunk, then overwrite every leaf of `gbm` with a count-weighted
# average of its current output and the refitted output, and score the result
# on the following chunk. `gbm` is modified in place and carried across rounds.
if mode == 'SIR':
    for ind, (train_data, eval_data) in enumerate(zip(data_list[:-1],data_list[1:])):
        print('use',train_data.split('/')[-1],'for training,', eval_data.split('/')[-1], 'for evaluation')
        nx = pd.read_csv(train_data,
                        names=['label']+I_col_names + C_col_names,
                        )
        # NOTE(review): refit_threshold is not defined anywhere in this notebook;
        # presumably it comes from a customised LightGBM build — confirm.
        gbm.refit_threshold(nx.drop(['label'],axis=1), nx['label'],decay_rate= 0)
        # Refit with decay_rate=0 yields a separate booster for this chunk.
        refit_new_booster = gbm.refit(nx.drop(['label'],axis=1), nx['label'],decay_rate= 0)
        del nx
        # JSON dump is used only for per-tree metadata (tree count, num_leaves);
        # the text dumps are parsed below for the per-leaf sample counts.
        gbm_json = gbm.dump_model()
        gbm_string = gbm.model_to_string()
        # here rnb stands for refit_new_booster
        rnb_string = refit_new_booster.model_to_string()
        tree_struct = gbm_json['tree_info']
        tree_number = len(tree_struct)
        # One row per tree, one column per leaf slot.
        # NOTE(review): the width is param['num_leaves'], which assumes no tree in
        # the loaded model has more leaves than that — confirm.
        gbm_leaves_cnt = np.zeros((tree_number, param['num_leaves']), dtype=np.uint32)
        refit_leaves_cnt = np.zeros((tree_number, param['num_leaves']), dtype=np.uint32)
        # These codes here are very dirty
        # Pass i == 0 parses the original booster's text, pass i == 1 the refitted one.
        for i in range(0,2):
            tree_index = -1
            booster_string = rnb_string if i > 0 else gbm_string
            for bis in booster_string.split('\n'):
                # bis: booster_info_string
                if bis[0:len("Tree")]=="Tree":
                    # record which tree is being dealt with
                    tree_index += 1
                if bis[0:len("leaf_count")]=="leaf_count":
                    # This code block could be "simplified" with reg expressions
                    # Take the text after "=" and split it on spaces: one count per leaf.
                    leaf_cnt = bis.split("=")
                    leaf_cnt = leaf_cnt[1]
                    leaf_cnt = leaf_cnt.split(" ")
                    for leaf_index, element in enumerate(leaf_cnt):
                        if i:
                            refit_leaves_cnt[tree_index, leaf_index] = np.uint32(element)
                        else:
                            gbm_leaves_cnt[tree_index, leaf_index] = np.uint32(element)

        # We still need a leafcnt matrix of the original data set
        # Since the newly obtained refit booster has the same structure as the original booster
        for treeId in range(0, tree_number):
            for leafId in range(0, tree_struct[treeId]['num_leaves']):
                gbm_leaf_cnt = gbm_leaves_cnt[treeId, leafId]
                refit_leaf_cnt = refit_leaves_cnt[treeId, leafId]
                gbm_leaf_output = gbm.get_leaf_output(treeId, leafId)
                refit_leaf_output = refit_new_booster.get_leaf_output(treeId, leafId)
                # Blend the two outputs in proportion to each booster's leaf count.
                # NOTE(review): if both counts are 0 this divides by zero and would
                # write NaN into the leaf — confirm that cannot happen.
                new_output = gbm_leaf_cnt / (gbm_leaf_cnt + refit_leaf_cnt) * gbm_leaf_output + refit_leaf_cnt / (gbm_leaf_cnt + refit_leaf_cnt) * refit_leaf_output
                # Write the blended value straight into gbm through the C API helper.
                set_leaf_output(gbm, treeId, leafId, new_output)
    #    del nx
        gc.collect()
        # Evaluate the updated booster on the next chunk.
        gt = pd.read_csv(eval_data,
                        names=['label']+I_col_names + C_col_names)
        pred = gbm.predict(gt.drop(['label'],axis = 1))
        print(metrics.log_loss(gt['label'], pred),'log_loss')
        print(metrics.roc_auc_score(gt['label'], pred),'auc')
        del gt
        gc.collect()

In [13]:
# Reference run: score the unmodified booster on every evaluation chunk.
if mode == "raw_test":
    for ind, (train_data, eval_data) in enumerate(zip(data_list[:-1], data_list[1:])):
        print('use', eval_data.rsplit('/', 1)[-1], 'for evaluation')
        gc.collect()

        gt = pd.read_csv(eval_data, names=['label'] + I_col_names + C_col_names)
        pred = gbm.predict(gt.drop(columns=['label']))
        for score_fn, tag in ((metrics.log_loss, 'log_loss'),
                              (metrics.roc_auc_score, 'auc')):
            print(score_fn(gt['label'], pred), tag)

        del gt
        gc.collect()