In [1]:
import io
from contextlib import redirect_stdout
from copy import deepcopy
from dataclasses import dataclass, asdict
from hyperopt import fmin, tpe, hp, space_eval
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [2]:
dir_path = '~/Desktop/final'
train_origin = pd.read_csv(dir_path + '/train_final.csv', engine='python')
test_origin = pd.read_csv(dir_path + '/test_final.csv', engine='python')

In [3]:
TARGET_ENCODING_MAP = {}

def get_kfold_target_encoding(data, col_prefix, label, n_splits=5):
    kfold = KFold(n_splits=n_splits)
    col_list = [col for col in data.columns if col.startswith(col_prefix)]
    col_map = {col:{'count':0, 'value':0, 'weight':0} for col in col_list}
    for other_id, current_id in kfold.split(data):
        current_df = data.loc[current_id]
        other_df = data.loc[other_id]
        for col_name in col_list:
            x_sum = len(other_df[other_df[col_name] == 1])
            y_sum = float(other_df[other_df[col_name] == 1][label].sum())
            x_count = len(current_df[current_df[col_name] == 1])
            col_map[col_name]['count'] += x_count
            col_map[col_name]['value'] += x_count * y_sum / x_sum
    
    for k in col_map:
        col_map[k]['weight'] = col_map[k]['value'] / col_map[k]['count']
    return col_map


def append_target_encoding(data, col_prefix, label, drop=True):
    def get_feature_value(x, col_list, col_map):
        val_list = [x[i] for i in col_list]
        return col_map[col_list[val_list.index(1)]]['weight']
    
    if col_prefix not in TARGET_ENCODING_MAP:
        col_map = get_kfold_target_encoding(data, col_prefix, label)
        TARGET_ENCODING_MAP[col_prefix] = col_map
    else:
        col_map = TARGET_ENCODING_MAP[col_prefix]
    
    col_list = [col for col in data.columns if col_prefix in col] 
    new_col = 'new_' + col_prefix + '_target_encoding'
    
    data[new_col] = data.apply(lambda x: get_feature_value(x, col_list, col_map), axis=1)
    if drop:
        data.drop(columns=col_list, inplace=True)
        

def process_data(data_origin, label, flag):
    data = data_origin.copy()
    if flag:
        data['r01'] = data['discrete_emp_length_12_one_hot']
        data['r02'] = data['discrete_emp_length_5_one_hot']
        data['r03'] = data['discrete_emp_length_7_one_hot']
        data['r04'] = data['discrete_addr_state_37_one_hot'] 
        data['r05'] = data['discrete_addr_state_43_one_hot']
        data['r06'] = data['discrete_addr_state_8_one_hot']
        data['r07'] = data['discrete_addr_state_25_one_hot']
        data['r08'] = data['discrete_addr_state_4_one_hot']
        data['r09'] = data['discrete_addr_state_15_one_hot']
        data['r10'] = data['discrete_addr_state_23_one_hot']
        data['r11'] = data['discrete_addr_state_12_one_hot']
        data['r12'] = data['discrete_addr_state_6_one_hot']
        data['r13'] = data['discrete_grade_2_one_hot']
        data['r14'] = data['discrete_sub_grade_13_one_hot']
        data['r15'] = data['discrete_sub_grade_4_one_hot']
        data['r16'] = data['discrete_sub_grade_6_one_hot']
        data['r17'] = data['discrete_purpose_6_one_hot']

        append_target_encoding(data, 'discrete_grade', label, True)
        append_target_encoding(data, 'discrete_sub_grade', label, True)
        append_target_encoding(data, 'discrete_purpose', label, True)
        append_target_encoding(data, 'discrete_emp_length', label, True)
        append_target_encoding(data, 'discrete_addr_state', label, True)
    
#     data['d1'] = data['continuous_installment'] / (data['continuous_annual_inc']/12)
#     data['d2'] = data['d1'] * data['continuous_dti']
#     data['d3'] = data['d1'] * data['continuous_int_rate']
#     data['d4'] = data['continuous_loan_amnt'] / data['continuous_funded_amnt']
#     data['d5'] = data['continuous_annual_inc'] / data['continuous_annual_inc_joint']
#     data['d6'] = data['continuous_dti'] * data['continuous_dti_joint']
    
#     data['k1'] = data['continuous_mths_since_last_record'] / data['continuous_mths_since_last_delinq']
#     data['k2'] = data['k1'] * data['continuous_inq_last_6mths']
    
#     data['t1'] = data['continuous_fico_range_high'] - data['continuous_last_fico_range_high']
#     data['t2'] = data['continuous_fico_range_high'] + data['continuous_last_fico_range_high']
    
#     data['w1'] = data['new_discrete_purpose_target_encoding'] * data['new_discrete_addr_state_target_encoding']
#     data['w2'] = data['new_discrete_emp_length_target_encoding'] * data['continuous_pub_rec']
    
#     data.drop(columns = ['discrete_policy_code_1_one_hot',
#                          'discrete_pymnt_plan_1_one_hot',
#                         ], inplace=True)
    
#    return data.sample(frac=1).reset_index().drop(columns='index')
    return data


In [4]:
@dataclass
class LGBOpt:
    num_thread: any = hp.choice('num_thread', [6])    # cpu_count
    num_leaves: any = hp.choice('num_leaves', [4, 8, 16, 20, 24, 32, 40, 48, 56, 64])
    metric: any = hp.choice('metric', ['binary'])
    num_round: any = hp.choice('num_round', [2000])
    objective: any = hp.choice('objective', ['binary'])
    learning_rate: any = hp.uniform('learning_rate', 0.01, 0.1)
    feature_fraction: any = hp.uniform('feature_fraction', 0.5, 1.0)
    bagging_fraction: any = hp.uniform('bagging_fraction', 0.8, 1.0)
    device_type: any = hp.choice('device_type',['cpu'])  # hp.choice('device_tpye', ['gpu']) 
    boosting: any = hp.choice('boosting', ['gbdt', 'dart', 'goss'])
    extra_trees: any = hp.choice('extra_tress', [False, True])
    drop_rate: any = hp.uniform('drop_rate', 0, 0.2)
    uniform_drop: any = hp.choice('uniform_drop', [True, False])
    lambda_l1: any = hp.uniform('lambda_l1', 0, 10)  # TODO: Check range
    lambda_l2: any = hp.uniform('lambda_l2', 0, 10)  # TODO: Check range
    min_gain_to_split: any = hp.uniform('min_gain_to_split', 0, 1)  # TODO: Check range
    min_data_in_bin: any = hp.choice('min_data_in_bin', [3, 5, 7, 10, 15, 20, 25, 30, 40, 50])
    #max_depth: any = hp.choice('max_depth', [3, 4, 5, 6, 7, 8])

    @staticmethod
    def get_common_params():
        return {
            'num_thread': 4, 
            'num_leaves': 12, 
            'metric': 'binary', 
            'objective': 'binary',
            'num_round': 1000, 
            'learning_rate': 0.01, 
            'feature_fraction': 0.8, 
            'bagging_fraction': 0.8,
        }
    

In [5]:
class FitterBase(object):
    def __init__(self, label, metric, max_eval=100, opt=None):
        self.label = label
        self.metric = metric
        self.opt_params = dict()
        self.max_eval = max_eval
        self.opt = opt

    def get_loss(self, y, y_pred):
        if self.metric == 'error':
            return 1 - accuracy_score(y, y_pred)
        elif self.metric == 'precision':
            return 1 - precision_score(y, y_pred)
        elif self.metric == 'recall':
            return 1 - recall_score(y, y_pred)
        elif self.metric == 'macro_f1':
            return 1 - f1_score(y, y_pred, average='macro')
        elif self.metric == 'micro_f1':
            return 1 - f1_score(y, y_pred, average='micro')
        elif self.metric == 'auc':  # TODO: Add a warning checking if y_predict is all [0, 1], it should be probability
            return 1 - roc_auc_score(y, y_pred)
        else:
            raise Exception("Not implemented yet.")

In [6]:
class LGBFitter(FitterBase):
    
    def __init__(self, label='label', metric='error', opt: LGBOpt = None, max_eval=100):
        super(LGBFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LGBOpt()
        self.best_round = None
        self.clf = None

        
    def train(self, train_df, eval_df, params=None, use_best_eval=True):
        # 训练前清空 self.best_round
        self.best_round = None
        dtrain = lgb.Dataset(train_df.drop(columns=[self.label]), train_df[self.label])
        deval = lgb.Dataset(eval_df.drop(columns=[self.label]), eval_df[self.label])
        evallist = [dtrain, deval]
        # 若 params 参数为空， 则使用 opt 搜索的参数
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)

        num_round = use_params.pop('num_round')
        # 默认使用 self.best_round 为最小错误率的轮数
        if use_best_eval:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            min_error = np.inf
            min_index = 0
            for idx in range(len(output) - 1):
                if len(output[idx].split("\t")) == 3:
                    temp = float(output[idx].split("\t")[2].split(":")[1])
                    if min_error > temp:
                        min_error = temp
                        min_index = int(output[idx].split("\t")[0][1:-1])
            print("The minimum is attained in round %d" % (min_index + 1))
            self.best_round = min_index + 1
            return output
        else:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            self.best_round = num_round
            return output

        
    # 单模型搜索 self.opt_params
    def search(self, train_df, eval_df, use_best_eval=True):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params, use_best_eval)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round) > 0.5).astype(int)  
            return self.get_loss(eval_df[self.label], y_pred)
        
        argmin_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)
        self.opt_params = space_eval(asdict(self.opt), argmin_params)

        
    # k-fold模型搜索 self.opt_params
    def search_k_fold(self, k_fold, data, use_best_eval=True):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.loc[train_id]
                eval_df = data.loc[eval_id]
                self.train(train_df, eval_df, params, use_best_eval)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
                else:
                    y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round) > 0.5).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        argmin_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)
        self.opt_params = space_eval(asdict(self.opt), argmin_params)
        
        
    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True, use_best_eval=True):
        acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        test_pred = np.empty(test_data.shape[0])
        test_pred_list = list()
        models = list()
        
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data

        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.loc[train_id]
            eval_df = train_data.loc[eval_id]
            self.train(train_df, eval_df, params, use_best_eval)
            models.append(deepcopy(self.clf))
            train_pred[eval_id] = self.clf.predict(eval_df.drop(columns=self.label), num_iteration=self.best_round)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round) > 0.5).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            new_test_pred = self.clf.predict(dtest, num_iteration=self.best_round)
            test_pred += new_test_pred
            test_pred_list.append(new_test_pred)
        
        test_pred /= k_fold.n_splits
        
        if self.metric != 'auc':
            train_pred = (train_pred > 0.5).astype(int)
            test_pred = (test_pred > 0.5).astype(int)
        test_acc = self.get_loss(test_data[self.label], test_pred)
        
        return {
            'models'     : models,
            'train_pred' : train_pred,
            'test_pred'  : test_pred,
            'acc_result' : [round(i, 5) for i in acc_result],
            'acc_result_mean' : round(np.mean([round(i, 5) for i in acc_result]), 5),
            'test_pred_list'    : test_pred_list,
            'test_acc'   : round(test_acc, 5),
        }

In [7]:
fitter = LGBFitter(label='loan_status')
kfold = KFold(n_splits=5)

#### 调参

In [11]:
#fitter.search_k_fold(kfold, train, use_best_eval=True)

In [12]:
#fitter.opt_params

#### 第1次测试

In [8]:
p1 = {'bagging_fraction': 0.9379122665132016,
 'boosting': 'goss',
 'device_type': 'cpu',
 'drop_rate': 0.0003757191912043406,
 'extra_trees': True,
 'feature_fraction': 0.7837989424839368,
 'lambda_l1': 2.4791237795383174,
 'lambda_l2': 5.423097004780251,
 'learning_rate': 0.027638820456068754,
 'metric': 'binary',
 'min_data_in_bin': 20,
 'min_gain_to_split': 0.9418873172596369,
 'num_leaves': 64,
 'num_round': 2000,
 'num_thread': 6,
 'objective': 'binary',
 'uniform_drop': False}

train = process_data(train_origin, 'loan_status', False)
test = process_data(test_origin, 'loan_status', False)
data = pd.concat([train, test], axis=0)
result_map = fitter.train_k_fold(kfold, train, test, params = p1)
result_map['acc_result'], result_map['acc_result_mean'], result_map['test_acc']

The minimum is attained in round 487
Finished loading model, total used 2000 iterations
The minimum is attained in round 322
Finished loading model, total used 2000 iterations
The minimum is attained in round 444
Finished loading model, total used 2000 iterations
The minimum is attained in round 495
Finished loading model, total used 2000 iterations
The minimum is attained in round 625
Finished loading model, total used 2000 iterations


([0.0719, 0.0814, 0.0829, 0.0822, 0.0776], 0.0792, 0.08176)

#### 第2次测试

In [9]:
p2 = {'bagging_fraction': 0.8820761355932795,
 'boosting': 'gbdt',
 'device_type': 'cpu',
 'drop_rate': 0.15221361387442056,
 'extra_trees': True,
 'feature_fraction': 0.7966085155023439,
 'lambda_l1': 9.05999616675501,
 'lambda_l2': 2.514529861096307,
 'learning_rate': 0.09924903847628047,
 'metric': 'binary',
 'min_data_in_bin': 5,
 'min_gain_to_split': 0.06684328574175302,
 'num_leaves': 8,
 'num_round': 2000,
 'num_thread': 6,
 'objective': 'binary',
 'uniform_drop': False}

train = process_data(train_origin, 'loan_status', True)
test = process_data(test_origin, 'loan_status', True)
data = pd.concat([train, test], axis=0)
result_map = fitter.train_k_fold(kfold, train, test, params = p2)
result_map['acc_result'], result_map['acc_result_mean'], result_map['test_acc']

The minimum is attained in round 331
Finished loading model, total used 613 iterations
The minimum is attained in round 710
Finished loading model, total used 626 iterations
The minimum is attained in round 1876
Finished loading model, total used 563 iterations
The minimum is attained in round 1977
Finished loading model, total used 620 iterations
The minimum is attained in round 353
Finished loading model, total used 629 iterations


([0.0706, 0.0803, 0.0827, 0.0828, 0.0778], 0.07884, 0.08166)

#### 第3次测试

In [10]:
p3 =  {'bagging_fraction': 0.8470798531544796,
 'boosting': 'goss',
 'device_type': 'cpu',
 'drop_rate': 0.04362493523196063,
 'extra_trees': True,
 'feature_fraction': 0.8416459394914348,
 'lambda_l1': 5.246746455101715,
 'lambda_l2': 5.4885609875994685,
 'learning_rate': 0.0361569056025585,
 'metric': 'binary',
 'min_data_in_bin': 30,
 'min_gain_to_split': 0.265040733262803,
 'num_leaves': 24,
 'num_round': 2000,
 'num_thread': 6,
 'objective': 'binary',
 'uniform_drop': True}

train = process_data(train_origin, 'loan_status', True)
test = process_data(test_origin, 'loan_status', True)
data = pd.concat([train, test], axis=0)
result_map = fitter.train_k_fold(kfold, train, test, params = p3)
result_map['acc_result'], result_map['acc_result_mean'], result_map['test_acc']

The minimum is attained in round 687
Finished loading model, total used 2000 iterations
The minimum is attained in round 649
Finished loading model, total used 1999 iterations
The minimum is attained in round 487
Finished loading model, total used 1999 iterations
The minimum is attained in round 1612
Finished loading model, total used 2000 iterations
The minimum is attained in round 407
Finished loading model, total used 2000 iterations


([0.0709, 0.0791, 0.0829, 0.084, 0.0789], 0.07916, 0.08148)

#### 观察特征重要性

In [None]:
# pd.DataFrame({
#         'column': train.drop(columns='loan_status').columns,
#         'importance': result_map['models'][0].feature_importance(),
# }).sort_values(by='importance', ascending=False).head(50)


调参方式： 统一采用 K-fold 的 hyperopt 调参， 使用”分类错误率“衡量结果

第一次测试： （使用原始数据，无任何衍生变量)

    训练数据结果： 
    Kfold 错误率：[0.0719, 0.0814, 0.0829, 0.0822, 0.0776]  错误率均值：0.07920
    测试集效果：  0.08176
    


第二次测试：  构造衍生变量（kfold target encoding）

    训练数据结果： 
    Kfold 错误率：[0.0706, 0.0803, 0.0827, 0.0828, 0.0778]   错误率均值：0.07884
    测试集效果：  0.08166


第三次测试：  保持衍生变量，调参减少过拟合

    训练数据结果： 
    Kfold 错误率：[0.0709, 0.0791, 0.0829, 0.084, 0.0789]   错误率均值：0.07916
    测试集效果：  0.08148




参数记录<br>
第一次测试参数<br>

`{'bagging_fraction': 0.9379122665132016,
 'boosting': 'goss',
 'device_type': 'cpu',
 'drop_rate': 0.0003757191912043406,
 'extra_trees': True,
 'feature_fraction': 0.7837989424839368,
 'lambda_l1': 2.4791237795383174,
 'lambda_l2': 5.423097004780251,
 'learning_rate': 0.027638820456068754,
 'metric': 'binary',
 'min_data_in_bin': 20,
 'min_gain_to_split': 0.9418873172596369,
 'num_leaves': 64,
 'num_round': 2000,
 'num_thread': 6,
 'objective': 'binary',
 'uniform_drop': False}`
 
第二次测试参数<br>
 `{'bagging_fraction': 0.8820761355932795,
 'boosting': 'gbdt',
 'device_type': 'cpu',
 'drop_rate': 0.15221361387442056,
 'extra_trees': True,
 'feature_fraction': 0.7966085155023439,
 'lambda_l1': 9.05999616675501,
 'lambda_l2': 2.514529861096307,
 'learning_rate': 0.09924903847628047,
 'metric': 'binary',
 'min_data_in_bin': 5,
 'min_gain_to_split': 0.06684328574175302,
 'num_leaves': 8,
 'num_round': 2000,
 'num_thread': 6,
 'objective': 'binary',
 'uniform_drop': False}`
 
第三次测试参数<br>
 `{'bagging_fraction': 0.8470798531544796,
 'boosting': 'goss',
 'device_type': 'cpu',
 'drop_rate': 0.04362493523196063,
 'extra_trees': True,
 'feature_fraction': 0.8416459394914348,
 'lambda_l1': 5.246746455101715,
 'lambda_l2': 5.4885609875994685,
 'learning_rate': 0.0361569056025585,
 'metric': 'binary',
 'min_data_in_bin': 30,
 'min_gain_to_split': 0.265040733262803,
 'num_leaves': 24,
 'num_round': 2000,
 'num_thread': 6,
 'objective': 'binary',
 'uniform_drop': True}`

