In [127]:
# -*- coding: utf-8 -*-
import os
import sys
import time
import pandas as pd
import numpy as np
import torch
from torch.optim import Adagrad
from sklearn.metrics import log_loss, roc_auc_score, mean_squared_error, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import xgboost as xgb
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor
import warnings

sys.path.append("/Users/wzq/Desktop/game")
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
from deepctr_torch.callbacks import EarlyStopping, ModelCheckpoint

# jupyter配置
from IPython.display import display
# Notebook display limits: maximum rows and columns rendered per cell output,
# and no truncation of long cell values.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100000)
pd.set_option('display.max_colwidth', None)

In [52]:
# Read the preprocessed dataset, then show its dimensions and column names.
data = pd.read_csv(filepath_or_buffer='./data/data_processed.csv')
print(data.shape)
display(data.columns)

(1000000, 54)


Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDate_year',
       'issueDate_month', 'issueDate_day', 'issueDate_weekday',
       'issueDate_is_weekend', 'earliesCreditLine_year',
       'earliesCreditLine_month'],
      dtype='object')

In [53]:
# Split the processed frame into model inputs (train / test) and the training label.

# Columns that are not used as model inputs: the two original date columns,
# the label itself, and policyCode.
# NOTE(review): 'id' is not excluded, so it is part of the feature set — confirm this is intended.
NON_FEATURE_COLS = ["issueDate", "isDefault", "earliesCreditLine", "policyCode"]

# The first TRAIN_ROWS rows are used for training and the remaining rows as the test set
# (presumably train and test were stacked in that order during preprocessing — TODO confirm).
TRAIN_ROWS = 800000

features = [f for f in data.columns if f not in NON_FEATURE_COLS]

# `data` is narrowed to the feature columns below, which removes 'isDefault'.
# Guarding on the column keeps this cell safe to re-run: on a second run the
# existing `target` is kept instead of failing with a KeyError on the missing label.
if "isDefault" in data.columns:
    target = data["isDefault"].iloc[:TRAIN_ROWS]
    data = data[features]

train_data = data.iloc[:TRAIN_ROWS, :]
test_data = data.iloc[TRAIN_ROWS:, :]

display(train_data.shape)
display(train_data.head(10))
display(test_data.shape)
display(test_data.head(10))
display(target.shape)
display(target.head(10))

(800000, 50)

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,title,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,issueDate_month,issueDate_day,issueDate_weekday,issueDate_is_weekend,earliesCreditLine_year,earliesCreditLine_month
0,0,0.873418,1.0,0.553349,0.531301,4,21,305,0.2,2,0.010001,2,1,138,32,0.01805,0.0,0.477273,0.475113,0.077778,0.0,0.0,0.008323,0.054802,0.15625,0,0,2,0.0,0.060606,0.031746,0.031746,0.063492,0.128571,0.060606,0.048193,0.09375,0.044444,0.077778,0.0,0.0,0.0,0.066667,0.636364,0.545455,0.0,0.166667,0.0,0.802817,0.636364
1,1,0.443038,1.0,0.51324,0.263246,3,16,176661,0.5,0,0.004182,2,0,157,18,0.02883,0.0,0.340909,0.339367,0.144444,0.0,0.0,0.005197,0.043595,0.1,1,0,1510,0.0,0.0,0.0,0.0,0.15873,0.0,0.0,0.0,0.0,0.0,0.144444,0.0,0.0,0.0,0.0,0.454545,0.636364,0.0,0.333333,0.0,0.816901,0.363636
2,2,0.291139,1.0,0.454829,0.167014,3,17,27721,0.8,0,0.006728,2,0,338,14,0.02377,0.0,0.227273,0.226244,0.122222,0.0,0.0,0.001586,0.058052,0.15625,0,0,1,0.0,0.0,0.047619,0.047619,0.0,0.0,0.159091,0.048193,0.039062,0.066667,0.122222,0.0,0.0,0.0,0.133333,0.727273,0.818182,0.0,0.5,0.0,0.873239,0.363636
3,3,0.265823,0.0,0.075935,0.192164,0,3,40375,1.0,1,0.010728,1,4,149,11,0.01821,0.0,0.272727,0.271493,0.1,0.0,0.0,0.003425,0.058949,0.1625,1,0,5,0.117647,0.121212,0.095238,0.095238,0.063492,0.228571,0.030303,0.084337,0.164062,0.133333,0.1,0.0,0.0,0.0,0.033333,0.727273,0.636364,0.0,0.833333,1.0,0.774648,0.363636
4,4,0.063291,0.0,0.299065,0.051169,2,11,51,0.0,1,0.002637,2,10,302,21,0.03316,0.0,0.295455,0.294118,0.133333,0.0,0.0,0.001013,0.035862,0.15625,0,0,12,0.019608,0.060606,0.111111,0.111111,0.031746,0.057143,0.068182,0.120482,0.117188,0.155556,0.133333,0.0,0.0,0.0,0.133333,0.818182,0.181818,0.0,0.166667,0.0,0.464789,0.636364
5,5,0.265823,0.0,0.104361,0.194333,0,4,44407,0.7,0,0.003546,2,9,513,21,0.01814,0.0,0.477273,0.475113,0.211111,0.0,0.0,0.001393,0.034854,0.3125,1,0,11,0.235294,0.030303,0.031746,0.031746,0.015873,0.014286,0.363636,0.024096,0.023438,0.044444,0.211111,0.0,0.0,0.0,0.0,0.909091,0.272727,0.0,0.833333,1.0,0.760563,0.545455
6,6,0.039241,0.0,0.092679,0.029352,0,3,146449,0.9,0,0.003182,0,0,518,14,0.01849,0.0,0.590909,0.588235,0.133333,0.0,0.0,0.001071,0.009526,0.13125,0,0,1,0.0,0.030303,0.047619,0.047619,0.111111,0.157143,0.022727,0.120482,0.140625,0.066667,0.133333,0.0,0.0,0.0,0.1,0.636364,0.818182,0.0,0.333333,0.0,0.873239,0.818182
7,7,0.278481,0.0,0.376558,0.226007,2,12,172154,0.1,1,0.002727,2,0,101,4,0.0336,0.0,0.181818,0.180995,0.088889,0.011628,0.083333,0.004827,0.066906,0.19375,1,0,1,0.0,0.121212,0.063492,0.063492,0.063492,0.228571,0.075758,0.060241,0.164062,0.088889,0.088889,0.0,0.0,0.0,0.066667,0.636364,0.0,0.0,0.333333,0.0,0.704225,1.0
8,8,0.291139,0.0,0.299065,0.229374,2,11,181,0.5,2,0.005455,1,0,793,13,0.02022,0.0,0.295455,0.294118,0.166667,0.0,0.0,0.009355,0.051552,0.11875,1,0,1,0.0,0.212121,0.206349,0.206349,0.111111,0.1,0.015152,0.156627,0.132812,0.244444,0.166667,0.0,0.0,0.0,0.2,0.818182,0.363636,0.0,1.0,1.0,0.704225,0.272727
9,9,0.151899,0.0,0.221184,0.116827,1,8,51,0.0,1,0.001391,2,0,60,11,0.02539,0.0,0.454545,0.452489,0.077778,0.0,0.0,0.001011,0.034293,0.38125,0,0,1,0.0,0.121212,0.079365,0.079365,0.063492,0.3,0.181818,0.072289,0.304688,0.111111,0.077778,0.0,0.0,0.0,0.266667,0.727273,0.909091,0.0,1.0,1.0,0.690141,0.0


(200000, 50)

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,title,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,issueDate_month,issueDate_day,issueDate_weekday,issueDate_is_weekend,earliesCreditLine_year,earliesCreditLine_month
800000,800000,0.341772,0.0,0.221184,0.261119,1,7,6368,1.0,0,0.007273,0,0,164,21,0.01156,0.025641,0.409091,0.40724,0.188889,0.0,0.0,0.00339,0.034405,0.16875,0,0,1,0.019608,0.121212,0.095238,0.095238,0.095238,0.114286,0.030303,0.180723,0.148438,0.133333,0.188889,0.0,0.0,0.025641,0.1,0.636364,0.545455,0.0,0.166667,0.0,0.422535,0.909091
800001,800001,0.493671,1.0,0.363707,0.269265,2,14,51569,1.0,0,0.004546,0,2,236,8,0.0224,0.051282,0.204545,0.20362,0.055556,0.0,0.0,0.00308,0.063432,0.075,0,0,6,0.039216,0.030303,0.047619,0.047619,0.015873,0.014286,0.022727,0.036145,0.070312,0.066667,0.055556,0.0,0.0,0.051282,0.066667,0.727273,0.545455,0.0,0.333333,0.0,0.802817,0.545455
800002,800002,0.291139,0.0,0.571651,0.253848,3,18,20756,0.2,1,0.005455,2,0,527,20,0.0345,0.0,0.386364,0.384615,0.133333,0.0,0.0,0.000334,0.019724,0.25625,1,0,1,0.0,0.030303,0.063492,0.063492,0.015873,0.014286,0.272727,0.060241,0.046875,0.088889,0.133333,0.0,0.0,0.0,0.233333,0.818182,0.818182,0.0,0.833333,1.0,0.873239,0.636364
800003,800003,0.43038,1.0,0.350467,0.232754,2,13,592,0.4,0,0.003364,1,4,249,11,0.01495,0.0,0.272727,0.271493,0.111111,0.011628,0.083333,0.003528,0.058613,0.1,0,0,5,0.0,0.060606,0.031746,0.031746,0.063492,0.1,0.015152,0.096386,0.109375,0.044444,0.111111,0.0,0.0,0.0,0.1,0.636364,0.909091,0.0,0.833333,1.0,0.816901,0.545455
800004,800004,0.873418,0.0,0.458723,0.72611,3,15,290776,0.0,1,0.007273,1,0,116,8,0.02597,0.0,0.272727,0.271493,0.211111,0.0,0.0,0.011429,0.039897,0.125,0,0,1,0.0,0.242424,0.174603,0.174603,0.142857,0.157143,0.022727,0.192771,0.140625,0.244444,0.211111,0.0,0.0,0.0,0.033333,0.909091,0.818182,0.0,1.0,1.0,0.788732,1.0
800005,800005,0.392405,0.0,0.000389,0.274966,0,0,227,1.0,0,0.008182,0,0,481,8,0.01628,0.0,0.681818,0.678733,0.188889,0.0,0.0,0.002137,0.01356,0.20625,0,0,1,0.0,0.090909,0.079365,0.079365,0.126984,0.2,0.045455,0.156627,0.195312,0.111111,0.188889,0.0,0.0,0.0,0.1,0.909091,0.363636,0.0,0.0,0.0,0.788732,0.545455
800006,800006,0.620253,1.0,0.376947,0.341252,2,13,12,1.0,1,0.011501,2,0,311,24,0.02394,0.0,0.431818,0.429864,0.155556,0.0,0.0,0.010832,0.045164,0.14375,0,0,1,0.0,0.151515,0.111111,0.111111,0.095238,0.085714,0.045455,0.120482,0.132812,0.155556,0.155556,0.0,0.0,0.0,0.033333,0.818182,0.909091,0.0,0.166667,0.0,0.760563,0.0
800007,800007,0.101266,0.0,0.079439,0.073856,0,3,3117,0.6,0,0.004546,0,4,82,15,0.01432,0.0,0.568182,0.565611,0.088889,0.0,0.0,0.00179,0.024767,0.09375,1,1,5,0.0,0.090909,0.047619,0.047619,0.063492,0.085714,0.060606,0.048193,0.0625,0.066667,0.088889,0.0,0.0,0.0,0.033333,0.909091,0.818182,0.0,1.0,1.0,0.887324,0.090909
800008,800008,0.291139,1.0,0.437695,0.165351,3,16,327,0.3,1,0.005273,1,0,139,0,0.02417,0.051282,0.25,0.248869,0.155556,0.0,0.0,0.000476,0.020397,0.16875,0,0,1,0.019608,0.121212,0.063492,0.063492,0.063492,0.071429,0.159091,0.060241,0.0625,0.088889,0.155556,0.0,0.0,0.0,0.133333,0.727273,0.818182,0.0,0.5,0.0,0.84507,0.363636
800009,800009,0.405063,0.0,0.092679,0.294279,0,3,7325,1.0,0,0.012728,0,4,77,26,0.01493,0.0,0.409091,0.40724,0.111111,0.0,0.0,0.005826,0.084725,0.1625,0,0,5,0.039216,0.060606,0.063492,0.063492,0.031746,0.1,0.075758,0.048193,0.101562,0.088889,0.111111,0.0,0.0,0.0,0.1,0.636364,0.636364,0.0,0.666667,0.0,0.676056,0.727273


(800000,)

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    1.0
9    0.0
Name: isDefault, dtype: float64

In [121]:
_CV_MODEL_NAMES = ("lgb", "xgb", "cat")


def _rows_at(values, positions):
    """Return the rows of ``values`` at the given integer positions."""
    # pandas objects must be indexed positionally: plain ``series[positions]`` is
    # label-based and only matches positions when the index is 0..n-1.
    if hasattr(values, "iloc"):
        return values.iloc[positions]
    return np.asarray(values)[positions]


def _fit_predict_lgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one LightGBM fold and return (validation predictions, test predictions)."""
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)

    # NOTE(review): 'nthread' and 'n_jobs' look like two spellings of the same thread
    # setting with different values, and 'silent' overlaps with 'verbose' — confirm
    # which values should apply. They are kept unchanged here.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': 2020,
        'nthread': 28,
        'n_jobs': 24,
        'silent': True,
        'verbose': -1,
    }

    # Up to 50000 rounds; progress is logged every 100 rounds and training stops
    # after 200 rounds without improvement.
    model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                      callbacks=[log_evaluation(period=100), early_stopping(stopping_rounds=200)])
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    return val_pred, test_pred


def _fit_predict_xgb(clf, trn_x, trn_y, val_x, val_y, test_matrix):
    """Train one XGBoost fold and return (validation predictions, test predictions)."""
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 1,
              'min_child_weight': 1.5,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.04,
              'tree_method': 'exact',
              'seed': 2020,
              'nthread': 36,
              "silent": True,
              }

    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

    model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                      verbose_eval=200, early_stopping_rounds=200)
    val_pred = model.predict(valid_matrix)
    test_pred = model.predict(test_matrix)
    return val_pred, test_pred


def _fit_predict_cat(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one CatBoost fold and return (validation predictions, test predictions)."""
    # NOTE(review): ``cat_model`` in this notebook passes CatBoostRegressor for a 0/1
    # label, so the predictions are regression outputs rather than class
    # probabilities — confirm this is intended.
    params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
              'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

    model = clf(iterations=20000, **params)
    model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
              cat_features=[], use_best_model=True, verbose=500)

    val_pred = model.predict(val_x)
    test_pred = model.predict(test_x)
    return val_pred, test_pred


def cv_model(clf, train_x, train_y, test_x, clf_name, folds=5, seed=2020):
    """Cross-validate one gradient-boosting library and collect its predictions.

    Parameters
    ----------
    clf : the library handle used for training: the ``lightgbm`` module for "lgb",
        the ``xgboost`` module for "xgb", or a CatBoost estimator class for "cat".
    train_x : training features; must expose ``.shape`` (a DataFrame in this notebook).
    train_y : training labels aligned row-by-row with ``train_x``.
    test_x : test features scored by every fold model.
    clf_name : one of "lgb", "xgb" or "cat"; selects the training branch.
    folds : number of KFold splits (default 5).
    seed : random state for the shuffled KFold split (default 2020).

    Returns
    -------
    (train, test) : ``train`` holds the out-of-fold prediction for every training
        row; ``test`` is the average of the per-fold test predictions.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported names.
    """
    # Fail fast: previously an unknown name only surfaced later as an unbound
    # ``val_pred`` inside the loop.
    if clf_name not in _CV_MODEL_NAMES:
        raise ValueError(
            "unsupported clf_name %r; expected one of %s" % (clf_name, ", ".join(_CV_MODEL_NAMES))
        )

    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    # A DMatrix cannot be built from another DMatrix, so the test features are
    # converted once up front instead of inside the fold loop.
    xgb_test = clf.DMatrix(data=test_x) if clf_name == "xgb" else None

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = _rows_at(train_x, train_index), _rows_at(train_x, valid_index)
        trn_y, val_y = _rows_at(train_y, train_index), _rows_at(train_y, valid_index)

        if clf_name == "lgb":
            val_pred, test_pred = _fit_predict_lgb(clf, trn_x, trn_y, val_x, val_y, test_x)
        elif clf_name == "xgb":
            val_pred, test_pred = _fit_predict_xgb(clf, trn_x, trn_y, val_x, val_y, xgb_test)
        else:
            val_pred, test_pred = _fit_predict_cat(clf, trn_x, trn_y, val_x, val_y, test_x)

        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [122]:
def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the LightGBM module under the name "lgb".

    Reference: https://www.heywhale.com/mw/project/6585325cdcad99bb0a1f4686

    Returns the two values produced by ``cv_model``, in the same order.
    """
    train_pred, test_pred = cv_model(
        clf=lgb, train_x=x_train, train_y=y_train, test_x=x_test, clf_name="lgb"
    )
    return train_pred, test_pred

def xgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the XGBoost module under the name "xgb".

    Reference: https://www.cnblogs.com/Mephostopheles/p/18397154

    Returns the two values produced by ``cv_model``, in the same order.
    """
    train_pred, test_pred = cv_model(
        clf=xgb, train_x=x_train, train_y=y_train, test_x=x_test, clf_name="xgb"
    )
    return train_pred, test_pred

def cat_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the ``CatBoostRegressor`` class under the name "cat".

    Reference: https://mp.weixin.qq.com/s/xloTLr5NJBgBspMQtxPoFA

    Returns the two values produced by ``cv_model``, in the same order.
    """
    train_pred, test_pred = cv_model(
        clf=CatBoostRegressor, train_x=x_train, train_y=y_train, test_x=x_test, clf_name="cat"
    )
    return train_pred, test_pred

In [129]:
# Randomized hyper-parameter search for LightGBM.
# Candidate values sampled for each parameter:
param_dist = {
    'boosting_type': ['gbdt', 'dart'],  # boosting type: gradient boosted decision trees (gbdt) or Dropouts meet Multiple Additive Regression Trees (dart)
    'objective': ['binary'],  # objective: binary classification only
    'num_leaves': range(20, 150),  # number of leaves per tree
    'learning_rate': [0.01, 0.05, 0.1],  # learning rate
    'feature_fraction': [0.6, 0.8, 1.0],  # fraction of features sampled
    'bagging_fraction': [0.6, 0.8, 1.0],  # fraction of rows sampled
    'bagging_freq': range(0, 80),  # row-sampling frequency
    'verbose': [-1]  # training log verbosity; -1 is used here to silence it
}

# Base estimator; every sampled parameter set is applied on top of its defaults.
model = lgb.LGBMClassifier()


# Candidates are ranked by ROC AUC so the search optimises the same metric that the
# cross-validation code in this notebook reports (roc_auc_score); without `scoring`
# the search would fall back to the classifier's default score.
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_dist,  # parameter candidates
                                   n_iter=100,
                                   scoring='roc_auc',
                                   cv=5,  # 5-fold cross-validation
                                   verbose=2,
                                   random_state=42,
                                   n_jobs=-1)
# Run the search on the training split.
random_search.fit(train_data, target)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bagging_fraction=0.6, bagging_freq=52, boosting_type=gbdt, feature_fraction=0.6, learning_rate=0.1, num_leaves=38, objective=binary, verbose=-1; total time=  20.6s
[CV] END bagging_fraction=0.6, bagging_freq=52, boosting_type=gbdt, feature_fraction=0.6, learning_rate=0.1, num_leaves=38, objective=binary, verbose=-1; total time=  21.5s
[CV] END bagging_fraction=0.6, bagging_freq=52, boosting_type=gbdt, feature_fraction=0.6, learning_rate=0.1, num_leaves=38, objective=binary, verbose=-1; total time=  22.3s
[CV] END bagging_fraction=0.6, bagging_freq=52, boosting_type=gbdt, feature_fraction=0.6, learning_rate=0.1, num_leaves=38, objective=binary, verbose=-1; total time=  22.4s
[CV] END bagging_fraction=0.6, bagging_freq=52, boosting_type=gbdt, feature_fraction=0.6, learning_rate=0.1, num_leaves=38, objective=binary, verbose=-1; total time=  22.4s
[CV] END bagging_fraction=0.8, bagging_freq=76, boosting_type=gbdt, feat

In [125]:
# Train the LightGBM cross-validated model; the XGBoost and CatBoost runs are
# left disabled below.
lgb_train, lgb_test = lgb_model(x_train=train_data, y_train=target, x_test=test_data)
# xgb_train, xgb_test = xgb_model(train_data, target, test_data)
# cat_train, cat_test = cat_model(train_data, target, test_data)

************************************ 1 ************************************
0:	learn: 0.3985439	test: 0.3966321	best: 0.3966321 (0)	total: 93.8ms	remaining: 31m 15s
500:	learn: 0.3762061	test: 0.3748515	best: 0.3748515 (500)	total: 10.2s	remaining: 6m 36s
1000:	learn: 0.3747082	test: 0.3741543	best: 0.3741543 (1000)	total: 19.2s	remaining: 6m 4s
1500:	learn: 0.3736793	test: 0.3738855	best: 0.3738855 (1500)	total: 28.9s	remaining: 5m 56s
2000:	learn: 0.3728107	test: 0.3737081	best: 0.3737078 (1999)	total: 39s	remaining: 5m 51s
2500:	learn: 0.3720129	test: 0.3736004	best: 0.3736004 (2500)	total: 48.9s	remaining: 5m 42s
3000:	learn: 0.3712533	test: 0.3735040	best: 0.3735035 (2997)	total: 58.1s	remaining: 5m 29s
3500:	learn: 0.3705277	test: 0.3734583	best: 0.3734562 (3495)	total: 1m 6s	remaining: 5m 12s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3734351314
bestIteration = 3741

Shrink model to first 3742 iterations.
[0.7368030620965964]
****************************

In [126]:
# Disabled submission export: fills the sample submission's "isDefault" column
# with test predictions and writes the result to a new CSV.
# NOTE(review): this references `cat_test`, which is only defined when the
# cat_model call in the training cell is enabled — confirm before re-enabling.
# sample_result = pd.read_csv("./data/sample_submit.csv")
# sample_result["isDefault"] = cat_test
# sample_result.to_csv("./data/sample_result_.csv", index=False)