In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import logging
import warnings
from pathlib import Path 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cgb
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization

# Notebook-wide display and logging configuration.

# Show every column of wide frames and render floats with 4 decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda value: '%.4f' % value)

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Timestamped INFO-level logging on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)-15s %(levelname)s: %(message)s',
)

### 加载数据

In [4]:
# Read the prepared modelling table (train and test rows together, told apart
# later by the `sample` column) from its feather file.
feather_path = 'data/data_for_model_v2.feather'
data = pd.read_feather(feather_path)

logging.info(f'data shape: {data.shape}')

# Preview the first rows as the cell output.
data.head()

2021-01-26 10:25:21,268 INFO: data shape: (1000000, 77)


Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,years_between_issueDate_and_earliesCreditLine,issueDate_to_now_days,earliesCreditLine_to_now_days,ficoRange,revolBal_p_totalAcc,totalAcc_m_openAcc,loanAmnt_term,annualIncome_loanAmnt_term,debt,pro,installment_annualIncome,loanAmnt_applicationType,eny_num,int_sub,pro_dti,top1,rec,rec_rate,n_feat_min,n_feat_max,n_feat_sum,n_feat_mean,n_feat_median,n_feat_skew,n_feat_std,n_feat_mode,n_feat_range,n_feat_Q25,n_feat_Q75,sample
0,0,10.4609,5,3.0215,6.8242,4,21,5.7695,2.0,2,11.6094,2,2014-07-01,1.0,1,4.9258,3.4961,2.8926,0.0,730.0,734.0,2.0801,0.0,0.0,10.0938,3.9102,3.332,0,0,Aug-2001,0.6934,1.0,0.0,2.0,2.0,2.0,4.0,2.3027,2.1973,1.6094,2.5645,2.0,2.0801,0.0,0.0,0.0,2.0,2.6387,7.7852,8.8672,4.0,10.0938,3.0449,8.8516,2.8164,14.4453,4.5898,0.0083,10.4609,6.3086,0.6573,1.9057,14.3828,0.0,0.0,0.0,12.0,52.0,3.7148,2.0,0.9512,3.8516,1.0,12.0,0.5,6.25,train
1,1,9.7969,5,2.9707,6.1367,3,16,12.2969,5.0,0,10.7344,2,2012-08-01,0.0,0,5.0547,2.9453,3.3613,0.0,700.0,704.0,2.6387,0.0,0.0,9.625,3.6855,2.9453,1,0,May-2002,7.4531,1.0,,,,,10.0,,,,,,2.6387,,,,,2.3984,8.0391,8.8281,4.0,9.625,1.792,8.1875,2.623,14.0625,4.5391,0.01,9.7969,6.5312,0.7681,1.4637,14.5938,0.0,0.0,10.0,13.0,23.0,11.5,11.5,,2.1211,11.5,3.0,10.75,12.25,train
2,2,9.3906,5,2.8906,5.6992,3,17,10.3672,8.0,0,11.2109,2,2015-10-01,0.0,0,5.8242,2.709,3.168,0.0,675.0,679.0,2.4844,0.0,0.0,8.4375,3.9668,3.332,0,0,May-2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,3.0918,1.6094,1.792,3.0,2.4844,0.0,0.0,0.0,4.0,2.3027,7.5742,8.5938,4.0,8.4375,2.834,7.7852,3.4609,14.3359,4.4531,0.004,9.3906,6.3945,0.6929,1.5541,14.0938,0.0,0.0,0.0,21.0,51.0,3.6426,1.5,2.3223,5.8906,0.0,21.0,0.0,4.0,train
3,3,9.3047,3,2.1113,5.8359,0,3,10.7578,10.0,1,11.6797,1,2015-08-01,0.0,4,5.0039,2.4844,2.9023,0.0,685.0,689.0,2.3027,0.0,0.0,9.2031,3.9824,3.3672,1,0,May-1999,1.6094,1.0,6.0,4.0,6.0,6.0,4.0,2.834,1.6094,2.0801,3.0918,6.0,2.3027,0.0,0.0,0.0,1.0,2.834,7.6055,8.9766,4.0,9.2109,2.9961,8.2109,3.502,14.5234,3.125,0.0029,9.3047,6.2109,1.2296,0.8178,14.1406,0.0,0.0,0.0,21.0,84.0,6.0,5.0,1.4424,6.0781,3.334,21.0,1.75,6.75,train
4,4,8.0078,3,2.6387,4.625,2,11,4.0078,,1,10.2734,2,2016-03-01,0.0,10,5.7109,3.0918,3.502,0.0,690.0,694.0,2.5645,0.0,0.0,7.9883,3.4961,3.332,0,0,Aug-1977,2.4844,1.0,1.0,2.0,7.0,7.0,2.0,1.6094,2.3027,2.3984,2.7734,7.0,2.5645,0.0,0.0,0.0,4.0,3.6895,7.4922,9.6719,4.0,7.9961,2.7734,6.9102,3.4004,13.7422,3.6875,0.0035,8.0078,6.0117,0.7797,0.7938,14.0312,0.0,0.0,0.0,15.0,73.0,5.2148,4.0,0.6641,4.8711,0.0,15.0,1.25,8.5,train


In [5]:
# Columns that must not be fed to the model: the identifier, the two date
# columns, the target itself, and the train/test marker.
dropping_cols = ['id', 'issueDate', 'isDefault', 'earliesCreditLine', 'sample']

# Row selectors for the two partitions, driven by the `sample` column.
train_mask = data['sample'] == 'train'
test_mask = data['sample'] == 'test'

X_train = data[train_mask].drop(columns=dropping_cols)
X_test = data[test_mask].drop(columns=dropping_cols)
y_train = data.loc[train_mask, 'isDefault']

logging.info(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}')

2021-01-26 10:25:34,299 INFO: X_train: (800000, 72), X_test: (200000, 72), y_train: (800000,)


### LGB

In [6]:
def lgb_model(params, X_train, y_train, X_test, y_test=None, folds=5, SEED=2021):
    """Train LightGBM with K-fold cross-validation and average the test predictions.

    For every fold a booster is trained on the training part, early-stopped on
    the held-out part, scored with ROC AUC on that held-out part and used to
    predict ``X_test``. Per-fold AUCs and their mean/std are printed.

    Args:
        params: LightGBM parameter dict. A copy is used, so the caller's dict
            is not modified; ``'seed'`` is set to ``SEED`` on the copy.
        X_train: Training features; rows are selected positionally (``.iloc``).
        y_train: Training labels aligned row-for-row with ``X_train``
            (pandas Series or any array-like).
        X_test: Features to predict.
        y_test: Unused; kept so existing callers keep working.
        folds: Number of KFold splits.
        SEED: Seed used for both the KFold shuffle and LightGBM.

    Returns:
        Array with the mean of the per-fold predictions for ``X_test``.
    """
    # Work on a copy: the original wrote 'seed' into the caller's dict.
    params = dict(params, seed=SEED)

    # KFold yields *positional* indices. Indexing a Series with `y[idx]` is
    # label-based and only matches when the index happens to be 0..n-1, so
    # take the labels as a plain array and index that positionally instead.
    y_values = np.asarray(y_train)

    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    cv_scores = []
    test_preds = None
    for i, (trn_index, val_index) in enumerate(kf.split(X_train, y_values)):
        trn_x, val_x = X_train.iloc[trn_index], X_train.iloc[val_index]
        trn_y, val_y = y_values[trn_index], y_values[val_index]

        train_matrix = lgb.Dataset(trn_x, label=trn_y)
        valid_matrix = lgb.Dataset(val_x, label=val_y)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer releases expect
        # `callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)]`.
        # Left unchanged to match the version this notebook was run with.
        model = lgb.train(
            params,
            train_matrix,
            20000,
            valid_sets=[train_matrix, valid_matrix],
            verbose_eval=500,
            early_stopping_rounds=500,
        )

        # Predict with the early-stopped iteration, not the last one trained.
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(X_test, num_iteration=model.best_iteration)

        # Running mean of the test predictions over all folds.
        if test_preds is None:
            test_preds = test_pred / kf.n_splits
        else:
            test_preds += test_pred / kf.n_splits

        roc_auc = roc_auc_score(val_y, val_pred)
        cv_scores.append(roc_auc)
        print(f"{i} fold's ROC_AUC: {roc_auc}")

    print(f"cv score list: {cv_scores}")
    print(f"cv score mean: {np.mean(cv_scores)}")
    print(f"cv score std: {np.std(cv_scores)}")
    return test_preds

In [7]:
# Objective, evaluation metric, learning rate and runtime options.
core_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'n_jobs': -1,
    'verbose': -1,
}

# Sampling, tree-shape and regularisation parameters.
# NOTE(review): values presumably come from a tuning run not shown here.
# Also, with 'max_depth': 3 a tree can have at most 8 leaves, so
# 'num_leaves': 12 looks like it is never reached — confirm intent.
tree_params = {
    'bagging_fraction': 0.91,
    'bagging_freq': 1,
    'feature_fraction': 0.85,
    'max_depth': 3,
    'min_child_weight': 8.37,
    'min_data_in_leaf': 131,
    'min_split_gain': 0.13,
    'num_leaves': 12,
    'reg_alpha': 9.7,
    'reg_lambda': 2.02,
}

# Merge in the original key order (core options first, then tree options).
params = {**core_params, **tree_params}

test_preds = lgb_model(params, X_train, y_train, X_test)

Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.739489	valid_1's auc: 0.734789
[1000]	training's auc: 0.746127	valid_1's auc: 0.736952
[1500]	training's auc: 0.751063	valid_1's auc: 0.73785
[2000]	training's auc: 0.755135	valid_1's auc: 0.738183
[2500]	training's auc: 0.758748	valid_1's auc: 0.738208
Early stopping, best iteration is:
[2435]	training's auc: 0.758296	valid_1's auc: 0.738264
0 fold's ROC_AUC: 0.7382637274248407
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.739352	valid_1's auc: 0.734249
[1000]	training's auc: 0.74605	valid_1's auc: 0.736614
[1500]	training's auc: 0.750952	valid_1's auc: 0.737527
[2000]	training's auc: 0.75498	valid_1's auc: 0.737905
[2500]	training's auc: 0.758469	valid_1's auc: 0.73802
[3000]	training's auc: 0.761762	valid_1's auc: 0.737947
Early stopping, best iteration is:
[2514]	training's auc: 0.758567	valid_1's auc: 0.73804
1 fold's ROC_AUC: 0.7380402950641431
Training un

### 生成提交文件

In [8]:
# Build the submission DataFrame: one row per test id with its predicted
# default probability.
test_ids = data.loc[data['sample'] == 'test', 'id'].values
df_result = pd.DataFrame({'id': test_ids, 'isDefault': test_preds})

# Create the output directory first so a fresh checkout does not fail on
# to_csv after the (long) training run has already finished.
submission_path = Path('data/tc/pred_lgb2.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
df_result.to_csv(submission_path, index=False)

# Preview the ten smallest ids as the cell output.
df_result.sort_values(by='id').head(10)

Unnamed: 0,id,isDefault
0,800000,0.0668
1,800001,0.3124
2,800002,0.6493
3,800003,0.3191
4,800004,0.3974
5,800005,0.0218
6,800006,0.3335
7,800007,0.0349
8,800008,0.6568
9,800009,0.0326
