In [11]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import logging
import warnings
from pathlib import Path 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cgb
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization

from utils import reduce_mem_usage

# Render every DataFrame column and format floats with four decimals.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = lambda value: '%.4f' % value
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Root logger: INFO and above, prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)-15s %(levelname)s: %(message)s',
    level=logging.INFO,
)

### 加载数据

In [2]:
# Read the modelling table and pass it straight through the project helper
# that reduces its memory usage, so `data` is only ever the optimised frame.
data = reduce_mem_usage(pd.read_csv('data/data_for_model_v1.csv'))
logging.info(f'data shape: {data.shape}')
data.head()

Memory usage of dataframe is 587.46 MB
2021-01-25 21:19:39,769 INFO: data shape: (1000000, 77)
Memory usage after optimization is: 167.87 MB
Decreased by 71.4%


Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,years_between_issueDate_and_earliesCreditLine,issueDate_to_now_days,earliesCreditLine_to_now_days,ficoRange,revolBal_p_totalAcc,totalAcc_m_openAcc,loanAmnt_term,annualIncome_loanAmnt_term,debt,pro,installment_annualIncome,loanAmnt_applicationType,eny_num,int_sub,pro_dti,top1,rec,rec_rate,n_feat_min,n_feat_max,n_feat_sum,n_feat_mean,n_feat_median,n_feat_skew,n_feat_std,n_feat_mode,n_feat_range,n_feat_Q25,n_feat_Q75,sample
0,0,35008.0,5,19.5156,918.0,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1.0,1,137.0,32,17.0469,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9062,27.0,0,0,Aug-2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,13.0,2394,7111,4.0,24205.0,20.0,7000.0,15.7109,1875500.0,97.625,0.0083,35008.0,547.0,0.9295,5.7243,1757196.0,0.0,0.0,0.0,12.0,52.0,3.7148,2.0,0.9512,3.8516,1.0,12.0,0.5,6.25,train
1,1,18000.0,5,18.4844,462.0,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0.0,0,156.0,18,27.8281,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9062,18.0,1,0,May-2002,1723.0,1.0,,,,,10.0,,,,,,13.0,,,,,10.0,3093,6838,4.0,15114.0,5.0,3600.0,12.7812,1280180.0,92.4375,0.01,18000.0,684.0,1.1556,3.322,2177472.0,0.0,0.0,10.0,13.0,23.0,11.5,11.5,,2.1211,11.5,3.0,10.75,12.25,train
2,2,12000.0,5,16.9844,298.25,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0.0,0,337.0,14,22.7656,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8125,27.0,0,0,May-2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,9.0,1937,5377,4.0,4633.0,16.0,2400.0,30.8281,1684980.0,84.9375,0.004,12000.0,597.5,0.9994,3.7308,1315223.0,0.0,0.0,0.0,21.0,51.0,3.6426,1.5,2.3223,5.8906,0.0,21.0,0.0,4.0,train
3,3,11000.0,3,7.2617,341.0,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0.0,4,148.0,11,17.2031,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.5938,28.0,1,0,May-1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,16.0,1998,7934,4.0,9976.0,19.0,3666.0,32.1875,2030780.0,21.7812,0.0029,11000.0,496.0,2.42,1.2655,1376622.0,0.0,0.0,0.0,21.0,84.0,6.0,5.0,1.4424,6.0781,3.334,21.0,1.75,6.75,train
4,4,3000.0,3,12.9922,101.0625,2,11,54.0,,1,29000.0,2,2016-03-01,0.0,10,301.0,21,32.1562,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,Aug-1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,39.0,1785,15877,4.0,2969.0,15.0,1000.0,29.0,932640.0,38.9688,0.0035,3000.0,407.0,1.1809,1.2118,1238790.0,0.0,0.0,0.0,15.0,73.0,5.2148,4.0,0.6641,4.8711,0.0,15.0,1.25,8.5,train


In [6]:
# Columns excluded from the feature matrices: the target ('isDefault'),
# the split marker ('sample'), and the id / raw date-string columns.
dropping_cols = ['id', 'issueDate', 'isDefault', 'earliesCreditLine', 'sample']

# Row masks for the two partitions, keyed on the 'sample' column.
is_train = data['sample'] == 'train'
is_test = data['sample'] == 'test'

X_train = data[is_train].drop(columns=dropping_cols)
X_test = data[is_test].drop(columns=dropping_cols)
y_train = data.loc[is_train, 'isDefault']

logging.info(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}')

2021-01-25 21:43:54,816 INFO: X_train: (800000, 72), X_test: (200000, 72), y_train: (800000,)


### LGB

In [17]:
def lgb_model(params, X_train, y_train, X_test, y_test=None, folds=5, SEED=2021):
    """Train LightGBM with shuffled K-fold CV and return averaged test predictions.

    For every fold a booster is trained on the training split (up to 20000
    rounds, stopping after 500 rounds without validation improvement), the
    held-out split is scored with ROC AUC, and the prediction for ``X_test``
    is accumulated with weight ``1 / folds``.  Per-fold scores and their
    mean / standard deviation are printed.

    Parameters
    ----------
    params : dict
        LightGBM training parameters.  The dict is not modified; a copy with
        ``'seed'`` set to ``SEED`` is passed to ``lgb.train``.
    X_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y_train : array-like
        Labels aligned row-by-row with ``X_train``.  Only the values are
        used, so a pandas index of any kind is accepted.
    X_test : pandas.DataFrame
        Features to predict with each fold's model.
    y_test : optional
        Unused; kept so existing callers keep working.
    folds : int
        Number of K-fold splits.
    SEED : int
        Seed for both the fold shuffling and LightGBM.

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``X_test``.
    """
    # Copy instead of writing into the caller's dict.
    params = {**params, 'seed': SEED}
    # KFold yields *positions*; indexing a Series with them is label-based and
    # only works when the index happens to be 0..n-1, so use a plain array.
    labels = np.asarray(y_train)
    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    # LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds` keyword
    # arguments of `lgb.train`; the callbacks below are the equivalent and also
    # exist on older releases (`print_evaluation` is the pre-3.3 logging name).
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    cv_scores = []
    test_preds = None
    for i, (trn_index, val_index) in enumerate(kf.split(X_train, labels)):
        trn_x, val_x = X_train.iloc[trn_index], X_train.iloc[val_index]
        trn_y, val_y = labels[trn_index], labels[val_index]
        train_matrix = lgb.Dataset(trn_x, label=trn_y)
        valid_matrix = lgb.Dataset(val_x, label=val_y)
        model = lgb.train(
            params,
            train_matrix,
            20000,
            valid_sets=[train_matrix, valid_matrix],
            callbacks=[lgb.early_stopping(500), log_evaluation(500)],
        )
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(X_test, num_iteration=model.best_iteration)

        # Running average of the test predictions over the folds.
        if test_preds is None:
            test_preds = test_pred / kf.n_splits
        else:
            test_preds += test_pred / kf.n_splits
        fpr, tpr, threshold = metrics.roc_curve(val_y, val_pred)
        roc_auc = metrics.auc(fpr, tpr)
        cv_scores.append(roc_auc)
        print(f"{i} fold's ROC_AUC: {roc_auc}")

    print(f"cv score list: {cv_scores}")
    print(f"cv score mean: {np.mean(cv_scores)}")
    print(f"cv score std: {np.std(cv_scores)}")
    return test_preds

In [18]:
params = dict(
    # Fixed training setup
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.1,
    n_jobs=-1,
    verbose=-1,

    # Parameters
    bagging_fraction=0.91,
    bagging_freq=1,
    feature_fraction=0.85,
    max_depth=3,
    min_child_weight=8.37,
    min_data_in_leaf=131,
    min_split_gain=0.13,
    num_leaves=12,
    reg_alpha=9.7,
    reg_lambda=2.02,
)

# Cross-validated training; the result is the fold-averaged test prediction.
test_preds = lgb_model(params, X_train, y_train, X_test)

Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.7395	valid_1's auc: 0.734714
[1000]	training's auc: 0.746101	valid_1's auc: 0.736785
[1500]	training's auc: 0.751046	valid_1's auc: 0.737584
[2000]	training's auc: 0.755128	valid_1's auc: 0.737961
[2500]	training's auc: 0.758688	valid_1's auc: 0.738056
[3000]	training's auc: 0.761992	valid_1's auc: 0.737946
Early stopping, best iteration is:
[2702]	training's auc: 0.760058	valid_1's auc: 0.738057
0 fold's ROC_AUC: 0.7380566705417793
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.739464	valid_1's auc: 0.734377
[1000]	training's auc: 0.746146	valid_1's auc: 0.736726
[1500]	training's auc: 0.751023	valid_1's auc: 0.737852
[2000]	training's auc: 0.755101	valid_1's auc: 0.738203
[2500]	training's auc: 0.758649	valid_1's auc: 0.738313
[3000]	training's auc: 0.761912	valid_1's auc: 0.738369
Early stopping, best iteration is:
[2975]	training's auc: 0.761756	valid_1's auc

In [19]:
# Show the fold-averaged test-set predictions returned by lgb_model
# (NumPy abbreviates the repr of a long array).
test_preds

array([0.07035907, 0.31636555, 0.64085246, ..., 0.17638334, 0.27219689,
       0.02195831])

### 生成提交文件

In [20]:
# Build the submission DataFrame: ids of the test rows paired with their
# predicted 'isDefault' values, then write it out and preview the first ids.
test_ids = data.loc[data['sample'] == 'test', 'id'].values
df_result = pd.DataFrame({'id': test_ids, 'isDefault': test_preds})
df_result.to_csv('data/tc/pred_lgb.csv', index=False)
df_result.sort_values(by='id').head(10)

Unnamed: 0,id,isDefault
0,800000,0.0704
1,800001,0.3164
2,800002,0.6409
3,800003,0.3067
4,800004,0.3946
5,800005,0.0203
6,800006,0.3276
7,800007,0.0364
8,800008,0.6521
9,800009,0.0336
