In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection

In [50]:
cust_data = pd.read_csv('data_transform.csv')
cust_data.shape

(100233, 126)

### Use selected features

In [51]:
cust_data = cust_data[['TARGET',
                       'AGE',
                       'ARPU',
                       'AUTR_FAIL_MCNT',
                       'AVG_CALL_TIME',
                       'AVG_CALL_FREQ',
                       'CPT_LNIF_AMT',
                       'CRDT_CARD',
                       'CRDT_CARD_CNT',
                       'CRDT_OCCR_MDIF',
                       'CRMM_OVDU_AMT',
                       'CTCD_OCCR_MDIF',
                       'HIGH_AMT_RATE',
                       'L_H_RATE',
                       'LNIF_CNT',
                       'LOW_AMT_RATE',
                       'LT1Y_CLOD_RATE',
                       'LT1Y_CTLT_CNT',
                       'LT1Y_MXOD_AMT',
                       'MOBL_PRIN',
                       'MON_TLFE_AMT',
                       'NUM_DAY_SUSP',
                       'OVDU_HIGH_RATE',
                       'PAYM_METD_G',
                       'PREM_OVDU_RATE',
                       'SPTCT_OCCR_MDIF',
                       'TEL_CNTT_QTR',
                       'TEL_OVDU_RATE',
                       'TOT_LNIF_AMT',
                       'TOT_LOAN_CNT',
                       'TOTAL_DELAY_RATE',
                       'CPT_LNIF_BIG',
                       'CPT_LNIF_RATIO',
                       'CRDT_CARD',
                       'CRDT_GRAD_DIFF',
                       'DTI',
                       'FAIL_COUNT',
                       'HIGH_AMT_RATE',
                       'HSHD_INFR_INCM',
                       'LNIF_CNT',
                       'LOW_AMT_RATE',
                       'LINE_STUS_S',
                       'LINE_STUS_U'
]]

In [52]:
cust_data.head()

Unnamed: 0,TARGET,AGE,ARPU,AUTR_FAIL_MCNT,AVG_CALL_TIME,AVG_CALL_FREQ,CPT_LNIF_AMT,CRDT_CARD,CRDT_CARD_CNT,CRDT_OCCR_MDIF,...,CRDT_CARD.1,CRDT_GRAD_DIFF,DTI,FAIL_COUNT,HIGH_AMT_RATE,HSHD_INFR_INCM,LNIF_CNT,LOW_AMT_RATE,LINE_STUS_S,LINE_STUS_U
0,0,50,30000,10,450,493,0,26,2,1,...,26,0,1.667,10,9002.0,7700,1,0.0,0,1
1,0,50,30000,0,81,22,0,242,2,0,...,242,0,4.364,0,1.0,8100,1,0.0,0,1
2,0,60,30000,0,139,17,3001,484,4,1,...,484,0,0.0,0,9001.0,4900,6,0.200053,0,1
3,1,35,30000,0,1118,0,3001,244,4,1,...,244,0,0.0,0,3001.0,10100,8,0.500083,1,0
4,0,45,50000,0,396,354,0,97,1,1,...,97,0,4.375,0,15002.0,4800,4,0.0,0,1


In [53]:
def model_performance(y_test, y_pred):    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred).round(3)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label=1).round(3)))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label=1).round(3)))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label=1).round(3)))

## 연체자 data duplicate

In [54]:
cust = cust_data[cust_data['TARGET'] == 0]
cust_overdue = cust_data[cust_data['TARGET'] == 1]     # 연체자

print(cust.shape[0])
print(cust_overdue.shape[0])

95946
4287


In [55]:
variables = cust_data.drop('TARGET', 1).columns
variables

Index(['AGE', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_TIME', 'AVG_CALL_FREQ',
       'CPT_LNIF_AMT', 'CRDT_CARD', 'CRDT_CARD_CNT', 'CRDT_OCCR_MDIF',
       'CRMM_OVDU_AMT', 'CTCD_OCCR_MDIF', 'HIGH_AMT_RATE', 'L_H_RATE',
       'LNIF_CNT', 'LOW_AMT_RATE', 'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT',
       'LT1Y_MXOD_AMT', 'MOBL_PRIN', 'MON_TLFE_AMT', 'NUM_DAY_SUSP',
       'OVDU_HIGH_RATE', 'PAYM_METD_G', 'PREM_OVDU_RATE', 'SPTCT_OCCR_MDIF',
       'TEL_CNTT_QTR', 'TEL_OVDU_RATE', 'TOT_LNIF_AMT', 'TOT_LOAN_CNT',
       'TOTAL_DELAY_RATE', 'CPT_LNIF_BIG', 'CPT_LNIF_RATIO', 'CRDT_CARD',
       'CRDT_GRAD_DIFF', 'DTI', 'FAIL_COUNT', 'HIGH_AMT_RATE',
       'HSHD_INFR_INCM', 'LNIF_CNT', 'LOW_AMT_RATE', 'LINE_STUS_S',
       'LINE_STUS_U'],
      dtype='object')

### ML modeling

In [56]:
# train / test set 분리
x = cust_data.drop('TARGET', axis=1)
y = cust_data['TARGET']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

print('연체자 수 train : {} / {}'.format(sum(y_train == 1), y_train.shape[0]))
print('연체자 수 test  : {} / {}'.format(sum(y_test  == 1), y_test.shape[0]))

연체자 수 train : 3439 / 80186
연체자 수 test  : 848 / 20047


In [57]:
# train set 중에서 연체자만 추출
x_overdue = x_train[y_train == 1]
y_overdue = y_train[y_train == 1]
print(x_overdue.shape)
print(y_overdue.shape)

(3439, 42)
(3439,)


In [58]:
max_epoch = 5

for i in range(max_epoch+1):
    x2_train, x2_test, y2_train, y2_test = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
    
    print('========================================= ', i)
    print('test {} : test overdue {}'.format(y2_test.shape[0], sum(y2_test == 1)))
    print('')
    
#     clf = DecisionTreeClassifier()
    clf = GradientBoostingClassifier(n_estimators=500, random_state=0)
    
    clf.fit(x2_train, y2_train)
    
#     y_pred = clf.predict(x2_test)
#     model_performance(y2_test, y_pred)
    
#     varDic = {'var':variables, 'importance':clf.feature_importances_}
#     importance = pd.DataFrame(varDic)
#     print(importance.sort_values(by='importance', ascending=False).head(20))

    x_train = pd.concat([x_train, x_overdue])
    y_train = pd.concat([y_train, y_overdue])

    print('------------------------')
    y_pred_0 = clf.predict(x_test)
    model_performance(y_test, y_pred_0)
    print('------------------------')


test 16038 : test overdue 720

------------------------
confusion matrix
[[19055   144]
 [  662   186]]
accuracy : 0.96
precision : 0.564
recall : 0.219
F1 : 0.316
------------------------
test 16725 : test overdue 1415

------------------------
confusion matrix
[[18897   302]
 [  558   290]]
accuracy : 0.957
precision : 0.49
recall : 0.342
F1 : 0.403
------------------------
test 17413 : test overdue 2095

------------------------
confusion matrix
[[18726   473]
 [  483   365]]
accuracy : 0.952
precision : 0.436
recall : 0.43
F1 : 0.433
------------------------
test 18101 : test overdue 2740

------------------------
confusion matrix
[[18528   671]
 [  437   411]]
accuracy : 0.945
precision : 0.38
recall : 0.485
F1 : 0.426
------------------------
test 18789 : test overdue 3403

------------------------
confusion matrix
[[18375   824]
 [  396   452]]
accuracy : 0.939
precision : 0.354
recall : 0.533
F1 : 0.426
------------------------
test 19477 : test overdue 4161

------------------