In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
%matplotlib inline

In [52]:
cust = pd.read_csv('Dataset.csv')
cust.shape

(100233, 69)

In [53]:
cust.loc[pd.isnull(cust.LAST_CHLD_AGE), 'LAST_CHLD_AGE'] = 0
cust.loc[cust['OCCP_NAME_G'] == '*', 'OCCP_NAME_G'] = '기타'
cust.loc[cust['MATE_OCCP_NAME_G'] == '*', 'MATE_OCCP_NAME_G'] = '기타'
cust.loc[cust['AGE'] == '*', 'AGE'] = 20
cust.loc[cust['SEX'] == '*', 'SEX'] = 0

### 대출건수 통합

In [54]:
loan_cnt = pd.Series(cust[['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'SPART_LNIF_CNT', 'ECT_LNIF_CNT']].sum(axis=1), name='LNIF_CNT')
cust = pd.concat([cust, loan_cnt], axis=1)
cust = cust.drop(['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'SPART_LNIF_CNT', 'ECT_LNIF_CNT'], axis=1)
cust.head()

Unnamed: 0,CUST_ID,TARGET,TOT_LNIF_AMT,TOT_CLIF_AMT,BNK_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,...,MOBL_FATY_PRC,TEL_CNTT_QTR,NUM_DAY_SUSP,CRMM_OVDU_AMT,TLFE_UNPD_CNT,LT1Y_MXOD_AMT,PAYM_METD,LINE_STUS,MOBL_PRIN,LNIF_CNT
0,1,0,9001,9001,9001,0,1,0,2,13,...,800000,20111,0,0,0,0,O,U,580000,1
1,2,0,24001,0,24001,0,0,0,2,121,...,500000,20143,0,0,0,0,O,U,90000,1
2,3,0,15001,9001,0,3001,1,25,4,121,...,500000,20103,0,0,0,0,O,U,120000,6
3,4,1,6001,3001,0,3001,1,25,4,61,...,900000,20144,0,540000,0,630000,G,S,320000,8
4,5,0,21001,15001,21001,0,1,0,1,97,...,800000,20131,0,130000,0,90000,G,U,410000,4


### 대출금액 통합

In [55]:
# 카드캐피탈대출 금액이 신용+은행대출 금액보다 많은 경우
# 대출총액, 신용대출, 은행대출, 카드캐피탈대출

CPT_LNIF_BIG = pd.Series((cust['CPT_LNIF_AMT'] > cust['TOT_CLIF_AMT'] + cust['BNK_LNIF_AMT']) & (cust['CPT_LNIF_AMT'] > 1),  name='CPT_LNIF_BIG')
CPT_LNIF_RATIO = pd.Series(pd.Series.round(cust['CPT_LNIF_AMT'] / cust['TOT_LNIF_AMT'], 3),  name='CPT_LNIF_RATIO')

cust = pd.concat([cust, CPT_LNIF_BIG, CPT_LNIF_RATIO], axis=1)
# cust = cust.drop(['TOT_CLIF_AMT', 'BNK_LNIF_AMT'], axis=1)

cust.head(10)

Unnamed: 0,CUST_ID,TARGET,TOT_LNIF_AMT,TOT_CLIF_AMT,BNK_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,...,NUM_DAY_SUSP,CRMM_OVDU_AMT,TLFE_UNPD_CNT,LT1Y_MXOD_AMT,PAYM_METD,LINE_STUS,MOBL_PRIN,LNIF_CNT,CPT_LNIF_BIG,CPT_LNIF_RATIO
0,1,0,9001,9001,9001,0,1,0,2,13,...,0,0,0,0,O,U,580000,1,False,0.0
1,2,0,24001,0,24001,0,0,0,2,121,...,0,0,0,0,O,U,90000,1,False,0.0
2,3,0,15001,9001,0,3001,1,25,4,121,...,0,0,0,0,O,U,120000,6,False,0.2
3,4,1,6001,3001,0,3001,1,25,4,61,...,0,540000,0,630000,G,S,320000,8,False,0.5
4,5,0,21001,15001,21001,0,1,0,1,97,...,0,130000,0,90000,G,U,410000,4,False,0.0
5,6,0,141001,27001,111001,0,1,1,4,121,...,0,0,0,0,O,U,170000,3,False,0.0
6,7,0,12001,3001,0,9001,121,121,2,121,...,0,120000,0,290000,G,U,720000,4,True,0.75
7,8,0,3001,3001,3001,0,1,0,2,121,...,0,0,0,0,O,U,40000,1,False,0.0
8,9,0,273001,273001,273001,0,37,0,5,121,...,0,0,0,0,O,U,0,2,False,0.0
9,10,0,9001,9001,0,9001,1,1,3,25,...,0,0,0,0,O,U,0,2,False,1.0


### DTI : 대출금액/추정소득

In [59]:
cust[['CUST_JOB_INCM', 'TOT_LNIF_AMT']].head()

Unnamed: 0,CUST_JOB_INCM,TOT_LNIF_AMT
0,5400,9001
1,5500,24001
2,0,15001
3,0,6001
4,4800,21001


In [58]:
cust[cust['TARGET'] == 1]

Unnamed: 0,CUST_ID,TARGET,TOT_LNIF_AMT,TOT_CLIF_AMT,BNK_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,...,NUM_DAY_SUSP,CRMM_OVDU_AMT,TLFE_UNPD_CNT,LT1Y_MXOD_AMT,PAYM_METD,LINE_STUS,MOBL_PRIN,LNIF_CNT,CPT_LNIF_BIG,CPT_LNIF_RATIO
3,4,1,6001,3001,0,3001,1,25,4,61,...,0,540000,0,630000,G,S,320000,8,False,0.500
10,11,1,27001,27001,0,18001,1,25,4,61,...,0,0,0,0,R,U,470000,10,False,0.667
36,37,1,3001,3001,0,1,37,37,1,49,...,0,120000,0,490000,G,U,460000,4,False,0.000
46,47,1,3001,3001,0,1,121,121,1,121,...,0,50000,0,0,O,U,0,4,False,0.000
75,76,1,1,1,0,1,49,49,1,121,...,8,0,0,0,G,U,110000,4,False,1.000
96,97,1,18001,18001,0,0,13,37,1,109,...,0,0,0,0,G,U,40000,4,False,0.000
98,99,1,66001,60001,12001,15001,1,61,3,121,...,2,0,0,40000,O,U,30000,11,False,0.227
137,139,1,15001,15001,15001,0,61,0,0,0,...,35,200000,0,200000,R,U,0,1,False,0.000
149,151,1,15001,9001,3001,12001,1,1,4,73,...,0,220000,0,110000,G,U,600000,7,False,0.800
177,179,1,1,1,1,1,37,37,0,0,...,0,0,0,0,O,U,0,3,False,1.000


In [38]:
# 연속형변수
conti_var = cust.columns[cust.dtypes != 'object'].difference(['CUST_ID'])
print(conti_var)
print(len(conti_var))

Index(['ACTL_FMLY_NUM', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'BNK_LNIF_AMT', 'CB_GUIF_AMT',
       'CB_GUIF_CNT', 'CNTT_LAMT_CNT', 'CPT_LNIF_AMT', 'CPT_LNIF_BIG',
       'CPT_LNIF_RATIO', 'CRDT_CARD_CNT', 'CRDT_LOAN_CNT', 'CRDT_OCCR_MDIF',
       'CRLN_30OVDU_RATE', 'CRLN_OVDU_RATE', 'CRMM_OVDU_AMT', 'CTCD_OCCR_MDIF',
       'CUST_FMLY_NUM', 'CUST_JOB_INCM', 'FMLY_CLAM_CNT', 'FMLY_GDINS_MNPREM',
       'FMLY_PLPY_CNT', 'FMLY_SVINS_MNPREM', 'FMLY_TOT_PREM', 'FYCM_PAID_AMT',
       'GDINS_MON_PREM', 'HSHD_INFR_INCM', 'LAST_CHLD_AGE', 'LNIF_CNT',
       'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT', 'LT1Y_MXOD_AMT', 'LT1Y_SLOD_RATE',
       'LT1Y_STLN_AMT', 'LTST_CRDT_GRAD', 'MATE_JOB_INCM', 'MAX_MON_PREM',
       'MIN_CNTT_DATE', 'MOBL_FATY_PRC', 'MOBL_PRIN', 'MON_TLFE_AMT',
       'NUM_DAY_SUSP', 'PREM_OVDU_RATE', 'SPTCT_OCCR_MDIF', 'STLN_REMN_AMT',
       'STRT_CRDT_GRAD', 'SVINS_MON_PREM', 'TARGET', 'TEL_CNTT_QTR',
       'TLFE_UNPD_CNT', 'TOT_CLIF_A

In [39]:
# 범주형 변수
cate_var = cust.columns[cust.dtypes == 'object']
cate_var

Index(['OCCP_NAME_G', 'MATE_OCCP_NAME_G', 'LT1Y_PEOD_RATE', 'AGE', 'SEX',
       'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD', 'LINE_STUS'],
      dtype='object')

In [40]:
# 범주형 변수를 dummy 변수로 변환
dummy_var = pd.get_dummies(cust[cate_var])

In [41]:
x_data = pd.concat([cust[conti_var], dummy_var], axis=1)
x_data.head()

Unnamed: 0,ACTL_FMLY_NUM,ARPU,AUTR_FAIL_MCNT,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,BNK_LNIF_AMT,CB_GUIF_AMT,CB_GUIF_CNT,CNTT_LAMT_CNT,...,TEL_MBSP_GRAD_R,TEL_MBSP_GRAD_W,CBPT_MBSP_YN_N,CBPT_MBSP_YN_Y,PAYM_METD_G,PAYM_METD_K,PAYM_METD_O,PAYM_METD_R,LINE_STUS_S,LINE_STUS_U
0,4,30000,10,493,450,0,9001,420001,3,0,...,0,0,1,0,0,0,1,0,0,1
1,4,30000,0,22,81,0,24001,0,0,0,...,0,0,1,0,0,0,1,0,0,1
2,4,30000,0,17,139,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
3,2,30000,0,0,1118,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
4,4,50000,0,354,396,95,21001,0,0,0,...,0,1,0,1,1,0,0,0,0,1


In [42]:
def model_performance(y_test, y_pred):    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred).round(3)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label=1).round(3)))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label=1).round(3)))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label=1).round(3)))

## 연체자 data duplicate

In [43]:
cust = x_data[x_data['TARGET'] == 0]
cust_overdue = x_data[x_data['TARGET'] == 1]     # 연체자

print(cust.shape[0])
print(cust_overdue.shape[0])

95946
4287


In [44]:
o_data = pd.DataFrame()

for i in range(20):
    # 연체자 수를 점차 늘려간다.
    o_data = pd.concat([o_data, cust_overdue])
    
    x_data = pd.concat([cust, o_data])
    x = x_data.drop('TARGET', axis=1)
    y = x_data['TARGET']
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
      
    print('---------------------------------------- ', i)
    print('Total {} : overdue           {}'.format(x_data.shape[0], x_data[x_data['TARGET'] == 1].shape[0]))
    print('test  {} : overdue in test   {}'.format(x_test.shape[0], len(y_test[y_test == 1])))
    print('')
    
    tree = DecisionTreeClassifier()
    tree.fit(x_train, y_train)
    y_pred = tree.predict(x_test)
    model_performance(y_test, y_pred)

----------------------------------------  0
Total 100233 : overdue           4287
test  20047 : overdue in test   883

confusion matrix
[[18486   678]
 [  615   268]]
accuracy : 0.936
precision : 0.283
recall : 0.304
F1 : 0.293
----------------------------------------  1
Total 104520 : overdue           8574
test  20904 : overdue in test   1730

confusion matrix
[[18410   764]
 [  242  1488]]
accuracy : 0.952
precision : 0.661
recall : 0.86
F1 : 0.747
----------------------------------------  2
Total 108807 : overdue           12861
test  21762 : overdue in test   2545

confusion matrix
[[18392   825]
 [   90  2455]]
accuracy : 0.958
precision : 0.748
recall : 0.965
F1 : 0.843
----------------------------------------  3
Total 113094 : overdue           17148
test  22619 : overdue in test   3448

confusion matrix
[[18393   778]
 [   24  3424]]
accuracy : 0.965
precision : 0.815
recall : 0.993
F1 : 0.895
----------------------------------------  4
Total 117381 : overdue           21435
t