In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
cust = pd.read_csv('Dataset.csv')
cust.shape

(100233, 69)

In [4]:
cust.loc[pd.isnull(cust.LAST_CHLD_AGE), 'LAST_CHLD_AGE'] = 0
cust.loc[cust['OCCP_NAME_G'] == '*', 'OCCP_NAME_G'] = '기타'
cust.loc[cust['MATE_OCCP_NAME_G'] == '*', 'MATE_OCCP_NAME_G'] = '기타'
cust.loc[cust['AGE'] == '*', 'AGE'] = 20
cust.loc[cust['SEX'] == '*', 'SEX'] = 0

In [5]:
# 연속형변수
conti_var = cust.columns[cust.dtypes != 'object'].difference(['CUST_ID'])
print(conti_var)
print(len(conti_var))

Index(['ACTL_FMLY_NUM', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'BNK_LNIF_AMT', 'BNK_LNIF_CNT',
       'CB_GUIF_AMT', 'CB_GUIF_CNT', 'CNTT_LAMT_CNT', 'CPT_LNIF_AMT',
       'CPT_LNIF_CNT', 'CRDT_CARD_CNT', 'CRDT_LOAN_CNT', 'CRDT_OCCR_MDIF',
       'CRLN_30OVDU_RATE', 'CRLN_OVDU_RATE', 'CRMM_OVDU_AMT', 'CTCD_OCCR_MDIF',
       'CUST_FMLY_NUM', 'CUST_JOB_INCM', 'ECT_LNIF_CNT', 'FMLY_CLAM_CNT',
       'FMLY_GDINS_MNPREM', 'FMLY_PLPY_CNT', 'FMLY_SVINS_MNPREM',
       'FMLY_TOT_PREM', 'FYCM_PAID_AMT', 'GDINS_MON_PREM', 'HSHD_INFR_INCM',
       'LAST_CHLD_AGE', 'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT', 'LT1Y_MXOD_AMT',
       'LT1Y_SLOD_RATE', 'LT1Y_STLN_AMT', 'LTST_CRDT_GRAD', 'MATE_JOB_INCM',
       'MAX_MON_PREM', 'MIN_CNTT_DATE', 'MOBL_FATY_PRC', 'MOBL_PRIN',
       'MON_TLFE_AMT', 'NUM_DAY_SUSP', 'PREM_OVDU_RATE', 'SPART_LNIF_CNT',
       'SPTCT_OCCR_MDIF', 'STLN_REMN_AMT', 'STRT_CRDT_GRAD', 'SVINS_MON_PREM',
       'TARGET', 'TEL_CNTT_QTR', 'TLFE_UN

In [6]:
# 범주형 변수
cate_var = cust.columns[cust.dtypes == 'object']
cate_var

Index(['OCCP_NAME_G', 'MATE_OCCP_NAME_G', 'LT1Y_PEOD_RATE', 'AGE', 'SEX',
       'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD', 'LINE_STUS'],
      dtype='object')

In [7]:
# 범주형 변수를 dummy 변수로 변환
dummy_var = pd.get_dummies(cust[cate_var])

In [8]:
x_data = pd.concat([cust[conti_var], dummy_var], axis=1)
x_data.head()

Unnamed: 0,ACTL_FMLY_NUM,ARPU,AUTR_FAIL_MCNT,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,BNK_LNIF_AMT,BNK_LNIF_CNT,CB_GUIF_AMT,CB_GUIF_CNT,...,TEL_MBSP_GRAD_R,TEL_MBSP_GRAD_W,CBPT_MBSP_YN_N,CBPT_MBSP_YN_Y,PAYM_METD_G,PAYM_METD_K,PAYM_METD_O,PAYM_METD_R,LINE_STUS_S,LINE_STUS_U
0,4,30000,10,493,450,0,9001,1,420001,3,...,0,0,1,0,0,0,1,0,0,1
1,4,30000,0,22,81,0,24001,1,0,0,...,0,0,1,0,0,0,1,0,0,1
2,4,30000,0,17,139,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
3,2,30000,0,0,1118,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
4,4,50000,0,354,396,95,21001,4,0,0,...,0,1,0,1,1,0,0,0,0,1


In [9]:
def model_performance(y_test, y_pred):    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred).round(3)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label=1).round(3)))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label=1).round(3)))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label=1).round(3)))

## 연체자 data duplicate

In [19]:
cust = x_data[x_data['TARGET'] == 0]
cust_overdue = x_data[x_data['TARGET'] == 1]     # 연체자

print(cust.shape[0])
print(cust_overdue.shape[0])

95946
4287


In [27]:
o_data = pd.DataFrame()

for i in range(20):
    # 연체자 수를 점차 늘려간다.
    o_data = pd.concat([o_data, cust_overdue])
    
    x_data = pd.concat([cust, o_data])
    x = x_data.drop('TARGET', axis=1)
    y = x_data['TARGET']
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
      
    print('---------------------------------------- ', i)
    print('Total {} : overdue           {}'.format(x_data.shape[0], x_data[x_data['TARGET'] == 1].shape[0]))
    print('test  {} : overdue in test   {}'.format(x_test.shape[0], len(y_test[y_test == 1])))
    print('')
    
    tree = DecisionTreeClassifier()
    tree.fit(x_train, y_train)
    y_pred = tree.predict(x_test)
    model_performance(y_test, y_pred)

----------------------------------------  0
Total 100233 : overdue           4287
test  20047 : overdue in test   883

confusion matrix
[[18498   666]
 [  636   247]]
accuracy : 0.935
precision : 0.271
recall : 0.28
F1 : 0.275
----------------------------------------  1
Total 104520 : overdue           8574
test  20904 : overdue in test   1730

confusion matrix
[[18382   792]
 [  256  1474]]
accuracy : 0.95
precision : 0.65
recall : 0.852
F1 : 0.738
----------------------------------------  2
Total 108807 : overdue           12861
test  21762 : overdue in test   2545

confusion matrix
[[18450   767]
 [   84  2461]]
accuracy : 0.961
precision : 0.762
recall : 0.967
F1 : 0.853
----------------------------------------  3
Total 113094 : overdue           17148
test  22619 : overdue in test   3448

confusion matrix
[[18383   788]
 [   28  3420]]
accuracy : 0.964
precision : 0.813
recall : 0.992
F1 : 0.893
----------------------------------------  4
Total 117381 : overdue           21435
tes