In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# cust_org = pd.read_csv('Dataset.csv')
# cust_org = pd.read_csv('data_kimdahye.csv', encoding='utf-8')
cust_org = pd.read_csv('data_kk.csv', encoding='utf-8')
# cust_org = pd.read_csv('data_0915.csv', encoding='utf-8')
cust = cust_org
cust.shape

(100233, 70)

In [3]:
cust.loc[pd.isnull(cust.LAST_CHLD_AGE), 'LAST_CHLD_AGE'] = 0
cust.loc[cust['OCCP_NAME_G'] == '*', 'OCCP_NAME_G'] = '기타'
cust.loc[cust['MATE_OCCP_NAME_G'] == '*', 'MATE_OCCP_NAME_G'] = '기타'
# cust.loc[cust['AGE'] == '*', 'AGE'] = 20
cust.loc[cust['SEX'] == '*', 'SEX'] = 0

### 대출건수 통합

In [4]:
loan_cnt = pd.Series(cust[['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'SPART_LNIF_CNT', 'ECT_LNIF_CNT']].sum(axis=1), name='LNIF_CNT')
cust = pd.concat([cust, loan_cnt], axis=1)
cust = cust.drop(['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'SPART_LNIF_CNT', 'ECT_LNIF_CNT'], axis=1)
cust.head()

Unnamed: 0,﻿CUST_ID,TARGET,TOT_LNIF_AMT,TOT_CLIF_AMT,BNK_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,...,TEL_CNTT_QTR,NUM_DAY_SUSP,CRMM_OVDU_AMT,TLFE_UNPD_CNT,LT1Y_MXOD_AMT,PAYM_METD,MOBL_PRIN,NEW_PAYM_METD,SUM_MDIF,LNIF_CNT
0,1,0,9001,9001,9001,0,1,0,2,13,...,20111,0,0,0,0,O,580000,0,1.0,1
1,2,0,24001,0,24001,0,0,0,2,121,...,20143,0,0,0,0,O,90000,0,0.0,1
2,3,0,15001,9001,0,3001,1,25,4,121,...,20103,0,0,0,0,O,120000,0,28.5,6
3,4,1,6001,3001,0,3001,1,25,4,61,...,20144,0,540000,0,630000,G,320000,1,28.5,8
4,5,0,21001,15001,21001,0,1,0,1,97,...,20131,0,130000,0,90000,G,410000,1,1.0,4


### 대출금액 통합

In [5]:
# 카드캐피탈대출 금액이 신용+은행대출 금액보다 많은 경우
# 대출총액, 신용대출, 은행대출, 카드캐피탈대출
CPT_LNIF_BIG = pd.Series((cust['CPT_LNIF_AMT'] > cust['TOT_CLIF_AMT'] + cust['BNK_LNIF_AMT']) & (cust['CPT_LNIF_AMT'] > 1),  name='CPT_LNIF_BIG')
CPT_LNIF_RATIO = pd.Series(pd.Series.round(cust['CPT_LNIF_AMT'] / cust['TOT_LNIF_AMT'], 3),  name='CPT_LNIF_RATIO')
cust = pd.concat([cust, CPT_LNIF_BIG, CPT_LNIF_RATIO], axis=1)
cust = cust.drop(['TOT_CLIF_AMT', 'BNK_LNIF_AMT'], axis=1)
cust.head()

Unnamed: 0,﻿CUST_ID,TARGET,TOT_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,CB_GUIF_CNT,CB_GUIF_AMT,...,CRMM_OVDU_AMT,TLFE_UNPD_CNT,LT1Y_MXOD_AMT,PAYM_METD,MOBL_PRIN,NEW_PAYM_METD,SUM_MDIF,LNIF_CNT,CPT_LNIF_BIG,CPT_LNIF_RATIO
0,1,0,9001,0,1,0,2,13,3,420001,...,0,0,0,O,580000,0,1.0,1,False,0.0
1,2,0,24001,0,0,0,2,121,0,0,...,0,0,0,O,90000,0,0.0,1,False,0.0
2,3,0,15001,3001,1,25,4,121,0,0,...,0,0,0,O,120000,0,28.5,6,False,0.2
3,4,1,6001,3001,1,25,4,61,0,0,...,540000,0,630000,G,320000,1,28.5,8,False,0.5
4,5,0,21001,0,1,0,1,97,0,0,...,130000,0,90000,G,410000,1,1.0,4,False,0.0


### DTI : 대출금액/추정소득

In [6]:
dti = pd.Series(np.round(cust['TOT_LNIF_AMT'] / cust['CUST_JOB_INCM'], 3))
cust['DTI'] = dti
cust = cust.replace(np.inf, np.nan)
cust = cust.fillna(value=0) 
cust.head()

Unnamed: 0,﻿CUST_ID,TARGET,TOT_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,CB_GUIF_CNT,CB_GUIF_AMT,...,TLFE_UNPD_CNT,LT1Y_MXOD_AMT,PAYM_METD,MOBL_PRIN,NEW_PAYM_METD,SUM_MDIF,LNIF_CNT,CPT_LNIF_BIG,CPT_LNIF_RATIO,DTI
0,1,0,9001,0,1,0,2,13,3,420001,...,0,0,O,580000,0,1.0,1,False,0.0,1.667
1,2,0,24001,0,0,0,2,121,0,0,...,0,0,O,90000,0,0.0,1,False,0.0,4.364
2,3,0,15001,3001,1,25,4,121,0,0,...,0,0,O,120000,0,28.5,6,False,0.2,0.0
3,4,1,6001,3001,1,25,4,61,0,0,...,0,630000,G,320000,1,28.5,8,False,0.5,0.0
4,5,0,21001,0,1,0,1,97,0,0,...,0,90000,G,410000,1,1.0,4,False,0.0,4.375


In [7]:
# 연속형변수
conti_var = cust.columns[cust.dtypes != 'object'].difference(['CUST_ID'])
print(conti_var)
print(len(conti_var))

Index(['ACTL_FMLY_NUM', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT', 'CB_GUIF_CNT',
       'CNTT_LAMT_CNT', 'CPT_LNIF_AMT', 'CPT_LNIF_BIG', 'CPT_LNIF_RATIO',
       'CRDT_CARD_CNT', 'CRDT_LOAN_CNT', 'CRDT_OCCR_MDIF', 'CRLN_30OVDU_RATE',
       'CRLN_OVDU_RATE', 'CRMM_OVDU_AMT', 'CTCD_OCCR_MDIF', 'CUST_FMLY_NUM',
       'CUST_JOB_INCM', 'DTI', 'FMLY_CLAM_CNT', 'FMLY_GDINS_MNPREM',
       'FMLY_PLPY_CNT', 'FMLY_SVINS_MNPREM', 'FMLY_TOT_PREM', 'FYCM_PAID_AMT',
       'GDINS_MON_PREM', 'HSHD_INFR_INCM', 'LAST_CHLD_AGE', 'LNIF_CNT',
       'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT', 'LT1Y_MXOD_AMT', 'LT1Y_SLOD_RATE',
       'LT1Y_STLN_AMT', 'LTST_CRDT_GRAD', 'MATE_JOB_INCM', 'MAX_MON_PREM',
       'MIN_CNTT_DATE', 'MOBL_FATY_PRC', 'MOBL_PRIN', 'MON_TLFE_AMT',
       'NEW_PAYM_METD', 'NUM_DAY_SUSP', 'PREM_OVDU_RATE', 'SPTCT_OCCR_MDIF',
       'STLN_REMN_AMT', 'STRT_CRDT_GRAD', 'SUM_MDIF', 'SVINS_MON_PREM',
       'TARGET', 'TEL_CNTT_QTR', 'TLFE_UN

In [8]:
# 범주형 변수
cate_var = cust.columns[cust.dtypes == 'object']
cate_var

Index(['OCCP_NAME_G', 'MATE_OCCP_NAME_G', 'LT1Y_PEOD_RATE', 'AGE', 'SEX',
       'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD'],
      dtype='object')

In [9]:
# 범주형 변수를 dummy 변수로 변환
dummy_var = pd.get_dummies(cust[cate_var])
dummy_var.head()

Unnamed: 0,OCCP_NAME_G_0,OCCP_NAME_G_1차산업 종사자,OCCP_NAME_G_2차산업 종사자,OCCP_NAME_G_3차산업 종사자,OCCP_NAME_G_고소득 전문직,OCCP_NAME_G_공무원,OCCP_NAME_G_기업/단체 임원,OCCP_NAME_G_기타,OCCP_NAME_G_단순 노무직,OCCP_NAME_G_단순 사무직,...,TEL_MBSP_GRAD_Q,TEL_MBSP_GRAD_R,TEL_MBSP_GRAD_W,CBPT_MBSP_YN_N,CBPT_MBSP_YN_Y,PAYM_METD_0,PAYM_METD_G,PAYM_METD_K,PAYM_METD_O,PAYM_METD_R
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [10]:
x_data = pd.concat([cust[conti_var], dummy_var], axis=1)
x_data.head()

Unnamed: 0,ACTL_FMLY_NUM,ARPU,AUTR_FAIL_MCNT,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,CB_GUIF_AMT,CB_GUIF_CNT,CNTT_LAMT_CNT,CPT_LNIF_AMT,...,TEL_MBSP_GRAD_Q,TEL_MBSP_GRAD_R,TEL_MBSP_GRAD_W,CBPT_MBSP_YN_N,CBPT_MBSP_YN_Y,PAYM_METD_0,PAYM_METD_G,PAYM_METD_K,PAYM_METD_O,PAYM_METD_R
0,4,30000,10,493,450,0,420001,3,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4,30000,0,22,81,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4,30000,0,17,139,0,0,0,0,3001,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2,30000,0,0,1118,0,0,0,0,3001,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,50000,0,354,396,95,0,0,0,0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [11]:
def model_performance(y_test, y_pred):    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred).round(3)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label=1).round(3)))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label=1).round(3)))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label=1).round(3)))

## 연체자 data duplicate

In [12]:
cust = x_data[x_data['TARGET'] == 0]
cust_overdue = x_data[x_data['TARGET'] == 1]     # 연체자

print(cust.shape[0])
print(cust_overdue.shape[0])

95946
4287


In [13]:
x_data2 = x_data.drop('TARGET', 1)
variables = x_data2.columns
variables

Index(['ACTL_FMLY_NUM', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT', 'CB_GUIF_CNT',
       'CNTT_LAMT_CNT', 'CPT_LNIF_AMT',
       ...
       'TEL_MBSP_GRAD_Q', 'TEL_MBSP_GRAD_R', 'TEL_MBSP_GRAD_W',
       'CBPT_MBSP_YN_N', 'CBPT_MBSP_YN_Y', 'PAYM_METD_0', 'PAYM_METD_G',
       'PAYM_METD_K', 'PAYM_METD_O', 'PAYM_METD_R'],
      dtype='object', length=129)

In [None]:
o_data = pd.DataFrame()

for i in range(10):
    # 연체자 수를 점차 늘려간다.
    o_data = pd.concat([o_data, cust_overdue])
    
    x_data = pd.concat([cust, o_data])
    x = x_data.drop('TARGET', axis=1)
    y = x_data['TARGET']
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
      
    print('---------------------------------------- ', i)
    print('Total {} : overdue           {}'.format(x_data.shape[0], x_data[x_data['TARGET'] == 1].shape[0]))
    print('test  {} : overdue in test   {}'.format(x_test.shape[0], len(y_test[y_test == 1])))
    print('')
    
    tree = DecisionTreeClassifier()
    tree.fit(x_train, y_train)
    y_pred = tree.predict(x_test)
    model_performance(y_test, y_pred)
    
    varDic = {'var':variables, 'importance':tree.feature_importances_}
    importance = pd.DataFrame(varDic)
    print(importance.sort_values(by='importance', ascending=False).head(10))

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

In [34]:
gb = GradientBoostingClassifier(n_estimators=500, random_state=0)
gb.fit(x_train, y_train)
y_pred = gb.predict(x_test)
model_performance(y_test, y_pred)

confusion matrix
[[19064   100]
 [  661   222]]
accuracy : 0.962
precision : 0.689
recall : 0.251
F1 : 0.368


In [15]:
from sklearn.svm import SVC

In [None]:
clf = SVC(kernel='linear')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(y_test, y_pred)