In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the customer table from the CSV file, then show its (rows, columns) shape.
cust = pd.read_csv(filepath_or_buffer='Dataset.csv')
cust.shape

(100233, 69)

In [4]:
# Preview the first five rows of the loaded table.
cust.head(n=5)

Unnamed: 0,CUST_ID,TARGET,BNK_LNIF_CNT,CPT_LNIF_CNT,SPART_LNIF_CNT,ECT_LNIF_CNT,TOT_LNIF_AMT,TOT_CLIF_AMT,BNK_LNIF_AMT,CPT_LNIF_AMT,...,CBPT_MBSP_YN,MOBL_FATY_PRC,TEL_CNTT_QTR,NUM_DAY_SUSP,CRMM_OVDU_AMT,TLFE_UNPD_CNT,LT1Y_MXOD_AMT,PAYM_METD,LINE_STUS,MOBL_PRIN
0,1,0,1,0,0,0,9001,9001,9001,0,...,N,800000,20111,0,0,0,0,O,U,580000
1,2,0,1,0,0,0,24001,0,24001,0,...,N,500000,20143,0,0,0,0,O,U,90000
2,3,0,0,1,3,2,15001,9001,0,3001,...,Y,500000,20103,0,0,0,0,O,U,120000
3,4,1,0,2,4,2,6001,3001,0,3001,...,N,900000,20144,0,540000,0,630000,G,S,320000
4,5,0,4,0,0,0,21001,15001,21001,0,...,Y,800000,20131,0,130000,0,90000,G,U,410000


In [5]:
# Missing LAST_CHLD_AGE entries are overwritten with 0.
child_age_missing = cust['LAST_CHLD_AGE'].isna()
cust.loc[child_age_missing, 'LAST_CHLD_AGE'] = 0

In [6]:
# Swap the '*' placeholder for a fixed stand-in value, column by column.
# NOTE(review): AGE and SEX are matched against the string '*' but receive
# integers, so those columns may end up holding mixed str/int values while
# still being object dtype — confirm this is intended before encoding them.
star_replacements = {
    'OCCP_NAME_G': '기타',
    'MATE_OCCP_NAME_G': '기타',
    'AGE': 20,
    'SEX': 0,
}
for star_col, stand_in in star_replacements.items():
    cust.loc[cust[star_col] == '*', star_col] = stand_in

In [7]:
# Continuous variables: every non-object column except the label (TARGET),
# the customer identifier (CUST_ID) and 'Unnamed: 0'.
# 'Unnamed: 0' is the row counter pandas created from the CSV's unnamed first
# column; it is numeric, so without this exclusion the row number would be
# fed to the models as a feature.
non_feature_cols = ['TARGET', 'CUST_ID', 'Unnamed: 0']
conti_var = cust.columns[cust.dtypes != 'object'].difference(non_feature_cols)
print(conti_var)
print(len(conti_var))

Index(['ACTL_FMLY_NUM', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'BNK_LNIF_AMT', 'BNK_LNIF_CNT',
       'CB_GUIF_AMT', 'CB_GUIF_CNT', 'CNTT_LAMT_CNT', 'CPT_LNIF_AMT',
       'CPT_LNIF_CNT', 'CRDT_CARD_CNT', 'CRDT_LOAN_CNT', 'CRDT_OCCR_MDIF',
       'CRLN_30OVDU_RATE', 'CRLN_OVDU_RATE', 'CRMM_OVDU_AMT', 'CTCD_OCCR_MDIF',
       'CUST_FMLY_NUM', 'CUST_JOB_INCM', 'ECT_LNIF_CNT', 'FMLY_CLAM_CNT',
       'FMLY_GDINS_MNPREM', 'FMLY_PLPY_CNT', 'FMLY_SVINS_MNPREM',
       'FMLY_TOT_PREM', 'FYCM_PAID_AMT', 'GDINS_MON_PREM', 'HSHD_INFR_INCM',
       'LAST_CHLD_AGE', 'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT', 'LT1Y_MXOD_AMT',
       'LT1Y_SLOD_RATE', 'LT1Y_STLN_AMT', 'LTST_CRDT_GRAD', 'MATE_JOB_INCM',
       'MAX_MON_PREM', 'MIN_CNTT_DATE', 'MOBL_FATY_PRC', 'MOBL_PRIN',
       'MON_TLFE_AMT', 'NUM_DAY_SUSP', 'PREM_OVDU_RATE', 'SPART_LNIF_CNT',
       'SPTCT_OCCR_MDIF', 'STLN_REMN_AMT', 'STRT_CRDT_GRAD', 'SVINS_MON_PREM',
       'TEL_CNTT_QTR', 'TLFE_UNPD_CNT', '

In [8]:
# Categorical variables: the columns whose dtype is object.
is_object_col = cust.dtypes == 'object'
cate_var = cust.columns[is_object_col]
cate_var

Index(['OCCP_NAME_G', 'MATE_OCCP_NAME_G', 'LT1Y_PEOD_RATE', 'AGE', 'SEX',
       'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD', 'LINE_STUS'],
      dtype='object')

In [9]:
# Count customer ids per OCCP_NAME_G value.
cust_temp = cust.loc[:, ['CUST_ID', 'OCCP_NAME_G']]
cust_temp.groupby('OCCP_NAME_G').count()

Unnamed: 0_level_0,CUST_ID
OCCP_NAME_G,Unnamed: 1_level_1
1차산업 종사자,1178
2차산업 종사자,9601
3차산업 종사자,8275
고소득 전문직,1223
공무원,5091
기업/단체 임원,1041
기타,2861
단순 노무직,821
단순 사무직,4107
사무직,16581


In [10]:
# Convert the categorical variables into dummy (indicator) variables.
categorical_part = cust[cate_var]
dummy_var = pd.get_dummies(categorical_part)

In [11]:
# Feature matrix: continuous columns placed side by side with the dummy
# columns. Label vector: the TARGET column.
feature_parts = [cust[conti_var], dummy_var]
x = pd.concat(feature_parts, axis='columns')
y = cust['TARGET']

In [12]:
# Shape of the subset of rows where TARGET equals 1.
cust.loc[cust['TARGET'].eq(1)].shape

(4287, 69)

In [13]:
# Shape of the subset of rows where TARGET equals 0.
cust.loc[cust['TARGET'].eq(0)].shape

(95946, 69)

In [14]:
def model_performance(y_test, y_pred):
    """Print the confusion matrix and summary scores for a set of predictions.

    Prints the confusion matrix followed by accuracy, precision, recall and
    F1, each rounded to three decimals. Precision, recall and F1 are computed
    with label 1 as the positive class.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))

    scores = (
        ('accuracy', metrics.accuracy_score(y_test, y_pred)),
        ('precision', metrics.precision_score(y_test, y_pred, pos_label=1)),
        ('recall', metrics.recall_score(y_test, y_pred, pos_label=1)),
        ('F1', metrics.f1_score(y_test, y_pred, pos_label=1)),
    )
    for label, value in scores:
        # The scorers are documented as returning ``float``. A plain Python
        # float has no ``.round()`` method, whereas the built-in round()
        # accepts both Python floats and NumPy scalars.
        print('{} : {}'.format(label, round(float(value), 3)))

In [15]:
# Hold out 20% of the rows as the test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
x_train.shape

(80186, 126)

In [16]:
# Single decision tree with default hyper-parameters.
# random_state is pinned so that re-running the cell rebuilds the same tree
# and prints the same metrics, in line with the seeded train/test split and
# the seeded random forest in this notebook.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)
y_pred = tree.predict(x_test)
model_performance(y_test, y_pred)

confusion matrix
[[18510   689]
 [  603   245]]
accuracy : 0.936
precision : 0.262
recall : 0.289
F1 : 0.275


In [18]:
# Random forest of 100 trees with a fixed seed; fit() returns the estimator
# itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=0, n_estimators=100).fit(x_train, y_train)
y_pred = rf.predict(x_test)
model_performance(y_test=y_test, y_pred=y_pred)

confusion matrix
[[19182    17]
 [  792    56]]
accuracy : 0.96
precision : 0.767
recall : 0.066
F1 : 0.122
