In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [67]:
# Load the raw customer table and display its (rows, columns) shape.
RAW_DATA_PATH = 'data_0915_u.csv'
cust = pd.read_csv(RAW_DATA_PATH, encoding='utf-8')
cust.shape

(100233, 79)

In [68]:
# Replace missing values and '*' placeholders before building features.

# A missing LAST_CHLD_AGE is recorded as 0.
cust.loc[cust['LAST_CHLD_AGE'].isnull(), 'LAST_CHLD_AGE'] = 0

# '*' in the customer's occupation group and a missing spouse occupation group
# are both folded into the existing '기타' label.
cust.loc[cust['OCCP_NAME_G'] == '*', 'OCCP_NAME_G'] = '기타'
cust.loc[cust['MATE_OCCP_NAME_G'].isnull(), 'MATE_OCCP_NAME_G'] = '기타'

# SEX is an object (string) column, so the placeholder is replaced with the
# string '0' rather than the integer 0. This keeps the column single-typed;
# the resulting dummy column is still named SEX_0.
cust.loc[cust['SEX'] == '*', 'SEX'] = '0'

# The original notebook also had '*' replacements for MATE_OCCP_NAME_G and AGE,
# but both were commented out; they remain disabled here.

### 대출건수 통합

In [69]:
# Sum the four *_LNIF_CNT columns row-wise into a single LNIF_CNT column,
# then replace the originals with that total (appended as the last column).
lnif_cnt_cols = ['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'SPART_LNIF_CNT', 'ECT_LNIF_CNT']
loan_cnt = cust[lnif_cnt_cols].sum(axis=1).rename('LNIF_CNT')
cust = pd.concat([cust.drop(columns=lnif_cnt_cols), loan_cnt], axis=1)
cust.head()

Unnamed: 0,CUST_ID,TARGET,TOT_LNIF_AMT,TOT_CLIF_AMT,BNK_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,...,LOW_AMT_RATE,L_H_RATE,CRDT_CARD,CRDT_GRAD_DIFF,TOTAL_DELAY_RATE,TEL_OVDU_RATE,OVDU_HIGH_RATE,FAIL_COUNT,TOT_LOAN_CNT,LNIF_CNT
0,1,0,9001,9001,9001,0,1,0,2,13,...,0.0,,26,0,12,0.0,0.0,10,-0.4,1
1,2,0,24001,0,24001,0,0,0,2,121,...,0.0,,242,0,13,0.0,0.0,0,-0.4,1
2,3,0,15001,9001,0,3001,1,25,4,121,...,0.200053,0.000333,484,0,2,0.0,0.0,0,1.39,6
3,4,1,6001,3001,0,3001,1,25,4,61,...,0.500083,0.000666,244,0,4,6.75,7.875,0,1.8,8
4,5,0,21001,15001,21001,0,1,0,1,97,...,0.0,,97,0,0,1.625,1.125,0,-1.6,4


### 대출금액 통합

In [70]:
# Flag the case where the card/capital loan amount exceeds the credit + bank loan amounts.
# Columns involved: total loans, credit loans, bank loans, card/capital loans.
cpt_amt = cust['CPT_LNIF_AMT']
exceeds_credit_plus_bank = cpt_amt > cust['TOT_CLIF_AMT'] + cust['BNK_LNIF_AMT']
CPT_LNIF_BIG = (exceeds_credit_plus_bank & (cpt_amt > 1)).rename('CPT_LNIF_BIG')

# Share of the total loan amount that is card/capital, rounded to 3 decimals.
CPT_LNIF_RATIO = (cpt_amt / cust['TOT_LNIF_AMT']).round(3).rename('CPT_LNIF_RATIO')

# Append both derived columns, then drop the two source amounts no longer needed.
cust = pd.concat([cust, CPT_LNIF_BIG, CPT_LNIF_RATIO], axis=1).drop(
    ['TOT_CLIF_AMT', 'BNK_LNIF_AMT'], axis=1
)
cust.head()

Unnamed: 0,CUST_ID,TARGET,TOT_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,CB_GUIF_CNT,CB_GUIF_AMT,...,CRDT_CARD,CRDT_GRAD_DIFF,TOTAL_DELAY_RATE,TEL_OVDU_RATE,OVDU_HIGH_RATE,FAIL_COUNT,TOT_LOAN_CNT,LNIF_CNT,CPT_LNIF_BIG,CPT_LNIF_RATIO
0,1,0,9001,0,1,0,2,13,3,420001,...,26,0,12,0.0,0.0,10,-0.4,1,False,0.0
1,2,0,24001,0,0,0,2,121,0,0,...,242,0,13,0.0,0.0,0,-0.4,1,False,0.0
2,3,0,15001,3001,1,25,4,121,0,0,...,484,0,2,0.0,0.0,0,1.39,6,False,0.2
3,4,1,6001,3001,1,25,4,61,0,0,...,244,0,4,6.75,7.875,0,1.8,8,False,0.5
4,5,0,21001,0,1,0,1,97,0,0,...,97,0,0,1.625,1.125,0,-1.6,4,False,0.0


### DTI : 대출금액/추정소득

In [71]:
# DTI = total loan amount / estimated job income, rounded to 3 decimals.
dti = cust['TOT_LNIF_AMT'].div(cust['CUST_JOB_INCM']).round(3)

# Append DTI, then turn +inf into NaN and fill every NaN with 0.
# Note: replace/fillna run on the whole frame, so NaN in any other column
# (not only DTI) is also set to 0 here.
cust = cust.assign(DTI=dti).replace(np.inf, np.nan).fillna(value=0)
cust.head()

Unnamed: 0,CUST_ID,TARGET,TOT_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,CB_GUIF_CNT,CB_GUIF_AMT,...,CRDT_GRAD_DIFF,TOTAL_DELAY_RATE,TEL_OVDU_RATE,OVDU_HIGH_RATE,FAIL_COUNT,TOT_LOAN_CNT,LNIF_CNT,CPT_LNIF_BIG,CPT_LNIF_RATIO,DTI
0,1,0,9001,0,1,0,2,13,3,420001,...,0,12,0.0,0.0,10,-0.4,1,False,0.0,1.667
1,2,0,24001,0,0,0,2,121,0,0,...,0,13,0.0,0.0,0,-0.4,1,False,0.0,4.364
2,3,0,15001,3001,1,25,4,121,0,0,...,0,2,0.0,0.0,0,1.39,6,False,0.2,0.0
3,4,1,6001,3001,1,25,4,61,0,0,...,0,4,6.75,7.875,0,1.8,8,False,0.5,0.0
4,5,0,21001,0,1,0,1,97,0,0,...,0,0,1.625,1.125,0,-1.6,4,False,0.0,4.375


In [72]:
# Continuous variables: every column whose dtype is not object, minus CUST_ID.
# Selection is by dtype only, so boolean columns such as CPT_LNIF_BIG are included.
is_object_col = cust.dtypes == 'object'
conti_var = cust.columns[~is_object_col].difference(['CUST_ID'])
for summary in (conti_var, len(conti_var)):
    print(summary)

Index(['ACTL_FMLY_NUM', 'AGE', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT', 'CB_GUIF_CNT',
       'CNTT_LAMT_CNT', 'CPT_LNIF_AMT', 'CPT_LNIF_BIG', 'CPT_LNIF_RATIO',
       'CRDT_CARD', 'CRDT_CARD_CNT', 'CRDT_GRAD_DIFF', 'CRDT_LOAN_CNT',
       'CRDT_OCCR_MDIF', 'CRLN_30OVDU_RATE', 'CRLN_OVDU_RATE', 'CRMM_OVDU_AMT',
       'CTCD_OCCR_MDIF', 'CUST_FMLY_NUM', 'CUST_JOB_INCM', 'DTI', 'FAIL_COUNT',
       'FMLY_CLAM_CNT', 'FMLY_GDINS_MNPREM', 'FMLY_PLPY_CNT',
       'FMLY_SVINS_MNPREM', 'FMLY_TOT_PREM', 'FYCM_PAID_AMT', 'GDINS_MON_PREM',
       'HIGH_AMT_RATE', 'HSHD_INFR_INCM', 'LAST_CHLD_AGE', 'LNIF_CNT',
       'LOW_AMT_RATE', 'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT', 'LT1Y_MXOD_AMT',
       'LT1Y_SLOD_RATE', 'LT1Y_STLN_AMT', 'LTST_CRDT_GRAD', 'L_H_RATE',
       'MATE_JOB_INCM', 'MAX_MON_PREM', 'MIN_CNTT_DATE', 'MOBL_FATY_PRC',
       'MOBL_PRIN', 'MON_TLFE_AMT', 'NUM_DAY_SUSP', 'OVDU_HIGH_RATE',
       'PREM_OVDU_RATE', 'SPTCT_OCCR_MDIF', '

In [73]:
# Categorical variables: the columns stored with object dtype.
col_dtypes = cust.dtypes
cate_var = col_dtypes[col_dtypes == 'object'].index
cate_var

Index(['OCCP_NAME_G', 'MATE_OCCP_NAME_G', 'LT1Y_PEOD_RATE', 'SEX',
       'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD', 'LINE_STUS'],
      dtype='object')

### 범주형 변수 : category로 변환

In [74]:
# Count customers per spouse-occupation label to see which categories exist.
cust_temp = cust.loc[:, ['CUST_ID', 'MATE_OCCP_NAME_G']]
cust_temp.groupby('MATE_OCCP_NAME_G').count()

Unnamed: 0_level_0,CUST_ID
MATE_OCCP_NAME_G,Unnamed: 1_level_1
1차산업 종사자,1411
2차산업 종사자,8107
3차산업 종사자,4011
고소득 전문직,560
공무원,2400
기업/단체 임원,931
기타,47270
단순 노무직,549
단순 사무직,1316
사무직,10053


In [75]:
# Short labels for the LT1Y_PEOD_RATE categories (e.g. '20미만' -> '20').
# They are applied positionally, in the order of the categories that
# astype("category") derives from the column, so the list length must equal
# the number of distinct values; otherwise rename_categories raises ValueError.
peod_rate_labels = ["0","10","20","30","40","50","60","90","91"]

# Use rename_categories instead of assigning to `.cat.categories`:
# the setter was deprecated and is removed in pandas 2.0.
col01 = (
    cust['LT1Y_PEOD_RATE']
    .astype("category")
    .cat.rename_categories(peod_rate_labels)
    .rename('LT1Y_PEOD_RATE_C')
)
cust = pd.concat([cust, col01], axis=1)

# Side-by-side check of the original label and its recoded category.
cust[['LT1Y_PEOD_RATE','LT1Y_PEOD_RATE_C']].head(10)

Unnamed: 0,LT1Y_PEOD_RATE,LT1Y_PEOD_RATE_C
0,20미만,20
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,10미만,10
7,0,0
8,10미만,10
9,10미만,10


In [76]:
# Two-digit codes for the OCCP_NAME_G categories. They are applied
# positionally, in the order of the categories that astype("category")
# derives from the column, so exactly 16 distinct labels are expected;
# rename_categories raises ValueError on a length mismatch.
occp_codes = ["01","02","03","04","05","06","07","08","09","10","11","12","13","14","15","16"]

# Use rename_categories instead of assigning to `.cat.categories`:
# the setter was deprecated and is removed in pandas 2.0.
col02 = (
    cust['OCCP_NAME_G']
    .astype("category")
    .cat.rename_categories(occp_codes)
    .rename('OCCP_NAME_C')
)
cust = pd.concat([cust, col02], axis=1)

# Side-by-side check of the original label and its code.
cust[['OCCP_NAME_G','OCCP_NAME_C']].head(10)

Unnamed: 0,OCCP_NAME_G,OCCP_NAME_C
0,공무원,5
1,자영업,13
2,주부,15
3,학생,16
4,공무원,5
5,3차산업 종사자,3
6,주부,15
7,기업/단체 임원,6
8,주부,15
9,단순 사무직,9


In [77]:
# Two-digit codes for the MATE_OCCP_NAME_G categories. They are applied
# positionally, in the order of the categories that astype("category")
# derives from the column, so exactly 16 distinct labels are expected;
# rename_categories raises ValueError on a length mismatch.
mate_occp_codes = ["01","02","03","04","05","06","07","08","09","10","11","12","13","14","15","16"]

# Use rename_categories instead of assigning to `.cat.categories`:
# the setter was deprecated and is removed in pandas 2.0.
col03 = (
    cust['MATE_OCCP_NAME_G']
    .astype("category")
    .cat.rename_categories(mate_occp_codes)
    .rename('MATE_OCCP_NAME_C')
)
cust = pd.concat([cust, col03], axis=1)

# Side-by-side check of the original label and its code.
cust[['MATE_OCCP_NAME_G','MATE_OCCP_NAME_C']].head(10)

Unnamed: 0,MATE_OCCP_NAME_G,MATE_OCCP_NAME_C
0,주부,15
1,주부,15
2,2차산업 종사자,2
3,기타,7
4,주부,15
5,단순 사무직,9
6,2차산업 종사자,2
7,사무직,10
8,전문직,14
9,기업/단체 임원,6


In [82]:
# The recoded *_C category columns replace these three original label columns.
cust = cust.drop(columns=['LT1Y_PEOD_RATE', 'OCCP_NAME_G', 'MATE_OCCP_NAME_G'])

In [83]:
# Categorical variables still stored with object dtype (recomputed after the drop).
col_dtypes = cust.dtypes
cate_var = col_dtypes[col_dtypes == 'object'].index
cate_var

Index(['SEX', 'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD', 'LINE_STUS'], dtype='object')

In [86]:
# Columns that carry the pandas category dtype (the recoded *_C columns).
col_dtypes = cust.dtypes
cate_var_c = col_dtypes[col_dtypes == 'category'].index
cate_var_c

Index(['LT1Y_PEOD_RATE_C', 'OCCP_NAME_C', 'MATE_OCCP_NAME_C'], dtype='object')

In [87]:
# Continuous variables: columns that are neither object nor category dtype,
# with CUST_ID removed.
col_dtypes = cust.dtypes
is_categorical_like = (col_dtypes == 'object') | (col_dtypes == 'category')
conti_var = cust.columns[~is_categorical_like].difference(['CUST_ID'])
for summary in (conti_var, len(conti_var)):
    print(summary)

Index(['ACTL_FMLY_NUM', 'AGE', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT', 'CB_GUIF_CNT',
       'CNTT_LAMT_CNT', 'CPT_LNIF_AMT', 'CPT_LNIF_BIG', 'CPT_LNIF_RATIO',
       'CRDT_CARD', 'CRDT_CARD_CNT', 'CRDT_GRAD_DIFF', 'CRDT_LOAN_CNT',
       'CRDT_OCCR_MDIF', 'CRLN_30OVDU_RATE', 'CRLN_OVDU_RATE', 'CRMM_OVDU_AMT',
       'CTCD_OCCR_MDIF', 'CUST_FMLY_NUM', 'CUST_JOB_INCM', 'DTI', 'FAIL_COUNT',
       'FMLY_CLAM_CNT', 'FMLY_GDINS_MNPREM', 'FMLY_PLPY_CNT',
       'FMLY_SVINS_MNPREM', 'FMLY_TOT_PREM', 'FYCM_PAID_AMT', 'GDINS_MON_PREM',
       'HIGH_AMT_RATE', 'HSHD_INFR_INCM', 'LAST_CHLD_AGE', 'LNIF_CNT',
       'LOW_AMT_RATE', 'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT', 'LT1Y_MXOD_AMT',
       'LT1Y_SLOD_RATE', 'LT1Y_STLN_AMT', 'LTST_CRDT_GRAD', 'L_H_RATE',
       'MATE_JOB_INCM', 'MAX_MON_PREM', 'MIN_CNTT_DATE', 'MOBL_FATY_PRC',
       'MOBL_PRIN', 'MON_TLFE_AMT', 'NUM_DAY_SUSP', 'OVDU_HIGH_RATE',
       'PREM_OVDU_RATE', 'SPTCT_OCCR_MDIF', '

In [85]:
# Convert categorical variables to dummy variables (1): the object-dtype columns.
object_features = cust.loc[:, cate_var]
dummy_var = pd.get_dummies(object_features)
dummy_var.head()

Unnamed: 0,SEX_0,SEX_1,SEX_2,TEL_MBSP_GRAD_0,TEL_MBSP_GRAD_E,TEL_MBSP_GRAD_Q,TEL_MBSP_GRAD_R,TEL_MBSP_GRAD_W,CBPT_MBSP_YN_N,CBPT_MBSP_YN_Y,PAYM_METD_0,PAYM_METD_G,PAYM_METD_K,PAYM_METD_O,PAYM_METD_R,LINE_STUS_S,LINE_STUS_U
0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1
1,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1
2,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1
3,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0
4,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1


In [88]:
# Convert categorical variables to dummy variables (2): the category-dtype columns.
category_features = cust.loc[:, cate_var_c]
dummy_var_c = pd.get_dummies(category_features)
dummy_var_c.head()

Unnamed: 0,LT1Y_PEOD_RATE_C_0,LT1Y_PEOD_RATE_C_10,LT1Y_PEOD_RATE_C_20,LT1Y_PEOD_RATE_C_30,LT1Y_PEOD_RATE_C_40,LT1Y_PEOD_RATE_C_50,LT1Y_PEOD_RATE_C_60,LT1Y_PEOD_RATE_C_90,LT1Y_PEOD_RATE_C_91,OCCP_NAME_C_01,...,MATE_OCCP_NAME_C_07,MATE_OCCP_NAME_C_08,MATE_OCCP_NAME_C_09,MATE_OCCP_NAME_C_10,MATE_OCCP_NAME_C_11,MATE_OCCP_NAME_C_12,MATE_OCCP_NAME_C_13,MATE_OCCP_NAME_C_14,MATE_OCCP_NAME_C_15,MATE_OCCP_NAME_C_16
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [89]:
# Assemble the modelling table: continuous columns followed by both dummy blocks.
# NOTE(review): conti_var excludes only CUST_ID, so the numeric TARGET column
# appears to be carried into x_data - confirm it is separated before fitting.
feature_parts = [cust[conti_var], dummy_var, dummy_var_c]
x_data = pd.concat(feature_parts, axis=1)
x_data.head()

Unnamed: 0,ACTL_FMLY_NUM,AGE,ARPU,AUTR_FAIL_MCNT,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,CB_GUIF_AMT,CB_GUIF_CNT,CNTT_LAMT_CNT,...,MATE_OCCP_NAME_C_07,MATE_OCCP_NAME_C_08,MATE_OCCP_NAME_C_09,MATE_OCCP_NAME_C_10,MATE_OCCP_NAME_C_11,MATE_OCCP_NAME_C_12,MATE_OCCP_NAME_C_13,MATE_OCCP_NAME_C_14,MATE_OCCP_NAME_C_15,MATE_OCCP_NAME_C_16
0,4,50,30000,10,493,450,0,420001,3,0,...,0,0,0,0,0,0,0,0,1,0
1,4,50,30000,0,22,81,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,4,60,30000,0,17,139,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,35,30000,0,0,1118,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4,45,50000,0,354,396,95,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [94]:
# Persist the transformed table as UTF-8 CSV without the index column.
TRANSFORMED_DATA_PATH = "data_transform.csv"
x_data.to_csv(TRANSFORMED_DATA_PATH, index=False, encoding='utf-8')

In [95]:
# Read the transformed table back from disk and display its shape.
# This rebinds `cust`: from here on it holds the transformed features,
# not the raw customer table.
TRANSFORMED_DATA_PATH = 'data_transform.csv'
cust = pd.read_csv(TRANSFORMED_DATA_PATH, encoding='utf-8')
cust.shape

(100233, 126)

In [96]:
# Preview the first rows of `cust`, which at this point holds the table
# reloaded from data_transform.csv.
cust.head()

Unnamed: 0,ACTL_FMLY_NUM,AGE,ARPU,AUTR_FAIL_MCNT,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,CB_GUIF_AMT,CB_GUIF_CNT,CNTT_LAMT_CNT,...,MATE_OCCP_NAME_C_07,MATE_OCCP_NAME_C_08,MATE_OCCP_NAME_C_09,MATE_OCCP_NAME_C_10,MATE_OCCP_NAME_C_11,MATE_OCCP_NAME_C_12,MATE_OCCP_NAME_C_13,MATE_OCCP_NAME_C_14,MATE_OCCP_NAME_C_15,MATE_OCCP_NAME_C_16
0,4,50,30000,10,493,450,0,420001,3,0,...,0,0,0,0,0,0,0,0,1,0
1,4,50,30000,0,22,81,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,4,60,30000,0,17,139,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,35,30000,0,0,1118,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4,45,50000,0,354,396,95,0,0,0,...,0,0,0,0,0,0,0,0,1,0
