In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Training set
cust = pd.read_csv('data_0915_u.csv')
cust.shape

(100233, 79)

In [3]:
# Test set
# cust = pd.read_csv('test_0930.csv', encoding='utf-8')
# cust.shape

In [4]:
cust.loc[pd.isnull(cust.LAST_CHLD_AGE), 'LAST_CHLD_AGE'] = 0
cust.loc[cust['OCCP_NAME_G'] == '*', 'OCCP_NAME_G'] = '기타'
cust.loc[pd.isnull(cust.MATE_OCCP_NAME_G), 'MATE_OCCP_NAME_G'] = '기타'
cust.loc[cust['SEX'] == '*', 'SEX'] = 0

### 대출건수 통합

In [5]:
loan_cnt = pd.Series(cust[['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'SPART_LNIF_CNT', 'ECT_LNIF_CNT']].sum(axis=1), name='LNIF_CNT')
cust = pd.concat([cust, loan_cnt], axis=1)
cust = cust.drop(['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'ECT_LNIF_CNT'], axis=1)
# cust = cust.drop(['BNK_LNIF_CNT', 'CPT_LNIF_CNT', 'SPART_LNIF_CNT', 'ECT_LNIF_CNT'], axis=1)
# cust.head()

### 대출금액 통합

In [6]:
# 카드캐피탈대출 금액이 신용+은행대출 금액보다 많은지 여부
CPT_LNIF_BIG = pd.Series((cust['CPT_LNIF_AMT'] > cust['TOT_CLIF_AMT'] + cust['BNK_LNIF_AMT']) & (cust['CPT_LNIF_AMT'] > 1),  name='CPT_LNIF_BIG').astype(int)

# 카드캐피탈대출 금액의 총대출금액 대비 비율
CPT_LNIF_RATIO = pd.Series(pd.Series.round(cust['CPT_LNIF_AMT'] / cust['TOT_LNIF_AMT'], 3),  name='CPT_LNIF_RATIO')

cust = pd.concat([cust, CPT_LNIF_BIG, CPT_LNIF_RATIO], axis=1)
cust = cust.drop(['TOT_CLIF_AMT', 'BNK_LNIF_AMT'], axis=1)
# cust.head()

### DTI : 대출금액/추정소득

In [7]:
dti = pd.Series(np.round(cust['TOT_LNIF_AMT'] / cust['CUST_JOB_INCM'], 3))
cust['DTI'] = dti
cust = cust.replace(np.inf, np.nan)
cust = cust.fillna(value=0) 
# cust.head()

### add

In [8]:
# 최근1년보험료연체율
cust.loc[cust['LT1Y_PEOD_RATE'] == '0', 'LT1Y_PEOD_RATE_YN'] = 0
cust.loc[cust['LT1Y_PEOD_RATE'] != '0', 'LT1Y_PEOD_RATE_YN'] = 1

In [9]:
# 자동이체실패월수
cust.loc[cust['AUTR_FAIL_MCNT'] == 0, 'AUTR_FAIL_YN'] = 0
cust.loc[cust['AUTR_FAIL_MCNT'] != 0, 'AUTR_FAIL_YN'] = 1

In [10]:
# 납부일미준수횟수
cust.loc[cust['TLFE_UNPD_CNT'] == 0, 'TLFE_UNPD_CNT'] = 0
cust.loc[cust['TLFE_UNPD_CNT'] != 0, 'TLFE_UNPD_CNT'] = 1

In [11]:
# 실효해지건수
cust.loc[cust['CNTT_LAMT_CNT'] == 0, 'CNTT_LAMT_YN'] = 0
cust.loc[cust['CNTT_LAMT_CNT'] != 0, 'CNTT_LAMT_YN'] = 1

In [12]:
# 최근1년실효해지건수
cust.loc[cust['LT1Y_CTLT_CNT'] == 0, 'LT1Y_CTLT_YN'] = 0
cust.loc[cust['LT1Y_CTLT_CNT'] != 0, 'LT1Y_CTLT_YN'] = 1

In [13]:
# 정지일수
cust.loc[cust['NUM_DAY_SUSP'] == 0, 'NUM_DAY_SUSP_YN'] = 0
cust.loc[cust['NUM_DAY_SUSP'] != 0, 'NUM_DAY_SUSP_YN'] = 1

In [14]:
# 대출정보 현재 총 건수[2산업분류]
cust.loc[cust['SPART_LNIF_CNT'] < 2, 'SPART_LNIF_YN'] = 0
cust.loc[cust['SPART_LNIF_CNT'] != 0, 'SPART_LNIF_YN'] = 1

In [15]:
# cust = cust.drop(['AUTR_FAIL_MCNT','CNTT_LAMT_CNT','LT1Y_CTLT_CNT','NUM_DAY_SUSP'], axis=1)

### 범주형 변수 : category로 변환

In [16]:
# 범주형 변수
cate_var = cust.columns[cust.dtypes == 'object']
cate_var

Index(['OCCP_NAME_G', 'MATE_OCCP_NAME_G', 'LT1Y_PEOD_RATE', 'SEX',
       'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD', 'LINE_STUS'],
      dtype='object')

In [17]:
cust_temp = cust[['CUST_ID', 'MATE_OCCP_NAME_G']]   # 직종 확인
cust_temp.groupby(['MATE_OCCP_NAME_G']).count()

Unnamed: 0_level_0,CUST_ID
MATE_OCCP_NAME_G,Unnamed: 1_level_1
1차산업 종사자,1411
2차산업 종사자,8107
3차산업 종사자,4011
고소득 전문직,560
공무원,2400
기업/단체 임원,931
기타,47270
단순 노무직,549
단순 사무직,1316
사무직,10053


In [18]:
# 최근1년보험료연체율 카테고리 타입으로 변환
col01 = pd.Series(cust['LT1Y_PEOD_RATE'].astype("category"), name='LT1Y_PEOD_RATE_C')
cust = pd.concat([cust, col01], axis=1)
cust['LT1Y_PEOD_RATE_C'].cat.categories = ["0","10","20","30","40","50","60","90","91"]
cust[['LT1Y_PEOD_RATE','LT1Y_PEOD_RATE_C']].head(10)

Unnamed: 0,LT1Y_PEOD_RATE,LT1Y_PEOD_RATE_C
0,20미만,20
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,10미만,10
7,0,0
8,10미만,10
9,10미만,10


In [19]:
# 직업 - 카테고리 타입으로 변환
col02 = pd.Series(cust['OCCP_NAME_G'].astype("category"), name='OCCP_NAME_C')
cust = pd.concat([cust, col02], axis=1)
cust['OCCP_NAME_C'].cat.categories = ["01","02","03","04","05","06","07","08","09","10","11","12","13","14","15","16"]
cust[['OCCP_NAME_G','OCCP_NAME_C']].head(10)

Unnamed: 0,OCCP_NAME_G,OCCP_NAME_C
0,공무원,5
1,자영업,13
2,주부,15
3,학생,16
4,공무원,5
5,3차산업 종사자,3
6,주부,15
7,기업/단체 임원,6
8,주부,15
9,단순 사무직,9


In [20]:
# 배우자 직업 - 카테고리 타입으로 변환
col03 = pd.Series(cust['MATE_OCCP_NAME_G'].astype("category"), name='MATE_OCCP_NAME_C')
cust = pd.concat([cust, col03], axis=1)
cust['MATE_OCCP_NAME_C'].cat.categories = ["01","02","03","04","05","06","07","08","09","10","11","12","13","14","15","16"]
cust[['MATE_OCCP_NAME_G','MATE_OCCP_NAME_C']].head(10)

Unnamed: 0,MATE_OCCP_NAME_G,MATE_OCCP_NAME_C
0,주부,15
1,주부,15
2,2차산업 종사자,2
3,기타,7
4,주부,15
5,단순 사무직,9
6,2차산업 종사자,2
7,사무직,10
8,전문직,14
9,기업/단체 임원,6


In [21]:
cust = cust.drop(['LT1Y_PEOD_RATE', 'OCCP_NAME_G', 'MATE_OCCP_NAME_G'], axis=1)

### 변수 통합

In [22]:
# object형 변수
cate_var = cust.columns[cust.dtypes == 'object']
cate_var

Index(['SEX', 'TEL_MBSP_GRAD', 'CBPT_MBSP_YN', 'PAYM_METD', 'LINE_STUS'], dtype='object')

In [23]:
# category형 변수
cate_var_c = cust.columns[cust.dtypes == 'category']
cate_var_c

Index(['LT1Y_PEOD_RATE_C', 'OCCP_NAME_C', 'MATE_OCCP_NAME_C'], dtype='object')

In [24]:
# 연속형변수
conti_var = cust.columns[(cust.dtypes != 'object') & (cust.dtypes != 'category')].difference(['CUST_ID'])
print(conti_var)
print(len(conti_var))

Index(['ACTL_FMLY_NUM', 'AGE', 'ARPU', 'AUTR_FAIL_MCNT', 'AUTR_FAIL_YN',
       'AVG_CALL_FREQ', 'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT',
       'CB_GUIF_CNT', 'CNTT_LAMT_CNT', 'CNTT_LAMT_YN', 'CPT_LNIF_AMT',
       'CPT_LNIF_BIG', 'CPT_LNIF_RATIO', 'CRDT_CARD', 'CRDT_CARD_CNT',
       'CRDT_GRAD_DIFF', 'CRDT_LOAN_CNT', 'CRDT_OCCR_MDIF', 'CRLN_30OVDU_RATE',
       'CRLN_OVDU_RATE', 'CRMM_OVDU_AMT', 'CTCD_OCCR_MDIF', 'CUST_FMLY_NUM',
       'CUST_JOB_INCM', 'DTI', 'FAIL_COUNT', 'FMLY_CLAM_CNT',
       'FMLY_GDINS_MNPREM', 'FMLY_PLPY_CNT', 'FMLY_SVINS_MNPREM',
       'FMLY_TOT_PREM', 'FYCM_PAID_AMT', 'GDINS_MON_PREM', 'HIGH_AMT_RATE',
       'HSHD_INFR_INCM', 'LAST_CHLD_AGE', 'LNIF_CNT', 'LOW_AMT_RATE',
       'LT1Y_CLOD_RATE', 'LT1Y_CTLT_CNT', 'LT1Y_CTLT_YN', 'LT1Y_MXOD_AMT',
       'LT1Y_PEOD_RATE_YN', 'LT1Y_SLOD_RATE', 'LT1Y_STLN_AMT',
       'LTST_CRDT_GRAD', 'L_H_RATE', 'MATE_JOB_INCM', 'MAX_MON_PREM',
       'MIN_CNTT_DATE', 'MOBL_FATY_PRC', 'MOBL_PRIN', 'MON_TLFE_AMT',
   

## feature extraction

In [25]:
var_list = conti_var.difference(['TARGET'])
totlen = len(var_list)
print(totlen)

74


In [26]:
for i in range(totlen):
    if i+1 < totlen:
        feature_name = 'dv_{}_{}'.format(i, i+1)
        m_data = list(zip(cust[var_list[i]], cust[var_list[i+1]]))
        if cust[var_list[i]][0] > cust[var_list[i+1]][0] :
            m_values = pd.Series([x/y if y > 0 else x for x, y in m_data])
            print(feature_name, len(m_values))
            cust[feature_name] = m_values
        else:
            m_values = pd.Series([y/x if x > 0 else y for x, y in m_data])
            print(feature_name, len(m_values))
            cust[feature_name] = m_values
            

dv_0_1 100233
dv_1_2 100233
dv_2_3 100233
dv_3_4 100233
dv_4_5 100233
dv_5_6 100233
dv_6_7 100233
dv_7_8 100233
dv_8_9 100233
dv_9_10 100233
dv_10_11 100233
dv_11_12 100233
dv_12_13 100233
dv_13_14 100233
dv_14_15 100233
dv_15_16 100233
dv_16_17 100233
dv_17_18 100233
dv_18_19 100233
dv_19_20 100233
dv_20_21 100233
dv_21_22 100233
dv_22_23 100233
dv_23_24 100233
dv_24_25 100233
dv_25_26 100233
dv_26_27 100233
dv_27_28 100233
dv_28_29 100233
dv_29_30 100233
dv_30_31 100233
dv_31_32 100233
dv_32_33 100233
dv_33_34 100233
dv_34_35 100233
dv_35_36 100233
dv_36_37 100233
dv_37_38 100233
dv_38_39 100233
dv_39_40 100233
dv_40_41 100233
dv_41_42 100233
dv_42_43 100233
dv_43_44 100233
dv_44_45 100233
dv_45_46 100233
dv_46_47 100233
dv_47_48 100233
dv_48_49 100233
dv_49_50 100233
dv_50_51 100233
dv_51_52 100233
dv_52_53 100233
dv_53_54 100233
dv_54_55 100233
dv_55_56 100233
dv_56_57 100233
dv_57_58 100233
dv_58_59 100233
dv_59_60 100233
dv_60_61 100233
dv_61_62 100233
dv_62_63 100233
dv_63_64 10

In [27]:
cust.tail()

Unnamed: 0,CUST_ID,TARGET,SPART_LNIF_CNT,TOT_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,CB_GUIF_CNT,...,dv_63_64,dv_64_65,dv_65_66,dv_66_67,dv_67_68,dv_68_69,dv_69_70,dv_70_71,dv_71_72,dv_72_73
100228,102248,0,2,75001,0,37,109,5,121,0,...,990000.0,0.020314,20111.0,0.0,1.0,1.0,75001.0,129312.068966,100000000.0,58000000.0
100229,102249,0,0,54001,0,85,0,6,121,0,...,0.0,20134.0,20134.0,0.0,5.0,5.0,54001.0,54001.0,37000000.0,37000000.0
100230,102250,0,0,27001,0,1,0,2,121,0,...,590000.0,0.034137,20141.0,0.0,8.0,8.0,27001.0,27001.0,45000000.0,45000000.0
100231,102251,0,1,57001,0,1,1,3,121,0,...,0.0,20103.0,20103.0,0.0,0.0,0.0,57001.0,116328.571429,2040816.0,1000000.0
100232,102252,0,0,18001,0,49,0,4,121,0,...,300000.0,0.06717,20151.0,0.0,0.0,0.0,18001.0,18001.0,22000000.0,22000000.0


In [28]:
# 추가된 연속형변수 확인
conti_var = cust.columns[(cust.dtypes != 'object') & (cust.dtypes != 'category')].difference(['CUST_ID'])
print(conti_var)
print(len(conti_var))

Index(['ACTL_FMLY_NUM', 'AGE', 'ARPU', 'AUTR_FAIL_MCNT', 'AUTR_FAIL_YN',
       'AVG_CALL_FREQ', 'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT',
       'CB_GUIF_CNT',
       ...
       'dv_67_68', 'dv_68_69', 'dv_69_70', 'dv_6_7', 'dv_70_71', 'dv_71_72',
       'dv_72_73', 'dv_7_8', 'dv_8_9', 'dv_9_10'],
      dtype='object', length=148)
148


In [29]:
# object형 변수를 dummy 변수로 변환
dummy_var = pd.get_dummies(cust[cate_var])
dummy_var.head()

Unnamed: 0,SEX_0,SEX_1,SEX_2,TEL_MBSP_GRAD_0,TEL_MBSP_GRAD_E,TEL_MBSP_GRAD_Q,TEL_MBSP_GRAD_R,TEL_MBSP_GRAD_W,CBPT_MBSP_YN_N,CBPT_MBSP_YN_Y,PAYM_METD_0,PAYM_METD_G,PAYM_METD_K,PAYM_METD_O,PAYM_METD_R,LINE_STUS_S,LINE_STUS_U
0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1
1,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1
2,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1
3,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0
4,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1


In [30]:
# category형 변수를 dummy 변수로 변환
dummy_var_c = pd.get_dummies(cust[cate_var_c])
dummy_var_c.head()

Unnamed: 0,LT1Y_PEOD_RATE_C_0,LT1Y_PEOD_RATE_C_10,LT1Y_PEOD_RATE_C_20,LT1Y_PEOD_RATE_C_30,LT1Y_PEOD_RATE_C_40,LT1Y_PEOD_RATE_C_50,LT1Y_PEOD_RATE_C_60,LT1Y_PEOD_RATE_C_90,LT1Y_PEOD_RATE_C_91,OCCP_NAME_C_01,...,MATE_OCCP_NAME_C_07,MATE_OCCP_NAME_C_08,MATE_OCCP_NAME_C_09,MATE_OCCP_NAME_C_10,MATE_OCCP_NAME_C_11,MATE_OCCP_NAME_C_12,MATE_OCCP_NAME_C_13,MATE_OCCP_NAME_C_14,MATE_OCCP_NAME_C_15,MATE_OCCP_NAME_C_16
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [31]:
x_data = pd.concat([cust[conti_var], dummy_var, dummy_var_c], axis=1)
x_data.head()

Unnamed: 0,ACTL_FMLY_NUM,AGE,ARPU,AUTR_FAIL_MCNT,AUTR_FAIL_YN,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,CB_GUIF_AMT,CB_GUIF_CNT,...,MATE_OCCP_NAME_C_07,MATE_OCCP_NAME_C_08,MATE_OCCP_NAME_C_09,MATE_OCCP_NAME_C_10,MATE_OCCP_NAME_C_11,MATE_OCCP_NAME_C_12,MATE_OCCP_NAME_C_13,MATE_OCCP_NAME_C_14,MATE_OCCP_NAME_C_15,MATE_OCCP_NAME_C_16
0,4,50,30000,10,1.0,493,450,0,420001,3,...,0,0,0,0,0,0,0,0,1,0
1,4,50,30000,0,0.0,22,81,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,4,60,30000,0,0.0,17,139,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,35,30000,0,0.0,0,1118,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4,45,50000,0,0.0,354,396,95,0,0,...,0,0,0,0,0,0,0,0,1,0


### 정리된 최종 데이터 저장

In [32]:
# Training set
x_data.to_csv("data_transform.csv", index=False)
cust = pd.read_csv('data_transform.csv')
# x_data.to_csv("data_transform.csv", encoding='utf-8', index=False)
# cust = pd.read_csv('data_transform.csv', encoding='utf-8')
cust.shape

(100233, 206)

In [33]:
# Test set
# x_data.to_csv("test_transform.csv", encoding='utf-8', index=False)
# cust = pd.read_csv('test_transform.csv', encoding='utf-8')
# cust.shape