In [1]:
import numpy as np
import pandas as pd
import gzip
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns

%matplotlib inline

### user

In [None]:
#데이터 읽어오기
with gzip.open('../preprocessed/user_final.pickle','rb') as f:
    user = pickle.load(f)

user.head()

In [3]:
len(user.columns.unique())

326

In [4]:
len(user.columns)

326

In [5]:
# final features for user data (shap+KBest)
final_user_shap = ['income_type', 'employment_type', 'purpose',
            'credit_score', 'desired_amount', 'purpose_by_income_type_SWITCHLOAN', 
            'insert_hour', 'existing_loan_amt', 'existing_loan_amt_by_credit_score_cut_quantile_first', 
            'existing_loan_cnt', 'income_type', 'desired_amount_by_income_type_std', 
            'existing_loan_amt_by_purpose_max', 'insert_day', 'gender', 'age', 'insert_month', 
            'yearly_income_by_houseown_type_mean', 'purpose_by_income_type_BUYCAR', 'employment_type']

len(final_user_shap)

20

In [6]:
# 최종적으로 사용할 변수만 데이터프레임에 연결
user = user[['user_id', 'application_id'] + final_user_shap]
len(user.columns)

22

In [None]:
user.head()

In [8]:
user.shape

(723526, 22)

### loan

In [None]:
#데이터 읽어오기
with gzip.open('../preprocessed/loan_info_final.pickle','rb') as f:
    loan = pickle.load(f)

loan.head()

In [None]:
loan.dropna(how='any', axis=0, inplace=True)
loan.shape

In [None]:
loan.isna().sum()

In [12]:
final_loan_shap = ['loan_limit_by_application_id_sum', 'loan_rate', 'loan_limit_per_desired_amount',
            'loan_rate_by_application_id_min', 'loan_limit', 'loan_rate_by_application_id_quantile_first',
            'loan_rate_by_product_id_quantile_first', 'loan_rate_by_application_id_std', 'loan_rate_by_product_id_min',
            'loan_rate_by_application_id_quantile_second', 'loan_rate_by_application_id_mean',
            'loan_rate_by_application_id_max', 'desired_amount', 'loan_rate_by_bank_id_max', 'loan_rate_by_product_id_std',
            'loan_rate_by_product_id_quantile_second', 'loan_rate_by_product_id_mean', 'loan_rate_by_product_id_quantile_third',
            'loan_rate_by_bank_id_mean', 'loan_rate_by_application_id_quantile_third']
            
len(final_loan_shap)

20

In [13]:
# 최종적으로 선택된 변수만 붙이기
loan = loan[['application_id'] + final_loan_shap + ['is_applied']]
len(loan.columns)

22

In [None]:
loan.shape

In [15]:
loan.head()

Unnamed: 0,application_id,loan_limit_by_application_id_sum,loan_rate,loan_limit_per_desired_amount,loan_rate_by_application_id_min,loan_limit,loan_rate_by_application_id_quantile_first,loan_rate_by_product_id_quantile_first,loan_rate_by_application_id_std,loan_rate_by_product_id_min,...,loan_rate_by_application_id_max,desired_amount,loan_rate_by_bank_id_max,loan_rate_by_product_id_std,loan_rate_by_product_id_quantile_second,loan_rate_by_product_id_mean,loan_rate_by_product_id_quantile_third,loan_rate_by_bank_id_mean,loan_rate_by_application_id_quantile_third,is_applied
1,576643,14000000.0,16.5,0.55,16.5,11000000.0,17.375,13.5,2.474874,8.5,...,20.0,20000000.0,18.5,2.317516,15.5,14.79409,16.5,13.29785,19.125,0.0
2,576643,14000000.0,20.0,0.15,16.5,3000000.0,17.375,16.4,2.474874,16.4,...,20.0,20000000.0,20.0,1.030253,17.1,17.273883,17.5,15.518372,19.125,0.0
3,2136706,927000000.0,13.5,0.125,6.2,10000000.0,11.8,10.9,2.970944,8.4,...,19.9,80000000.0,18.0,2.083258,13.1,12.737339,13.9,12.333438,16.0,0.0
4,2136706,927000000.0,15.9,0.275,6.2,22000000.0,11.8,16.9,2.970944,14.9,...,19.9,80000000.0,19.9,1.456557,17.9,18.11252,18.9,15.613676,16.0,0.0
5,2136706,927000000.0,18.4,0.125,6.2,10000000.0,11.8,15.5,2.970944,13.7,...,19.9,80000000.0,19.4,1.334597,16.5,16.642952,17.5,14.122311,16.0,0.0


### user + loan (merge the two DataFrames)

In [16]:
# index 통일
user.set_index(['application_id'], inplace=True)
loan.set_index(['application_id'], inplace=True)

In [17]:
final_data = pd.merge(left=user, right=loan, left_index=True, right_index=True, how="right")
final_data.shape # (10132997,)랑 같아야

(10132997, 42)

In [None]:
final_data.isna().sum()

In [27]:
user.index.unique().sort_values()

Int64Index([      2,       3,       6,      14,      17,      19,      20,
                 22,      34,      36,
            ...
            2167798, 2167803, 2167805, 2167806, 2167809, 2167811, 2167816,
            2167817, 2167819, 2167826],
           dtype='int64', name='application_id', length=723526)

In [None]:
final_data['is_applied'].value_counts()

In [None]:
# 단 하나의 null 값이라도 포함된 행은 모두 제거
final_data = final_data.dropna(how='any',axis=0)
final_data.shape

In [None]:
# 행 제거한 이후 count
final_data['is_applied'].value_counts()

In [31]:
# 최종 데이터 저장
with gzip.open('../preprocessed/final_data.pickle','wb') as f:
    pickle.dump(final_data, f, pickle.HIGHEST_PROTOCOL)