In [100]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [101]:
# Relative locations of the input tables. The ".csv.f" files are presumably
# Feather dumps of the original CSVs (the bureau ones are loaded with
# pd.read_feather further down) -- TODO confirm for the remaining tables.
FILEPATH_APPLICATION_TRAIN = "../data/application_train.csv.f"
FILEPATH_APPLICATION_TEST = "../data/application_test.csv.f"
FILEPATH_BUREAU = "../data/bureau.csv.f"
FILEPATH_BUREAU_BALANCE = "../data/bureau_balance.csv.f"
FILEPATH_CREDIT_CARD_BALANCE = "../data/credit_card_balance.csv.f"
FILEPATH_INSTALLMENTS_PAYMENTS = "../data/installments_payments.csv.f"
FILEPATH_POS_CASH_BALANCE = "../data/POS_CASH_balance.csv.f"
FILEPATH_PREVIOUS_APPLICATION = "../data/previous_application.csv.f"

In [102]:
# Load the two bureau tables from their Feather files.
# df_bureau is the frame encoded and aggregated in the cells below.
df_bureau = pd.read_feather(FILEPATH_BUREAU)
df_bureau_balance = pd.read_feather(FILEPATH_BUREAU_BALANCE)

In [103]:
# (rows, columns) of the bureau table as loaded.
df_bureau.shape

(1716428, 17)

In [104]:
# Preview the first rows of the bureau table before any encoding.
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


### unique value を見ておく

In [105]:
# Distinct values of the categorical CREDIT_TYPE column.
df_bureau["CREDIT_TYPE"].unique()

array(['Consumer credit', 'Credit card', 'Mortgage', 'Car loan',
       'Microloan', 'Loan for working capital replenishment',
       'Loan for business development', 'Real estate loan',
       'Unknown type of loan', 'Another type of loan',
       'Cash loan (non-earmarked)', 'Loan for the purchase of equipment',
       'Mobile operator loan', 'Interbank credit',
       'Loan for purchase of shares (margin lending)'], dtype=object)

In [106]:
# Distinct values of the categorical CREDIT_ACTIVE column.
df_bureau["CREDIT_ACTIVE"].unique()

array(['Closed', 'Active', 'Sold', 'Bad debt'], dtype=object)

In [107]:
# Print the number of rows for each CREDIT_ACTIVE status, in order of first
# appearance. The comparison is summed with the vectorised Series.sum() rather
# than the Python builtin sum(), which would iterate the whole boolean Series
# element by element for every status.
for val in df_bureau['CREDIT_ACTIVE'].unique():
    print(val, ':', (df_bureau['CREDIT_ACTIVE'] == val).sum())


Closed : 1079273
Active : 630607
Sold : 6527
Bad debt : 21


### エンコーディング

CREDIT_ACTIVE の `Bad debt` と `Sold` はそれ単体で意味がありそうなので、<br>
CREDIT_ACTIVE は one-hot encoding をする。<br>

CREDIT_TYPE も、あとで割合を feature にしたいので、one-hot encoding をする。

In [108]:
# One-hot encode the two categorical columns; the generated columns are
# prefixed "CA_" (CREDIT_ACTIVE) and "CT_" (CREDIT_TYPE).
# NOTE(review): this rebinds df_bureau, so the cell cannot be re-run without
# reloading the data first (the source columns no longer exist afterwards).
df_bureau = pd.get_dummies(
    df_bureau,
    columns=["CREDIT_ACTIVE", "CREDIT_TYPE"],
    prefix=["CA", "CT"],
)

In [109]:
# Inspect the column names after one-hot encoding (new CA_* / CT_* columns).
df_bureau.columns

Index(['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_CURRENCY', 'DAYS_CREDIT',
       'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT',
       'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM',
       'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE',
       'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY', 'CA_Active', 'CA_Bad debt',
       'CA_Closed', 'CA_Sold', 'CT_Another type of loan', 'CT_Car loan',
       'CT_Cash loan (non-earmarked)', 'CT_Consumer credit', 'CT_Credit card',
       'CT_Interbank credit', 'CT_Loan for business development',
       'CT_Loan for purchase of shares (margin lending)',
       'CT_Loan for the purchase of equipment',
       'CT_Loan for working capital replenishment', 'CT_Microloan',
       'CT_Mobile operator loan', 'CT_Mortgage', 'CT_Real estate loan',
       'CT_Unknown type of loan'],
      dtype='object')

In [110]:
# Map the CREDIT_CURRENCY labels to integer codes.
# The nested dict form {column: {old: new}} restricts the replacement to
# that single column of df_bureau.
cleanup_nums = {
    "CREDIT_CURRENCY": {
        "currency 1": 1,
        "currency 2": 2,
        "currency 3": 3,
        "currency 4": 4,
    },
}

df_bureau.replace(cleanup_nums, inplace=True)

In [111]:
# Preview the table again after the one-hot and currency encodings.
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,...,CT_Interbank credit,CT_Loan for business development,CT_Loan for purchase of shares (margin lending),CT_Loan for the purchase of equipment,CT_Loan for working capital replenishment,CT_Microloan,CT_Mobile operator loan,CT_Mortgage,CT_Real estate loan,CT_Unknown type of loan
0,215354,5714462,1,-497,0,-153.0,-153.0,,0,91323.0,...,0,0,0,0,0,0,0,0,0,0
1,215354,5714463,1,-208,0,1075.0,,,0,225000.0,...,0,0,0,0,0,0,0,0,0,0
2,215354,5714464,1,-203,0,528.0,,,0,464323.5,...,0,0,0,0,0,0,0,0,0,0
3,215354,5714465,1,-203,0,,,,0,90000.0,...,0,0,0,0,0,0,0,0,0,0
4,215354,5714466,1,-629,0,1197.0,,77674.5,0,2700000.0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
# Number of bureau records per applicant, as a two-column frame
# (SK_ID_CURR, cnt).
# Series.reset_index(name=...) is used instead of wrapping the result in
# pd.DataFrame(..., columns=['cnt']): on recent pandas,
# groupby(..., as_index=False).size() already returns a DataFrame with a
# "size" column, so the wrapper would select a non-existent "cnt" column and
# yield NaNs. Grouping with the default index keeps size() a Series on every
# version.
df_bureau_cnt = df_bureau.groupby("SK_ID_CURR").size().reset_index(name="cnt")

In [120]:
# Preview the per-applicant record counts; only the first rows are shown
# rather than the whole frame.
df_bureau_cnt.head(10)

Unnamed: 0,SK_ID_CURR,cnt
0,100001,7
1,100002,8
2,100003,4
3,100004,2
4,100005,3
5,100007,1
6,100008,3
7,100009,18
8,100010,2
9,100011,4


In [121]:
# Per-applicant column totals. For the one-hot CA_* / CT_* columns the total
# is the number of records in each category.
# Uses the native groupby .sum() instead of .agg(sum): passing the Python
# builtin is only an alias that pandas remaps internally and newer versions
# warn about it.
# NOTE(review): this also sums SK_ID_BUREAU and the integer CREDIT_CURRENCY
# code, which do not look meaningful as totals -- confirm whether downstream
# cells should ignore or drop them.
df_bureau_sum = df_bureau.groupby("SK_ID_CURR", as_index=False).sum()


In [122]:
# Attach the per-applicant record count ("cnt") to the summed features.
# NOTE(review): the cell rebinds df_bureau_sum, so running it twice would
# merge "cnt" a second time.
df_bureau_sum = df_bureau_sum.merge(
    df_bureau_cnt,
    how='left',
    on='SK_ID_CURR',
)

In [123]:
# Preview the aggregated per-applicant table, now including the "cnt" column.
df_bureau_sum.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,...,CT_Loan for business development,CT_Loan for purchase of shares (margin lending),CT_Loan for the purchase of equipment,CT_Loan for working capital replenishment,CT_Microloan,CT_Mobile operator loan,CT_Mortgage,CT_Real estate loan,CT_Unknown type of loan,cnt
0,100001,41276431,7,-5145,0,577.0,-3302.0,0.0,0,1453365.0,...,0,0,0,0,0,0,0,0,0,7
1,100002,49226177,8,-6992,0,-2094.0,-4185.0,8405.145,0,865055.565,...,0,0,0,0,0,0,0,0,0,8
2,100003,23543514,4,-5603,0,-2178.0,-3292.0,0.0,0,1017400.5,...,0,0,0,0,0,0,0,0,0,4
3,100004,13658267,2,-1734,0,-977.0,-1065.0,0.0,0,189037.8,...,0,0,0,0,0,0,0,0,0,2
4,100005,20205603,3,-572,0,1318.0,-123.0,0.0,0,657126.0,...,0,0,0,0,0,0,0,0,0,3
