In [28]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [29]:
FILEPATH_CREDIT_CARD_BALANCE   = "../data/credit_card_balance.csv.f"

In [30]:
df_credit_balance = pd.read_feather(FILEPATH_CREDIT_CARD_BALANCE)

In [31]:
df_credit_balance.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [32]:
df_credit_balance.NAME_CONTRACT_STATUS.unique()

array(['Active', 'Completed', 'Demand', 'Signed', 'Sent proposal',
       'Refused', 'Approved'], dtype=object)

In [33]:
df_credit_balance_ec = pd.get_dummies(df_credit_balance)[['SK_ID_CURR', 
                                                            'NAME_CONTRACT_STATUS_Completed', 
                                                            'NAME_CONTRACT_STATUS_Demand',
                                                            'NAME_CONTRACT_STATUS_Signed',
                                                            'NAME_CONTRACT_STATUS_Sent proposal',
                                                            'NAME_CONTRACT_STATUS_Refused',
                                                            'NAME_CONTRACT_STATUS_Approved'
                                                                 ]]
df_credit_balance_ec_sum = df_credit_balance_ec.groupby('SK_ID_CURR', as_index=False).sum()
df_credit_balance_ec_avg = df_credit_balance_ec.groupby('SK_ID_CURR', as_index=False).sum()

### rename columns
df_credit_balance_ec_sum.columns = ['sum_' + f_ for f_ in df_credit_balance_ec_sum.columns]
df_credit_balance_ec_sum.rename(columns={'sum_SK_ID_CURR': 'SK_ID_CURR'}, inplace=True)

df_credit_balance_ec_avg.columns = ['avg_' + f_ for f_ in df_credit_balance_ec_avg.columns]
df_credit_balance_ec_avg.rename(columns={'avg_SK_ID_CURR': 'SK_ID_CURR'}, inplace=True)



In [34]:
### count the number of previous applications for a given ID
nb_prevs = df_credit_balance[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
df_credit_balance['SK_ID_PREV'] = df_credit_balance['SK_ID_CURR'].map(nb_prevs['SK_ID_PREV'])

### average of all other columns 
avg_cc_bal = df_credit_balance.groupby('SK_ID_CURR', as_index=False).mean()

### merge categorical features
avg_cc_bal = pd.merge(avg_cc_bal, df_credit_balance_ec_sum, on='SK_ID_CURR', how='left')
avg_cc_bal = pd.merge(avg_cc_bal, df_credit_balance_ec_avg, on='SK_ID_CURR', how='left')

### rename columns
avg_cc_bal.columns = ['cc_bal_' + f_ for f_ in avg_cc_bal.columns]



In [35]:
avg_cc_bal.rename(columns={'cc_bal_SK_ID_CURR': 'SK_ID_CURR'}, inplace=True)

In [36]:
avg_cc_bal.to_feather('../data/prev_credit_card_balance_feature_0819_1.csv.f')

In [37]:
avg_cc_bal.head()

Unnamed: 0,SK_ID_CURR,cc_bal_SK_ID_PREV,cc_bal_MONTHS_BALANCE,cc_bal_AMT_BALANCE,cc_bal_AMT_CREDIT_LIMIT_ACTUAL,cc_bal_AMT_DRAWINGS_ATM_CURRENT,cc_bal_AMT_DRAWINGS_CURRENT,cc_bal_AMT_DRAWINGS_OTHER_CURRENT,cc_bal_AMT_DRAWINGS_POS_CURRENT,cc_bal_AMT_INST_MIN_REGULARITY,...,cc_bal_sum_NAME_CONTRACT_STATUS_Signed,cc_bal_sum_NAME_CONTRACT_STATUS_Sent proposal,cc_bal_sum_NAME_CONTRACT_STATUS_Refused,cc_bal_sum_NAME_CONTRACT_STATUS_Approved,cc_bal_avg_NAME_CONTRACT_STATUS_Completed,cc_bal_avg_NAME_CONTRACT_STATUS_Demand,cc_bal_avg_NAME_CONTRACT_STATUS_Signed,cc_bal_avg_NAME_CONTRACT_STATUS_Sent proposal,cc_bal_avg_NAME_CONTRACT_STATUS_Refused,cc_bal_avg_NAME_CONTRACT_STATUS_Approved
0,100006,6.0,-3.5,0.0,270000.0,,0.0,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,100011,74.0,-38.5,54482.111149,164189.189189,2432.432432,2432.432432,0.0,0.0,3956.221849,...,0,0,0,0,0,0,0,0,0,0
2,100013,96.0,-48.5,18159.919219,131718.75,6350.0,5953.125,0.0,0.0,1454.539551,...,0,0,0,0,0,0,0,0,0,0
3,100021,17.0,-10.0,0.0,675000.0,,0.0,,,0.0,...,0,0,0,0,10,0,0,0,0,0
4,100023,8.0,-7.5,0.0,135000.0,,0.0,,,0.0,...,0,0,0,0,0,0,0,0,0,0
