### Explore Credit Card Balance Dataset
https://www.kaggle.com/c/home-credit-default-risk

Ad-hoc exploration of `credit_card_balance.csv` dataset to find numeric and categorical variables

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
import shutil

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from preprocess import (do_data_cleaning, generate_encoders, add_onehot_col, add_label_col, fix_null_values)
from preprocess_bureau import (add_numeric_stats_cols, )

In [3]:
from IPython.display import display

In [4]:
df = pd.read_csv('./data/credit_card_balance.csv')
print("shape",df.shape)
print("columns",df.columns)

shape (3840312, 23)
columns Index(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE',
       'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT',
       'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
       'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
       'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
       'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
       'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
       'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS', 'SK_DPD',
       'SK_DPD_DEF'],
      dtype='object')


In [9]:
df.dtypes

SK_ID_PREV                      int64
SK_ID_CURR                      int64
MONTHS_BALANCE                  int64
AMT_BALANCE                   float64
AMT_CREDIT_LIMIT_ACTUAL         int64
AMT_DRAWINGS_ATM_CURRENT      float64
AMT_DRAWINGS_CURRENT          float64
AMT_DRAWINGS_OTHER_CURRENT    float64
AMT_DRAWINGS_POS_CURRENT      float64
AMT_INST_MIN_REGULARITY       float64
AMT_PAYMENT_CURRENT           float64
AMT_PAYMENT_TOTAL_CURRENT     float64
AMT_RECEIVABLE_PRINCIPAL      float64
AMT_RECIVABLE                 float64
AMT_TOTAL_RECEIVABLE          float64
CNT_DRAWINGS_ATM_CURRENT      float64
CNT_DRAWINGS_CURRENT            int64
CNT_DRAWINGS_OTHER_CURRENT    float64
CNT_DRAWINGS_POS_CURRENT      float64
CNT_INSTALMENT_MATURE_CUM     float64
NAME_CONTRACT_STATUS           object
SK_DPD                          int64
SK_DPD_DEF                      int64
dtype: object

In [8]:
df.select_dtypes('object').apply(pd.Series.nunique, axis=0)

NAME_CONTRACT_STATUS    7
dtype: int64

In [17]:
# df.select_dtypes(['float64', 'int64']).agg(["min", "max", "mean", "median", "nunique"])
df.select_dtypes(['float64', 'int64']).apply(pd.Series.nunique, axis=0).to_dict().keys()

dict_keys(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM', 'SK_DPD', 'SK_DPD_DEF'])

In [13]:
cols = 


df['CNT_DRAWINGS_OTHER_CURRENT'].unique()

array([ 0.,  1., nan,  3.,  2.,  4.,  6.,  5.,  7.,  8., 10., 12.])

In [19]:
numerical_cols =['MONTHS_BALANCE', 'AMT_BALANCE', 
        'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 
        'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 
        'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 
        'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
        'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 
        'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT', 
        'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 
        'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM']

FEAT_CODE = "CCB"
rename_cols = {}
for colname in numerical_cols:
    rename_cols[colname] = "{}_{}".format(FEAT_CODE, colname)

df[numerical_cols].agg(["min", "max", "mean", "median", "nunique"])

Unnamed: 0,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM
min,-96.0,-420250.2,0.0,-6827.31,-6211.62,0.0,0.0,0.0,0.0,0.0,-423305.8,-420250.2,-420250.2,0.0,0.0,0.0,0.0,0.0
max,-1.0,1505902.0,1350000.0,2115000.0,2287098.0,1529847.0,2239274.0,202882.005,4289207.0,4278316.0,1472317.0,1493338.0,1493338.0,51.0,165.0,12.0,165.0,120.0
mean,-34.521921,58300.16,153808.0,5961.325,7433.388,288.1696,2968.805,3540.204129,10280.54,7588.857,55965.88,58088.81,58098.29,0.309449,0.703144,0.004812,0.559479,20.825084
median,-28.0,0.0,112500.0,0.0,0.0,0.0,0.0,0.0,2702.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
nunique,96.0,1347904.0,181.0,2267.0,187005.0,1832.0,168748.0,312266.0,163209.0,182957.0,1195839.0,1338878.0,1339008.0,44.0,129.0,11.0,133.0,121.0
