In [4]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
FILEPATH_POS_CASH_BALANCE      = "../data/POS_CASH_balance.csv.f"

In [6]:
df_pos_cash_balance = pd.read_feather(FILEPATH_POS_CASH_BALANCE)

In [7]:
df_pos_cash_balance_ec = pd.get_dummies(df_pos_cash_balance)[['SK_ID_CURR', 
                                                            'NAME_CONTRACT_STATUS_Amortized debt', 
                                                            'NAME_CONTRACT_STATUS_Approved',
                                                            'NAME_CONTRACT_STATUS_Canceled',
                                                            'NAME_CONTRACT_STATUS_Completed',
                                                            'NAME_CONTRACT_STATUS_Demand',
                                                            'NAME_CONTRACT_STATUS_Returned to the store',
                                                            'NAME_CONTRACT_STATUS_Signed'
                                                                 ]]
df_pos_cash_balance_ec_sum = df_pos_cash_balance_ec.groupby('SK_ID_CURR', as_index=False).sum()
df_pos_cash_balance_ec_avg = df_pos_cash_balance_ec.groupby('SK_ID_CURR', as_index=False).mean()

In [20]:
def rename_col(df, add_str):
    for col in df.columns:
        if col not in 'SK_ID_CURR':
            new_col = col + add_str
            df.rename(columns={col:new_col}, inplace=True)

rename_col(df_pos_cash_balance_ec_sum, '_sum')
rename_col(df_pos_cash_balance_ec_avg, '_avg')


In [21]:
df_pos_cash_balance_ec_sum.columns
df_pos_cash_balance_ec_avg.columns

Index(['SK_ID_CURR', 'NAME_CONTRACT_STATUS_Amortized debt_avg',
       'NAME_CONTRACT_STATUS_Approved_avg',
       'NAME_CONTRACT_STATUS_Canceled_avg',
       'NAME_CONTRACT_STATUS_Completed_avg', 'NAME_CONTRACT_STATUS_Demand_avg',
       'NAME_CONTRACT_STATUS_Returned to the store_avg',
       'NAME_CONTRACT_STATUS_Signed_avg'],
      dtype='object')

In [22]:
### count the number of pos cash for a given ID
pcb_count = df_pos_cash_balance[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
df_pos_cash_balance['SK_ID_PREV'] = df_pos_cash_balance['SK_ID_CURR'].map(pcb_count['SK_ID_PREV'])

## Average Values for all other variables in pos cash
pcb_avg = df_pos_cash_balance.groupby('SK_ID_CURR', as_index=False).mean()
pcb_avg = pd.merge(pcb_avg, df_pos_cash_balance_ec_sum, on = 'SK_ID_CURR', how='left')
pcb_avg = pd.merge(pcb_avg, df_pos_cash_balance_ec_avg, on = 'SK_ID_CURR', how='left')
pcb_avg.columns = ['pcb_' + f_ for f_ in pcb_avg.columns]


In [23]:
pcb_avg.rename(columns={'pcb_SK_ID_CURR': 'SK_ID_CURR'}, inplace=True)

In [24]:
pcb_avg.to_feather('../data/POS_CASH_balance_feature_0824_1.csv.f')

In [25]:
pcb_avg.head()

Unnamed: 0,SK_ID_CURR,pcb_SK_ID_PREV,pcb_MONTHS_BALANCE,pcb_CNT_INSTALMENT,pcb_CNT_INSTALMENT_FUTURE,pcb_SK_DPD,pcb_SK_DPD_DEF,pcb_NAME_CONTRACT_STATUS_Amortized debt_sum_sum,pcb_NAME_CONTRACT_STATUS_Approved_sum_sum,pcb_NAME_CONTRACT_STATUS_Canceled_sum_sum,...,pcb_NAME_CONTRACT_STATUS_Demand_sum_sum,pcb_NAME_CONTRACT_STATUS_Returned to the store_sum_sum,pcb_NAME_CONTRACT_STATUS_Signed_sum_sum,pcb_NAME_CONTRACT_STATUS_Amortized debt_avg,pcb_NAME_CONTRACT_STATUS_Approved_avg,pcb_NAME_CONTRACT_STATUS_Canceled_avg,pcb_NAME_CONTRACT_STATUS_Completed_avg,pcb_NAME_CONTRACT_STATUS_Demand_avg,pcb_NAME_CONTRACT_STATUS_Returned to the store_avg,pcb_NAME_CONTRACT_STATUS_Signed_avg
0,100001,9.0,-72.555556,4.0,1.444444,0.777778,0.777778,0,0,0,...,0,0,0,0.0,0.0,0.0,0.222222,0.0,0.0,0.0
1,100002,19.0,-10.0,24.0,15.0,0.0,0.0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100003,28.0,-43.785714,10.107143,5.785714,0.0,0.0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0
3,100004,4.0,-25.5,3.75,2.25,0.0,0.0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,100005,11.0,-20.0,11.7,7.2,0.0,0.0,0,0,0,...,0,0,1,0.0,0.0,0.0,0.090909,0.0,0.0,0.090909
