In [15]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np

In [43]:
#define utilities functions

def remove_cat_outlier(df):
    """
    Replace XNA values in df which represents na in the categorical columns with n/a
    
    """
    categorical_col = [f for f in df.columns if df[f].dtype=="object"]
    for col in categorical_col:
        if df[col][df[col]=='XNA'].shape[0] > 0:
            df[col].replace('XNA', np.nan)
    return df

def downcast_dtypes(df):
    """
    Downsize float64/int64 with float32/int32 to save memory space
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype in ["int64"]]

    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)

    return df

def one_hot_encoder(df, nan_as_category = True):
    """
    One-hot encoding for categorical columns with get_dummies
    """
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

def process_pos_with_period(df,last_month_nums):
    '''
    Parameter:
    df: pos dataframe
    last_month_nums: last number of months
    
    Output:
    processed bureau balancing dataframe for specific period (month_nums)
    '''
    crit = df['MONTHS_BALANCE']>-last_month_nums
    df=df[crit]
    curr_column_header_list = [col for col in df.columns if 'SK_DPD' in col]
    new_column_header_list = ['POS_'+col+'_'+str(last_month_nums)+'MONTH' for col in df.columns if 'SK_DPD' in col]
    column_dict = dict(zip(curr_column_header_list,new_column_header_list))
    #print(column_dict)
    #print(df.columns)
    df.rename(columns=column_dict,inplace=True)
    #print(df.columns)
    aggregation = dict(zip(new_column_header_list,['mean']*len(new_column_header_list)))
    print(aggregation)
    df = df.groupby('curr_app_id').agg(aggregation).reset_index()
    return df


In [29]:
pos = pd.read_csv('../input/POS_CASH_balance.zip')

In [30]:
#replace XNA with na in categorical variable: NAME_CONTRACT_STATUS
remove_cat_outlier(pos)

Unnamed: 0,prev_app_id,curr_app_id,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,5319600,657807,-8,2.0,0.0,Completed,0,0
1,5319600,657807,-10,12.0,12.0,Active,0,0
2,5319600,657807,-9,12.0,11.0,Active,0,0
3,5697773,555192,-52,6.0,2.0,Active,0,0
4,5697773,555192,-51,6.0,1.0,Active,0,0
...,...,...,...,...,...,...,...,...
10001353,5870789,760995,-37,60.0,57.0,Active,0,0
10001354,5870789,760995,-34,60.0,54.0,Active,0,0
10001355,5870789,760995,-30,10.0,0.0,Completed,0,0
10001356,5870789,760995,-38,60.0,58.0,Active,0,0


In [31]:
pos, categorical_cols = one_hot_encoder(pos, nan_as_category= True)

In [32]:
pos.info(verbose=True,memory_usage=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 17 columns):
 #   Column                                      Non-Null Count     Dtype  
---  ------                                      --------------     -----  
 0   prev_app_id                                 10001358 non-null  int64  
 1   curr_app_id                                 10001358 non-null  int64  
 2   MONTHS_BALANCE                              10001358 non-null  int64  
 3   CNT_INSTALMENT                              9975287 non-null   float64
 4   CNT_INSTALMENT_FUTURE                       9975271 non-null   float64
 5   SK_DPD                                      10001358 non-null  int64  
 6   SK_DPD_DEF                                  10001358 non-null  int64  
 7   NAME_CONTRACT_STATUS_Active                 10001358 non-null  uint8  
 8   NAME_CONTRACT_STATUS_Amortized debt         10001358 non-null  uint8  
 9   NAME_CONTRACT_STATUS_Approved               

## Feature Engineering

create addtional features based on different aggregation types on numerical/categorical columns

In [33]:
numerical_aggregations = {
    'MONTHS_BALANCE': ['max', 'mean', 'count'],
    'CNT_INSTALMENT': ['mean'],
    'SK_DPD': ['max', 'mean'],
    'SK_DPD_DEF': ['max', 'mean']
}
categorical_aggregations={}
for col in categorical_cols:
    categorical_aggregations[col]=['mean']
aggregations = {**numerical_aggregations,**categorical_aggregations}
print(aggregations)

{'MONTHS_BALANCE': ['max', 'mean', 'count'], 'CNT_INSTALMENT': ['mean'], 'SK_DPD': ['max', 'mean'], 'SK_DPD_DEF': ['max', 'mean'], 'NAME_CONTRACT_STATUS_Active': ['mean'], 'NAME_CONTRACT_STATUS_Amortized debt': ['mean'], 'NAME_CONTRACT_STATUS_Approved': ['mean'], 'NAME_CONTRACT_STATUS_Canceled': ['mean'], 'NAME_CONTRACT_STATUS_Completed': ['mean'], 'NAME_CONTRACT_STATUS_Demand': ['mean'], 'NAME_CONTRACT_STATUS_Returned to the store': ['mean'], 'NAME_CONTRACT_STATUS_Signed': ['mean'], 'NAME_CONTRACT_STATUS_XNA': ['mean'], 'NAME_CONTRACT_STATUS_nan': ['mean']}


In [34]:
pos_agg = pos.groupby('curr_app_id').agg(aggregations)
pos_agg.columns = ['POS_' + col[0] + "_" + col[1].upper() for col in pos_agg.columns.tolist()]

In [35]:
pos_agg.head()

Unnamed: 0_level_0,POS_MONTHS_BALANCE_MAX,POS_MONTHS_BALANCE_MEAN,POS_MONTHS_BALANCE_COUNT,POS_CNT_INSTALMENT_MEAN,POS_SK_DPD_MAX,POS_SK_DPD_MEAN,POS_SK_DPD_DEF_MAX,POS_SK_DPD_DEF_MEAN,POS_NAME_CONTRACT_STATUS_Active_MEAN,POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN,POS_NAME_CONTRACT_STATUS_Approved_MEAN,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,POS_NAME_CONTRACT_STATUS_Completed_MEAN,POS_NAME_CONTRACT_STATUS_Demand_MEAN,POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN,POS_NAME_CONTRACT_STATUS_Signed_MEAN,POS_NAME_CONTRACT_STATUS_XNA_MEAN,POS_NAME_CONTRACT_STATUS_nan_MEAN
curr_app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
500000,-2,-7.545455,11,14.090909,0,0.0,0,0.0,0.909091,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0
500002,-72,-76.5,10,10.0,0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
500005,-79,-81.0,5,4.0,0,0.0,0,0.0,0.8,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0
500006,-19,-22.0,7,11.142857,0,0.0,0,0.0,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0
500007,-14,-17.0,7,6.0,0,0.0,0,0.0,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0


### How does the client perform on previous loans in the last 6, 12, 18 months?

In [44]:
pos_last_6_months = process_pos_with_period(pos,6)
pos_last_12_months = process_pos_with_period(pos,12)
pos_last_18_months = process_pos_with_period(pos,18)

{'POS_SK_DPD_6MONTH': 'mean', 'POS_SK_DPD_DEF_6MONTH': 'mean'}
{'POS_SK_DPD_12MONTH': 'mean', 'POS_SK_DPD_DEF_12MONTH': 'mean'}
{'POS_SK_DPD_18MONTH': 'mean', 'POS_SK_DPD_DEF_18MONTH': 'mean'}


### How long ago is the client's last DPD?

In [37]:
pos_last_DPD_month = pos[pos['SK_DPD']>0].groupby(['curr_app_id'])['MONTHS_BALANCE'].max().reset_index()
pos_last_DPD_month.rename(columns={'MONTHS_BALANCE':'MONTH_LAST_DPD'},inplace=True)
pos_last_DPD_month.head(5)

Unnamed: 0,curr_app_id,MONTH_LAST_DPD
0,500009,-43
1,500012,-84
2,500013,-59
3,500020,-69
4,500024,-48


In [38]:
pos_agg = pos_agg.merge(pos_last_6_months,left_on='curr_app_id',right_on='curr_app_id',how='left')
pos_agg = pos_agg.merge(pos_last_12_months,left_on='curr_app_id',right_on='curr_app_id',how='left')
pos_agg = pos_agg.merge(pos_last_18_months,left_on='curr_app_id',right_on='curr_app_id',how='left')
pos_agg = pos_agg.merge(pos_last_DPD_month,left_on='curr_app_id',right_on='curr_app_id',how='left')

In [39]:
downcast_dtypes(pos_agg)
pos_agg.info(verbose=True,memory_usage=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 337252 entries, 0 to 337251
Data columns (total 26 columns):
 #   Column                                               Non-Null Count   Dtype  
---  ------                                               --------------   -----  
 0   curr_app_id                                          337252 non-null  int32  
 1   POS_MONTHS_BALANCE_MAX                               337252 non-null  int32  
 2   POS_MONTHS_BALANCE_MEAN                              337252 non-null  float32
 3   POS_MONTHS_BALANCE_COUNT                             337252 non-null  int32  
 4   POS_CNT_INSTALMENT_MEAN                              337224 non-null  float32
 5   POS_SK_DPD_MAX                                       337252 non-null  int32  
 6   POS_SK_DPD_MEAN                                      337252 non-null  float32
 7   POS_SK_DPD_DEF_MAX                                   337252 non-null  int32  
 8   POS_SK_DPD_DEF_MEAN                                  3

In [40]:
pos_agg.to_csv('../staging/pos_cash_balance.csv', index=False, compression='zip')