In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np

In [2]:
#define utilities functions
def remove_num_outlier(df):
    """
    Blank out the 365243 placeholder in day-count columns.

    Every column whose name contains the substring "DAYS" has the value
    365243 replaced by np.nan. The frame is modified in place and the same
    object is returned.
    """
    placeholder = 365243
    # Lazily filter the column labels so columns are processed one by one.
    for name in (label for label in df.columns if "DAYS" in label):
        cleaned = df[name].replace({placeholder: np.nan})
        df[name] = cleaned

    return df

def remove_cat_outlier(df):
    """
    Replace the 'XNA' placeholder with np.nan in object-dtype columns.

    Only columns whose dtype is "object" are inspected. The frame is
    modified in place and the same object is returned, so callers may
    either use the return value or rely on the mutation.
    """
    categorical_col = [f for f in df.columns if df[f].dtype == "object"]
    for col in categorical_col:
        if (df[col] == 'XNA').any():
            # Series.replace returns a new Series; it must be assigned back,
            # otherwise the frame is left untouched.
            df[col] = df[col].replace('XNA', np.nan)
    return df

def downcast_dtypes(df):
    """
    Downsize float64/int64 columns to float32/int32 to save memory.

    float64 columns are always cast to float32 (which reduces precision).
    int64 columns are cast to int32 only when every value fits in the int32
    range; columns holding larger values are left as int64 so that they are
    not silently wrapped around. The frame is modified in place and the same
    object is returned.
    """
    int32_range = np.iinfo(np.int32)

    def _fits_int32(series):
        # An empty column has no min/max to compare; casting it is harmless.
        if series.empty:
            return True
        return series.min() >= int32_range.min and series.max() <= int32_range.max

    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype == "int64" and _fits_int32(df[c])]

    # Skip the assignment entirely when there is nothing to convert.
    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)
    if int_cols:
        df[int_cols] = df[int_cols].astype(np.int32)

    return df

def one_hot_encoder(df, nan_as_category = True):
    """
    One-hot encode every object-dtype column via pd.get_dummies.

    Returns a tuple (encoded_frame, new_columns) where new_columns lists, in
    frame order, the column names that were not present before encoding.
    When nan_as_category is true, a dummy column for missing values is also
    generated (get_dummies' dummy_na option).
    """
    seen_before = set(df.columns)
    object_cols = [name for name, kind in df.dtypes.items() if kind == 'object']
    encoded = pd.get_dummies(df, columns=object_cols, dummy_na=nan_as_category)
    added = []
    for name in encoded.columns:
        if name not in seen_before:
            added.append(name)
    return encoded, added

def remove_missing_col_with_threshold(df,threshold=0.6):
    """
    Return df without the columns whose share of missing values exceeds threshold.

    The share is computed as a percentage on a 0-100 scale and compared with
    threshold using a strict ">", so threshold is expressed in percent.
    NOTE(review): the default of 0.6 therefore means 0.6 %, not 60 % --
    confirm this is the intended default.
    """
    row_count = df.shape[0]
    missing_pct = df.isnull().sum() * 100 / row_count
    too_sparse = missing_pct[missing_pct > threshold].index
    kept = [name for name in df.columns if name not in too_sparse]
    return df[kept]


In [3]:
# Load the raw previous-application table from the zipped CSV
# (read_csv's default compression='infer' handles the .zip extension).
# NOTE(review): the frame displayed further down has an 'Unnamed: 0' column, which
# suggests the CSV was saved together with its index -- confirm whether that column is wanted.
prev_app = pd.read_csv('../input/previous_application.zip')

In [4]:
# One-hot encode all object-dtype columns (including a dummy for missing values).
# categorical_cols receives the names of the dummy columns that were created.
prev_app, categorical_cols = one_hot_encoder(prev_app, nan_as_category= True)

### Missing data and outlier treatment

In [5]:
# Replace the 365243 placeholder with NaN in every column whose name contains "DAYS"
# (the helper modifies prev_app in place, so the return value is not needed here).
remove_num_outlier(prev_app)
# Meant to map the 'XNA' placeholder in object-dtype (categorical) columns to NaN.
# NOTE(review): prev_app has already been one-hot encoded in the previous cell, so no
# object-dtype columns appear to remain for this call to act on -- confirm whether it
# should run before the encoding step instead.
remove_cat_outlier(prev_app)

Unnamed: 0,prev_app_id,curr_app_id,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,...,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PRODUCT_COMBINATION_nan
0,5319600,657807,6404.310,58905.000,65124.00,0.000,58905.000,8,1,0.000000,...,0,0,0,1,0,0,0,0,0,0
1,5697773,555192,6264.000,39145.500,35230.50,3915.000,39145.500,8,1,0.108922,...,0,1,0,0,0,0,0,0,0,0
2,4742836,578775,4951.350,47056.275,52641.00,4.275,47056.275,13,1,0.000088,...,0,0,1,0,0,0,0,0,0,0
3,4114563,740578,3391.110,35144.370,30586.50,7032.870,35144.370,9,1,0.203603,...,0,0,1,0,0,0,0,0,0,0
4,4236686,852791,14713.605,123486.075,120307.50,12349.575,123486.075,13,1,0.101388,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670209,4094373,710347,5009.400,108180.000,108180.00,0.000,108180.000,17,1,0.000000,...,0,0,1,0,0,0,0,0,0,0
1670210,5345589,563998,11269.575,80995.185,59503.50,24299.685,80995.185,18,1,0.315794,...,0,1,0,0,0,0,0,0,0,0
1670211,5963924,654249,8797.680,79416.900,60673.50,22500.900,79416.900,19,1,0.294628,...,0,1,0,0,0,0,0,0,0,0
1670212,4832310,887003,,41499.000,41499.00,0.000,41499.000,10,1,0.000000,...,0,0,0,0,0,1,0,0,0,0


## Feature engineering
### Add new features

In [6]:
# Ratio features built from the amount columns.
# A zero denominator is mapped to NaN before dividing so the ratio becomes NaN
# instead of +/-inf, which would otherwise propagate into the mean/max aggregates.
# NOTE(review): APP_CREDIT_PERC contains none of the DAYS/RATE/AMT tokens used to pick
# numeric features and is not listed in num_aggregations, so it is not aggregated --
# confirm whether that is intended.
prev_app['APP_CREDIT_PERC'] = prev_app['AMT_APPLICATION'] / prev_app['AMT_CREDIT'].replace(0, np.nan)
prev_app['AMT_CREDIT_GOODS_PERC'] = prev_app['AMT_CREDIT'] / prev_app['AMT_GOODS_PRICE'].replace(0, np.nan)
prev_app['AMT_PAY_YEAR'] = prev_app['AMT_CREDIT'] / prev_app['AMT_ANNUITY'].replace(0, np.nan)

### Data Aggregation

In [7]:
# Columns whose name contains DAYS, RATE or AMT are treated as numeric features.
num_features = [
    name
    for name in prev_app.columns.values
    if any(token in name for token in ('DAYS', 'RATE', 'AMT'))
]
num_features

['AMT_ANNUITY',
 'AMT_APPLICATION',
 'AMT_CREDIT',
 'AMT_DOWN_PAYMENT',
 'AMT_GOODS_PRICE',
 'RATE_DOWN_PAYMENT',
 'RATE_INTEREST_PRIMARY',
 'RATE_INTEREST_PRIVILEGED',
 'DAYS_DECISION',
 'DAYS_FIRST_DRAWING',
 'DAYS_FIRST_DUE',
 'DAYS_LAST_DUE_1ST_VERSION',
 'DAYS_LAST_DUE',
 'DAYS_TERMINATION',
 'AMT_CREDIT_GOODS_PERC',
 'AMT_PAY_YEAR']

In [8]:
# Statistics to compute per numeric feature when grouping the previous applications.
# Each entry is (column, space-separated statistics); the order here fixes the
# column order of the aggregated frame.
num_aggregations = {
    column: stats.split()
    for column, stats in (
        ('AMT_ANNUITY', 'mean max'),
        ('AMT_APPLICATION', 'mean max'),
        ('AMT_CREDIT', 'mean max'),
        ('AMT_DOWN_PAYMENT', 'mean max'),
        ('AMT_GOODS_PRICE', 'mean max'),
        ('RATE_DOWN_PAYMENT', 'mean'),
        ('RATE_INTEREST_PRIMARY', 'mean'),
        ('RATE_INTEREST_PRIVILEGED', 'mean'),
        ('DAYS_DECISION', 'mean max min'),
        ('DAYS_FIRST_DRAWING', 'mean max'),
        ('DAYS_FIRST_DUE', 'mean max'),
        ('DAYS_LAST_DUE_1ST_VERSION', 'mean max'),
        ('DAYS_LAST_DUE', 'mean max'),
        ('DAYS_TERMINATION', 'mean max'),
        ('AMT_CREDIT_GOODS_PERC', 'mean max'),
        ('AMT_PAY_YEAR', 'mean max'),
    )
}

In [9]:
# Every one-hot dummy column is aggregated with its mean.
cat_aggregations = dict.fromkeys(categorical_cols, 'mean')

In [10]:
# Combine the numeric and dummy-column specs (numeric entries first), aggregate per
# curr_app_id, then flatten the (column, statistic) labels into PREV_APP_<column>_<STAT>.
aggregations = dict(num_aggregations)
aggregations.update(cat_aggregations)
prev_app_agg = prev_app.groupby('curr_app_id').agg(aggregations)
flat_names = []
for feature, stat in prev_app_agg.columns.tolist():
    flat_names.append('PREV_APP_' + feature + "_" + stat.upper())
prev_app_agg.columns = pd.Index(flat_names)

#### Previous loans with status: Approved

In [11]:
# Numeric aggregates restricted to rows flagged NAME_CONTRACT_STATUS_Approved,
# computed per curr_app_id and labelled PREV_APP_APPROVED_<column>_<STAT>.
prev_app_approved = prev_app[prev_app['NAME_CONTRACT_STATUS_Approved'] == 1]
prev_app_approved_agg = prev_app_approved.groupby('curr_app_id').agg(num_aggregations)
prev_app_approved_agg.columns = pd.Index(['PREV_APP_APPROVED_' + col[0] + "_" + col[1].upper() for col in prev_app_approved_agg.columns.tolist()])
# Drop any PREV_APP_APPROVED_* columns left by an earlier run of this cell so that
# re-executing it does not fail on overlapping column names in the join.
prev_app_agg = prev_app_agg.drop(columns=prev_app_approved_agg.columns, errors='ignore')
# Left join keeps every row of prev_app_agg; ids without approved rows get NaN.
prev_app_agg = prev_app_agg.join(prev_app_approved_agg, how='left', on='curr_app_id')

#### Previous loans with status: Refused

In [12]:
# Numeric aggregates restricted to rows flagged NAME_CONTRACT_STATUS_Refused,
# computed per curr_app_id and labelled PREV_APP_REFUSED_<column>_<STAT>.
prev_app_refused = prev_app[prev_app['NAME_CONTRACT_STATUS_Refused'] == 1]
prev_app_refused_agg = prev_app_refused.groupby('curr_app_id').agg(num_aggregations)
prev_app_refused_agg.columns = pd.Index(['PREV_APP_REFUSED_' + col[0] + "_" + col[1].upper() for col in prev_app_refused_agg.columns.tolist()])
# Drop any PREV_APP_REFUSED_* columns left by an earlier run of this cell so that
# re-executing it does not fail on overlapping column names in the join.
prev_app_agg = prev_app_agg.drop(columns=prev_app_refused_agg.columns, errors='ignore')
# Left join keeps every row of prev_app_agg; ids without refused rows get NaN.
prev_app_agg = prev_app_agg.join(prev_app_refused_agg, how='left', on='curr_app_id')

In [15]:
# Turn the index back into a regular column, shrink the numeric dtypes
# (downcast_dtypes works in place and returns the same frame), then print a
# per-column summary with non-null counts and memory usage.
prev_app_agg = downcast_dtypes(prev_app_agg.reset_index())
prev_app_agg.info(
    verbose=True,
    memory_usage=True,
    show_counts=True,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338857 entries, 0 to 338856
Data columns (total 250 columns):
 #    Column                                                                     Non-Null Count   Dtype  
---   ------                                                                     --------------   -----  
 0    curr_app_id                                                                338857 non-null  int32  
 1    PREV_APP_AMT_ANNUITY_MEAN                                                  338377 non-null  float32
 2    PREV_APP_AMT_ANNUITY_MAX                                                   338377 non-null  float32
 3    PREV_APP_AMT_APPLICATION_MEAN                                              338857 non-null  float32
 4    PREV_APP_AMT_APPLICATION_MAX                                               338857 non-null  float32
 5    PREV_APP_AMT_CREDIT_MEAN                                                   338857 non-null  float32
 6    PREV_APP_AMT_CREDIT_MAX           

In [16]:
# Write the aggregated features to the staging area without the index column.
# NOTE(review): compression='zip' makes the output a zip archive even though the file
# name ends in .csv, so readers presumably have to pass compression='zip' explicitly
# -- confirm against the notebooks that consume this file.
prev_app_agg.to_csv('../staging/previous_application.csv', index=False, compression='zip')