In [14]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
import numpy as np

In [15]:
#define utility functions
def remove_num_outlier(df):
    """
    Swap the 365243 placeholder for np.nan in every column whose name contains "DAYS".

    The frame is modified in place (each matching column is reassigned) and the
    same frame object is returned.
    """
    placeholder_map = {365243: np.nan}
    for name in df.columns:
        # Only the DAYS* columns carry the placeholder; leave everything else alone
        if "DAYS" not in name:
            continue
        df[name] = df[name].replace(placeholder_map)
    return df

def remove_cat_outlier(df):
    """
    Replace the 'XNA' placeholder with np.nan in every object-dtype column of df.

    The frame is modified in place (affected columns are reassigned) and the
    same frame object is returned. Columns without any 'XNA' value are left
    untouched.
    """
    categorical_col = [f for f in df.columns if df[f].dtype == "object"]
    for col in categorical_col:
        if (df[col] == 'XNA').any():
            # Series.replace returns a new Series, so the result has to be
            # assigned back for the change to reach the frame.
            df[col] = df[col].replace('XNA', np.nan)
    return df

def downcast_dtypes(df):
    """
    Shrink 64-bit numeric columns to 32 bits to save memory space.

    float64 columns become float32 and int64 columns become int32. The frame
    is modified in place and the same frame object is returned.
    """
    narrower = {"float64": np.float32, "int64": np.int32}
    for wide_name, narrow_type in narrower.items():
        wide_cols = [name for name in df.columns if df[name].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(narrow_type)
    return df

def one_hot_encoder(df, nan_as_category = True):
    """
    One-hot encode the object-dtype columns of df with pd.get_dummies.

    Returns a tuple (encoded frame, list of column names present in the
    encoded frame but not in the input). When nan_as_category is true it is
    passed as dummy_na, so missing values get their own indicator column.
    """
    columns_before = set(df.columns)
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

def remove_missing_col_with_threshold(df,threshold=0.6):
    """
    Return a new frame without the columns whose share of missing values
    exceeds ``threshold``.

    ``threshold`` is a fraction between 0 and 1 (the default 0.6 means 60%).
    A column is dropped only when its missing share is strictly greater than
    the threshold. The input frame is not modified.
    """
    # Share of missing values per column, on the same 0-1 scale as `threshold`
    missing_share = df.isnull().mean()
    # Negating "exceeds" (rather than testing <=) keeps every column of an
    # empty frame, where the share is NaN and all comparisons are False.
    keep_mask = ~(missing_share > threshold)
    # Positional boolean mask so duplicate column labels cannot break alignment
    df_new = df.loc[:, keep_mask.to_numpy()]
    return df_new



In [16]:
#load application train/test data
app_train = pd.read_csv('../input/application_train.zip')
app_test = pd.read_csv('../input/application_test.zip')

#stack the test rows under the train rows so both get identical preprocessing
#(pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# each frame keeps its own row labels, so labels repeat across the two parts)
df = pd.concat([app_train, app_test], sort=False)
#row counts are kept so the combined frame can be split again by position
train_size = app_train.shape[0]
test_size = app_test.shape[0]

In [17]:
# number of train rows, then number of test rows, one per line
print(train_size, test_size, sep="\n")

287511
20000


In [18]:
# Print the full per-column summary of the combined frame: dtype, non-null
# count (show_counts) and the frame's memory usage
df.info(verbose=True,memory_usage=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307511 entries, 0 to 19999
Data columns (total 122 columns):
 #    Column                        Non-Null Count   Dtype  
---   ------                        --------------   -----  
 0    curr_app_id                   307511 non-null  int64  
 1    TARGET                        287511 non-null  float64
 2    NAME_CONTRACT_TYPE            307511 non-null  object 
 3    CODE_GENDER                   307511 non-null  object 
 4    FLAG_OWN_CAR                  307511 non-null  object 
 5    FLAG_OWN_REALTY               307511 non-null  object 
 6    CNT_CHILDREN                  307511 non-null  int64  
 7    AMT_INCOME_TOTAL              307511 non-null  float64
 8    AMT_CREDIT                    307511 non-null  float64
 9    AMT_ANNUITY                   307499 non-null  float64
 10   AMT_GOODS_PRICE               307233 non-null  float64
 11   NAME_TYPE_SUITE               306219 non-null  object 
 12   NAME_INCOME_TYPE             

### Missing Data Handling


In [19]:
#run the 365243 -> NaN cleaner over the DAYS columns
df = remove_num_outlier(df)
#run the XNA cleaner over the categorical columns
df = remove_cat_outlier(df)
#dropping columns with more than 60% missing values is currently disabled
#remove_missing_col_with_threshold(df,threshold=0.6)
#show the resulting frame
df

Unnamed: 0,curr_app_id,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,896318,1.0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,638738,0.0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,566570,0.0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,628165,0.0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,764495,0.0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,948214,,Cash loans,M,N,Y,0,135000.0,253737.0,20047.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
19996,997163,,Cash loans,M,Y,N,1,180000.0,312768.0,20961.0,...,0,0,0,0,0.0,1.0,0.0,0.0,0.0,4.0
19997,728715,,Cash loans,M,Y,Y,1,193500.0,119893.5,12717.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,6.0
19998,512547,,Cash loans,F,N,Y,1,157500.0,779688.0,37638.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


### Feature Engineering

In [21]:
#add LTV, DTI ratios and other useful features
df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
df['INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
#difference between the two day-offset columns
df['AGE_EMPLOYED'] = df['DAYS_EMPLOYED'] - df['DAYS_BIRTH']
df['CNT_OF_ADULT'] = df['CNT_FAM_MEMBERS']-df['CNT_CHILDREN']
df['ADULT_RATIO'] = df['CNT_OF_ADULT']/df['CNT_FAM_MEMBERS']


#further analyze client's income status: for each grouping column, divide the
#client's income by the median income of the group the client belongs to
#(medians are taken over the combined train + test frame)
income_groupings = [
    ('CNT_CHILDREN', 'CNT_CHILDREN'),
    ('NAME_FAMILY_STATUS', 'FAMILY_STATUS'),
    ('FLAG_OWN_REALTY', 'OWN_REALTY'),
    ('ORGANIZATION_TYPE', 'ORGANIZATION'),
]
for group_col, feature_prefix in income_groupings:
    group_median_income = df.groupby(group_col)['AMT_INCOME_TOTAL'].median()
    #the per-row median is used directly, so no staging column has to be
    #added to df and deleted again afterwards
    df[feature_prefix + '_MED_INCOME_RATIO'] = (
        df['AMT_INCOME_TOTAL'] / df[group_col].map(group_median_income)
    )

### Correlation Analysis

In [22]:
# Create correlation matrix (absolute values). Only numeric/bool columns are
# passed in explicitly: newer pandas raises on object columns instead of
# silently skipping them.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal).
# The builtin bool is used because the np.bool alias was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
high_corr_cols = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Hand-picked list of features to remove (kept fixed rather than derived from
# high_corr_cols)
col_to_drop = ['AMT_GOODS_PRICE', 'REGION_RATING_CLIENT_W_CITY', 'APARTMENTS_MODE', 
               'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE',
               'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 
               'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 
               'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 
               'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI',
               'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 
               'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'OBS_60_CNT_SOCIAL_CIRCLE']
# Drop features 
df = df.drop(columns=col_to_drop)

### One-hot encode

In [23]:
# One-hot encode every object-dtype column (missing values get their own
# indicator column); cat_cols receives the names of the columns the encoding added
df, cat_cols = one_hot_encoder(df, nan_as_category=True)

In [24]:
#downsize float64 and int64 to float32 and int32 to save memory storage
#(downcast_dtypes modifies df in place and returns it, so this bare call also
# displays the frame as the cell output)
downcast_dtypes(df)

Unnamed: 0,curr_app_id,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,896318,1.0,0,202500.0,406597.5,24700.5,0.018801,-9461,-637.0,-3648.0,...,0,0,0,0,1,0,0,1,0,0
1,638738,0.0,0,270000.0,1293502.5,35698.5,0.003541,-16765,-1188.0,-1186.0,...,0,0,0,0,0,0,0,1,0,0
2,566570,0.0,0,67500.0,135000.0,6750.0,0.010032,-19046,-225.0,-4260.0,...,0,0,0,0,0,0,1,0,0,1
3,628165,0.0,0,135000.0,312682.5,29686.5,0.008019,-19005,-3039.0,-9833.0,...,0,0,0,0,0,0,1,0,0,1
4,764495,0.0,0,121500.0,513000.0,21865.5,0.028663,-19932,-3038.0,-4311.0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,948214,,0,135000.0,253737.0,20047.5,0.030755,-8866,-566.0,-8866.0,...,0,0,0,1,0,0,0,1,0,0
19996,997163,,1,180000.0,312768.0,20961.0,0.018634,-11927,-316.0,-1118.0,...,0,0,0,1,0,0,0,1,0,0
19997,728715,,1,193500.0,119893.5,12717.0,0.018850,-10575,-1794.0,-1637.0,...,0,0,0,0,0,0,1,0,0,1
19998,512547,,1,157500.0,779688.0,37638.0,0.003818,-18150,-1812.0,-11948.0,...,0,0,0,0,0,0,0,1,0,0


In [25]:
# Prefix every feature column with 'APP_', leaving the id and target columns
# as they are. Each label is decided by its own name, so the result does not
# depend on where 'curr_app_id' and 'TARGET' sit in the frame.
df.columns = [col if col in ('curr_app_id', 'TARGET') else 'APP_' + col for col in df.columns]

In [26]:
# preview the first five rows with the renamed columns
df.head()

Unnamed: 0,curr_app_id,TARGET,APP_CNT_CHILDREN,APP_AMT_INCOME_TOTAL,APP_AMT_CREDIT,APP_AMT_ANNUITY,APP_REGION_POPULATION_RELATIVE,APP_DAYS_BIRTH,APP_DAYS_EMPLOYED,APP_DAYS_REGISTRATION,...,APP_WALLSMATERIAL_MODE_Mixed,APP_WALLSMATERIAL_MODE_Monolithic,APP_WALLSMATERIAL_MODE_Others,APP_WALLSMATERIAL_MODE_Panel,"APP_WALLSMATERIAL_MODE_Stone, brick",APP_WALLSMATERIAL_MODE_Wooden,APP_WALLSMATERIAL_MODE_nan,APP_EMERGENCYSTATE_MODE_No,APP_EMERGENCYSTATE_MODE_Yes,APP_EMERGENCYSTATE_MODE_nan
0,896318,1.0,0,202500.0,406597.5,24700.5,0.018801,-9461,-637.0,-3648.0,...,0,0,0,0,1,0,0,1,0,0
1,638738,0.0,0,270000.0,1293502.5,35698.5,0.003541,-16765,-1188.0,-1186.0,...,0,0,0,0,0,0,0,1,0,0
2,566570,0.0,0,67500.0,135000.0,6750.0,0.010032,-19046,-225.0,-4260.0,...,0,0,0,0,0,0,1,0,0,1
3,628165,0.0,0,135000.0,312682.5,29686.5,0.008019,-19005,-3039.0,-9833.0,...,0,0,0,0,0,0,1,0,0,1
4,764495,0.0,0,121500.0,513000.0,21865.5,0.028663,-19932,-3038.0,-4311.0,...,0,0,0,0,0,0,1,0,0,1


### Save dataset

In [27]:
# Split the combined frame back into its train and test parts by position.
# The row counts must still add up, otherwise the positional split is wrong.
assert len(df) == train_size + test_size, "combined frame no longer matches the train/test row counts"
train = df.iloc[:train_size, :].copy(deep=True)
# Slicing from train_size (instead of -test_size) stays correct when
# test_size is 0, where a negative-zero slice would select the whole frame.
test = df.iloc[train_size:train_size + test_size, :].copy(deep=True)

# NOTE: compression='zip' makes these files zip archives despite the .csv
# suffix, so readers have to pass compression='zip' explicitly.
train.to_csv('../staging/application_train.csv', index=False, compression='zip')
test.to_csv('../staging/application_test.csv', index=False, compression='zip')