In [18]:
import pandas as pd
import numpy as np
from pathlib import Path

# Show up to 100 columns when a frame is displayed, so wide tables are not truncated.
pd.options.display.max_columns = 100

# Paths are resolved relative to the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a folder one level
# below the project root — confirm.
PROJECT_ROOT = Path.cwd().parent
RAW_PATH = PROJECT_ROOT.joinpath('data/raw/application_train.csv')
PROCESSED_PATH = PROJECT_ROOT.joinpath('data/processed/application_features_baseline.csv')

In [19]:
# Fail fast with a message that names the missing file and points to the README.
if not RAW_PATH.exists():
    raise FileNotFoundError(
        f"Raw data not found at {RAW_PATH}. "
        "See README.md for download instructions."
    )

# Load the raw application table into the working frame used by the cells below.
df = pd.read_csv(RAW_PATH)

In [20]:
# The dataset uses 365243 as a placeholder for a missing DAYS_EMPLOYED value.
# A separate `days_employed_missing` indicator is kept because borrowers with a
# missing value were observed to have a lower default rate.
DAYS_EMPLOYED_PLACEHOLDER = 365243

days_employed = df['DAYS_EMPLOYED'].replace(DAYS_EMPLOYED_PLACEHOLDER, np.nan)
df['days_employed_missing'] = days_employed.isna().astype(int)

# Missing entries are filled with -1 after the indicator has been recorded.
# NOTE(review): confirm -1 cannot also occur as a genuine DAYS_EMPLOYED value.
df['DAYS_EMPLOYED'] = days_employed.fillna(-1)

In [21]:
# Credit amount and annuity amount, each expressed relative to total income.
income_total = df['AMT_INCOME_TOTAL']
df['credit_income_ratio'] = df['AMT_CREDIT'].div(income_total)
df['annuity_income_ratio'] = df['AMT_ANNUITY'].div(income_total)

In [22]:
# Apply log(1 + x) to the income and credit amounts to compress their skewed distributions.
df['log_income'] = df['AMT_INCOME_TOTAL'].pipe(np.log1p)
df['log_credit'] = df['AMT_CREDIT'].pipe(np.log1p)

In [23]:
# Binary flag: 1 when the applicant has three or more children, otherwise 0.
df['has_many_children'] = df['CNT_CHILDREN'].ge(3).astype(int)

# TODO(after baseline): also add a capped child count, e.g.
# df['cnt_children_capped'] = df['CNT_CHILDREN'].clip(upper=3)

In [24]:
EXT_SOURCE_COLS = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Record which EXT_SOURCE_1 / EXT_SOURCE_3 values were missing before they are filled.
# NOTE(review): EXT_SOURCE_2 is filled below but gets no indicator — confirm this is intended.
df['ext_source_1_missing'] = df['EXT_SOURCE_1'].isnull().astype(int)
df['ext_source_3_missing'] = df['EXT_SOURCE_3'].isnull().astype(int)

# Fill the remaining gaps in all three external scores with -1.
df[EXT_SOURCE_COLS] = df[EXT_SOURCE_COLS].fillna(-1)

In [25]:
# Number of distinct non-null values in each object-dtype column.
cat_features = df.select_dtypes('object').nunique()

# Columns with at most two distinct values become integer category codes;
# the -1 code that pandas assigns to missing entries is turned back into NaN.
for column, n_unique in cat_features.items():
    if n_unique > 2:
        continue
    codes = df[column].astype('category').cat.codes
    df[column] = codes.replace(-1, np.nan)

# One-hot encode the categorical columns that were not converted above.
df = pd.get_dummies(df)

Binary categorical features are replaced with integer codes (0, 1), with NaN values preserved. Categorical features with more than two classes are one-hot encoded.

In [26]:
# Remove SK_ID_CURR so it is not written out as a feature.
df = df.drop(columns=['SK_ID_CURR'])

Drop columns that are not useful as features (here, the `SK_ID_CURR` identifier).

In [27]:
# Create the output directory if it does not exist yet, so the write below
# does not fail on a fresh checkout where data/processed/ is absent.
PROCESSED_PATH.parent.mkdir(parents=True, exist_ok=True)

# Persist the engineered feature table without the row index.
df.to_csv(PROCESSED_PATH, index=False)