In [2]:
import pandas as pd
import numpy as np

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [3]:
# Load the raw application table. The path is absolute and machine-specific,
# so it has to be adjusted when the notebook is run elsewhere.
raw_train_csv = '/home/work/MLStuding/credit-scoring-ml/data/raw/application_train.csv'
df = pd.read_csv(raw_train_csv)

In [4]:
# Convert the DAYS_* offsets to years. The sign is flipped before dividing
# (presumably the raw values are negative day counts - confirm against the
# data dictionary).
DAYS_PER_YEAR = 365.25
for years_col, days_col in (
    ('age_years', 'DAYS_BIRTH'),
    ('registration_years', 'DAYS_REGISTRATION'),
    ('id_published_years', 'DAYS_ID_PUBLISH'),
):
    df[years_col] = -df[days_col] / DAYS_PER_YEAR

In [5]:
# 365243 stands for "no value" in DAYS_EMPLOYED, so it is turned into NaN first.
# A separate days_employed_missing flag is kept because, per the original note,
# borrowers with a missing value have a lower default rate.
EMPLOYED_PLACEHOLDER = 365243
days_emp = df['DAYS_EMPLOYED'].replace(EMPLOYED_PLACEHOLDER, np.nan)

# Years employed (sign flipped); rows without a value receive the sentinel -1.
df['employed_years'] = (-days_emp / 365.25).fillna(-1)
df['days_employed_missing'] = days_emp.isna().astype(int)

In [6]:
# Credit and annuity amounts relative to total income.
# Missing AMT_ANNUITY values are replaced (in place) by the column median
# before the annuity ratio is computed.
income_total = df['AMT_INCOME_TOTAL']
annuity_median = df['AMT_ANNUITY'].median()

df['credit_income_ratio'] = df['AMT_CREDIT'].div(income_total)
df['AMT_ANNUITY'] = df['AMT_ANNUITY'].fillna(annuity_median)
df['annuity_income_ratio'] = df['AMT_ANNUITY'].div(income_total)

In [7]:
# log1p (log(1 + x)) compresses large values; used here to smooth the
# monetary amounts and reduce their peaks.
df['log_income'] = df['AMT_INCOME_TOTAL'].pipe(np.log1p)
df['log_credit'] = df['AMT_CREDIT'].pipe(np.log1p)

Threshold-based feature engineering:

In [8]:
# Binary flag: 1 when CNT_CHILDREN is at least the threshold, otherwise 0.
MANY_CHILDREN_THRESHOLD = 3
df['has_many_children'] = df['CNT_CHILDREN'].ge(MANY_CHILDREN_THRESHOLD).astype(int)

# TODO: include after the baseline
# df['cnt_children_capped'] = df['CNT_CHILDREN'].clip(upper=3)

In [9]:
# Flag rows where EXT_SOURCE_1 / EXT_SOURCE_3 have no value, then replace the
# gaps in all three EXT_SOURCE columns with the sentinel -1.
# EXT_SOURCE_2 is filled as well but gets no flag.
#
# The fill happens in place, so on a second run of this cell the NaNs are already
# gone. The flags therefore also count values equal to the sentinel; otherwise a
# re-run would silently reset both flags to all zeros.
# NOTE(review): assumes -1 never occurs as a genuine EXT_SOURCE value - confirm.
EXT_SOURCE_FILL = -1
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

for ext_col, flag_col in (
    ('EXT_SOURCE_1', 'ext_source_1_missing'),
    ('EXT_SOURCE_3', 'ext_source_3_missing'),
):
    is_missing = df[ext_col].isna() | df[ext_col].eq(EXT_SOURCE_FILL)
    df[flag_col] = is_missing.astype(int)

df[ext_source_cols] = df[ext_source_cols].fillna(EXT_SOURCE_FILL)

In [10]:
# Encode the categorical codes as integers. Series.map leaves NaN for any
# value that is not a key of the mapping.
gender_codes = {'M': 0, 'F': 1, 'XNA': -1}
car_codes = {'N': 0, 'Y': 1}

df['flag_gender'] = df['CODE_GENDER'].map(gender_codes)
df['flag_car'] = df['FLAG_OWN_CAR'].map(car_codes)

In [11]:
# Replace missing CNT_FAM_MEMBERS values with the column median.
fam_members = df['CNT_FAM_MEMBERS']
df['CNT_FAM_MEMBERS'] = fam_members.fillna(fam_members.median())

In [15]:
# Columns written to the baseline feature file, in output order (TARGET last).
baseline_columns = [
    'age_years',
    'log_income',
    'log_credit',
    'credit_income_ratio',
    'annuity_income_ratio',
    'REGION_POPULATION_RELATIVE',
    'employed_years',
    'CNT_FAM_MEMBERS',
    'has_many_children',
    'REGION_RATING_CLIENT_W_CITY',
    'flag_gender',
    'flag_car',
    'registration_years',
    'id_published_years',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'ext_source_1_missing',
    'ext_source_3_missing',
    'days_employed_missing',
    'TARGET',
]

# Independent copy, so later changes to `processed` do not touch `df`.
processed = df.loc[:, baseline_columns].copy()

# Write without the index column. The path is absolute and machine-specific.
baseline_csv = '/home/work/MLStuding/credit-scoring-ml/data/processed/application_features_baseline.csv'
processed.to_csv(baseline_csv, index=False)