In [15]:
import pandas as pd
import numpy as np
from pathlib import Path

# Let wide frames display up to 100 columns before pandas truncates them.
pd.set_option('display.max_columns', 100)

# Every path below hangs off the parent of the current working directory,
# so the notebook has to be launched from a direct sub-folder of the project.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DIR = DATA_DIR / 'raw'

# Raw inputs and the processed output of this notebook.
APPLIC_PATH = RAW_DIR / 'application_train.csv'
PROCESSED_PATH = DATA_DIR / 'processed' / 'application_features_baseline.csv'
BUREAU_PATH = RAW_DIR / 'bureau.csv'
BUREAU_BALANCE_PATH = RAW_DIR / 'bureau_balance.csv'

# Intermediate artefacts (per-applicant bureau aggregates).
INTERIM_PATH = DATA_DIR / 'interim'
INTERIM_BUREAU = INTERIM_PATH / 'bureau_agg.csv'

In [2]:
# Fail fast, and name every raw file that is absent, before any expensive read.
required_inputs = (APPLIC_PATH, BUREAU_PATH, BUREAU_BALANCE_PATH)
missing_inputs = [str(path) for path in required_inputs if not path.exists()]
if missing_inputs:
    raise FileNotFoundError(
        "Raw data not found: " + ", ".join(missing_inputs)
        + ". See README.md for download instructions."
    )

# Application table: one row per SK_ID_CURR (the merge key used further down).
df = pd.read_csv(APPLIC_PATH)

# Bureau loans and their monthly status history (linked via SK_ID_BUREAU).
bureau_df = pd.read_csv(BUREAU_PATH)
bureau_balance_df = pd.read_csv(BUREAU_BALANCE_PATH)


In [3]:
# Collapse the monthly status history to one row per bureau loan (SK_ID_BUREAU).
#
# The status indicators are computed once, vectorized over the whole table, so
# the groupby can use built-in reducers instead of Python lambdas that pandas
# would call once per loan. The resulting values are the same as before.
status = bureau_balance_df['STATUS']

balance_flags = bureau_balance_df[['SK_ID_BUREAU', 'MONTHS_BALANCE']].assign(
    is_status_c=status.eq('C'),
    is_status_5=status.eq('5'),
    is_bad_dpd=status.isin(['2', '3', '4', '5']),
    is_status_1=status.eq('1'),
    is_status_0=status.eq('0'),
    is_status_x=status.eq('X'),
)

bureau_balance_agg = (
    balance_flags
    .groupby('SK_ID_BUREAU')
    .agg(
        loan_duration=('MONTHS_BALANCE', 'count'),      # number of monthly records
        prepaid_ratio=('is_status_c', 'mean'),          # share of months with STATUS 'C'
        default_ever=('is_status_5', 'max'),            # any month with STATUS '5'
        bad_dpd_ratio=('is_bad_dpd', 'mean'),
        bad_dpd_count=('is_bad_dpd', 'sum'),
        small_dpd_ratio=('is_status_1', 'mean'),
        small_dpd_count=('is_status_1', 'sum'),
        paid_in_time_ratio=('is_status_0', 'mean'),
        paid_in_time_count=('is_status_0', 'sum'),
        unknown_ratio=('is_status_x', 'mean'),
    )
    # Keep the flag and the counts as plain integers. A bool `default_ever`
    # would turn into an object column as soon as a left merge introduces NaN
    # for loans without history, which makes later numeric aggregation fragile.
    .astype({
        'default_ever': int,
        'bad_dpd_count': int,
        'small_dpd_count': int,
        'paid_in_time_count': int,
    })
    .reset_index()
)

In [4]:
# Attach the per-loan balance aggregates to the bureau loans.
# Dropping any previously merged copies first keeps the cell idempotent:
# re-running it would otherwise produce duplicated `_x` / `_y` columns.
balance_feature_cols = bureau_balance_agg.columns.drop('SK_ID_BUREAU')

bureau_df = (
    bureau_df
    .drop(columns=balance_feature_cols, errors='ignore')
    .merge(
        bureau_balance_agg,
        how='left',                # loans without balance history keep NaN features
        on='SK_ID_BUREAU',
        validate='many_to_one',    # raises if the aggregate has duplicate loan ids
    )
)

In [None]:
# Per-loan helper features; each one feeds the per-applicant aggregation below.
is_active = bureau_df['CREDIT_ACTIVE'] == 'Active'

# Difference between DAYS_CREDIT_ENDDATE and DAYS_ENDDATE_FACT.
bureau_df['early_closure_days'] = bureau_df['DAYS_CREDIT_ENDDATE'] - bureau_df['DAYS_ENDDATE_FACT']
# Absolute span between DAYS_CREDIT and DAYS_ENDDATE_FACT.
bureau_df['credit_duration'] = (bureau_df['DAYS_CREDIT'] - bureau_df['DAYS_ENDDATE_FACT']).abs()

# 1 for an active loan that currently has overdue days, else 0.
bureau_df['cnt_curent_overdue'] = (
    is_active & (bureau_df['CREDIT_DAY_OVERDUE'] > 0)
).astype(int)

# Overdue days counted only for active loans; every other loan contributes 0.
bureau_df['overdue_days_active'] = np.where(
    is_active,
    bureau_df['CREDIT_DAY_OVERDUE'],
    0
)

# 1 when AMT_CREDIT_MAX_OVERDUE is positive; a missing value compares False -> 0.
bureau_df['ever_overdue_flag'] = (bureau_df['AMT_CREDIT_MAX_OVERDUE'] > 0).astype(int)

# Overdue amount as a share of the credit sum.
# A zero AMT_CREDIT_SUM would turn the division into inf, and that inf would
# then win the per-applicant 'max' aggregation. Treat the ratio as undefined
# (NaN) in that case instead.
credit_sum_nonzero = bureau_df['AMT_CREDIT_SUM'].replace(0, np.nan)
bureau_df['overdue_ratio'] = np.where(
    bureau_df['AMT_CREDIT_SUM_OVERDUE'] > 0,
    bureau_df['AMT_CREDIT_SUM_OVERDUE'] / credit_sum_nonzero,
    0
)

# Credit sum and annuity restricted to active loans (0 for all other loans).
bureau_df['credit_sum_active'] = np.where(
    is_active,
    bureau_df['AMT_CREDIT_SUM'],
    0
)

bureau_df['annuity_active'] = np.where(
    is_active,
    bureau_df['AMT_ANNUITY'],
    0
)

In [8]:
# Aggregate the bureau loans to one row per applicant (SK_ID_CURR).
#
# Category tests are turned into 0/1 indicator columns up front so that every
# aggregation is a built-in reducer rather than a Python lambda called once per
# applicant. Counts and ratios are numerically unchanged. The `has_*` flags are
# now 0/1 integers instead of booleans, so they stay numeric when a later left
# merge introduces NaN.
credit_status = bureau_df['CREDIT_ACTIVE']
credit_type = bureau_df['CREDIT_TYPE']

bureau_flags = bureau_df.assign(
    is_sold=credit_status.isin(['Sold', 'Bad debt']).astype(int),
    is_closed=credit_status.eq('Closed').astype(int),
    is_active=credit_status.eq('Active').astype(int),
    is_credit_card=credit_type.eq('Credit card').astype(int),
    is_low_risk=credit_type.isin(['Car loan', 'Mortgage', 'Loan for business development']).astype(int),
    is_microloan=credit_type.eq('Microloan').astype(int),
    is_consumer_credit=credit_type.eq('Consumer credit').astype(int),
    # `default_ever` may arrive as an object column (bool values plus NaN from
    # the left merge); force it to float so that its sum is numeric.
    default_ever=bureau_df['default_ever'].astype(float),
)

bureau_agg = bureau_flags.groupby('SK_ID_CURR').agg(
    # --- loan status mix ---
    sold_times=('is_sold', 'sum'),
    closed_ratio=('is_closed', 'mean'),
    active_credits=('is_active', 'sum'),
    first_credit_time=('DAYS_CREDIT', 'min'),
    # --- overdue behaviour ---
    overdue_days_mean=('CREDIT_DAY_OVERDUE', 'mean'),
    overdue_days_active_mean=('overdue_days_active', 'mean'),
    # NOTE(review): reads CREDIT_DAY_OVERDUE over all loans, not
    # 'overdue_days_active' -- confirm that this is intended.
    overdue_active_max=('CREDIT_DAY_OVERDUE', 'max'),
    overdue_historical_max=('AMT_CREDIT_MAX_OVERDUE', 'max'),
    loans_ever_overdue=('ever_overdue_flag', 'sum'),
    loans_overdue_ratio=('ever_overdue_flag', 'mean'),
    overdue_credits_active=('cnt_curent_overdue', 'sum'),
    overdue_ammount_active=('AMT_CREDIT_SUM_OVERDUE', 'sum'),
    overdue_ratio_max=('overdue_ratio', 'max'),
    # --- durations and prolongations ---
    early_closure_days_ratio=('early_closure_days', 'mean'),
    credit_duration_mean=('credit_duration', 'mean'),
    prolonged_max=('CNT_CREDIT_PROLONG', 'max'),
    prolonged_times=('CNT_CREDIT_PROLONG', 'sum'),
    # --- amounts ---
    credit_sum_mean=('AMT_CREDIT_SUM', 'mean'),
    active_credit_sum=('credit_sum_active', 'sum'),
    debt_max=('AMT_CREDIT_SUM_DEBT', 'max'),
    debt_mean=('AMT_CREDIT_SUM_DEBT', 'mean'),
    credit_limit_max=('AMT_CREDIT_SUM_LIMIT', 'max'),
    # --- credit type mix ---
    has_credit_card=('is_credit_card', 'max'),
    credit_card_cnt=('is_credit_card', 'sum'),
    low_risk_loans=('is_low_risk', 'sum'),
    has_microloan=('is_microloan', 'max'),
    consumer_credit_sum=('is_consumer_credit', 'sum'),   # a count, despite the name
    # --- record freshness and annuities ---
    last_credit_update=('DAYS_CREDIT_UPDATE', 'max'),
    first_credit_update=('DAYS_CREDIT_UPDATE', 'min'),
    current_annuity=('annuity_active', 'sum'),
    annuity_mean=('AMT_ANNUITY', 'mean'),
    # --- roll-up of the per-loan monthly balance aggregates ---
    loan_duration_avg=('loan_duration', 'mean'),
    loan_duration_max=('loan_duration', 'max'),
    prepaid_ratio_avg=('prepaid_ratio', 'mean'),
    defaults=('default_ever', 'sum'),
    worst_dpd=('bad_dpd_ratio', 'max'),
    bad_dpd_avg=('bad_dpd_ratio', 'mean'),
    bad_dpd_cnt=('bad_dpd_count', 'sum'),
    bad_dpd_times_max=('bad_dpd_count', 'max'),
    bigest_small_dpd=('small_dpd_ratio', 'max'),
    small_dpd_avg=('small_dpd_ratio', 'mean'),
    small_dpd_cnt=('small_dpd_count', 'sum'),
    paid_in_time_avg=('paid_in_time_ratio', 'mean'),
    paid_in_time_cnt=('paid_in_time_count', 'sum'),
    unkown_ratio_avg=('unknown_ratio', 'mean')
).reset_index()

In [16]:
# Persist the per-applicant bureau aggregates.
# to_csv does not create missing parent folders, so make sure data/interim
# exists; otherwise the first run on a fresh checkout fails.
INTERIM_PATH.mkdir(parents=True, exist_ok=True)

bureau_agg.to_csv(
    INTERIM_BUREAU,
    index=False
)

In [9]:
# Attach the bureau aggregates to the application table.
# Dropping any previously merged copies first keeps the cell idempotent:
# re-running it would otherwise produce duplicated `_x` / `_y` columns.
bureau_feature_cols = bureau_agg.columns.drop('SK_ID_CURR')

df = (
    df
    .drop(columns=bureau_feature_cols, errors='ignore')
    .merge(
        bureau_agg,
        how='left',                # applicants without bureau records keep NaN features
        on='SK_ID_CURR',
        validate='many_to_one',    # raises if bureau_agg has duplicate applicant ids
    )
)

In [10]:
# Mean outstanding debt relative to mean credit sum; 0 wherever the mean credit
# sum is not strictly positive.
# NOTE(review): a missing credit_sum_mean also ends up as 0 here -- confirm that
# this is the intended encoding.
mean_credit_sum = df['credit_sum_mean']
raw_debt_ratio = df['debt_mean'] / mean_credit_sum
df['debt_ratio_mean'] = raw_debt_ratio.where(mean_credit_sum > 0, 0)

In [3]:
# 365243 is the dataset's placeholder for a missing DAYS_EMPLOYED. The
# missingness is kept as its own flag (days_employed_missing) because borrowers
# with a missing value have a lower default rate; the gap itself is then filled
# with -1.
employed_days = df['DAYS_EMPLOYED'].replace(365243, np.nan)

df['days_employed_missing'] = employed_days.isna().astype(int)
df['DAYS_EMPLOYED'] = employed_days.fillna(-1)

In [4]:
# AMT_CREDIT and AMT_ANNUITY, each divided by AMT_INCOME_TOTAL.
total_income = df['AMT_INCOME_TOTAL']
df['credit_income_ratio'] = df['AMT_CREDIT'].div(total_income)
df['annuity_income_ratio'] = df['AMT_ANNUITY'].div(total_income)

In [5]:
# log1p tames the heavy skew of the raw amount columns.
for log_col, amount_col in (
    ('log_income', 'AMT_INCOME_TOTAL'),
    ('log_credit', 'AMT_CREDIT'),
):
    df[log_col] = np.log1p(df[amount_col])

In [6]:
# 1 when CNT_CHILDREN is three or more, else 0.
df['has_many_children'] = df['CNT_CHILDREN'].ge(3).astype(int)

# TODO (after the baseline): also add a capped child count, e.g.
# df['cnt_children_capped'] = df['CNT_CHILDREN'].clip(upper=3)

In [7]:
# Record which EXT_SOURCE_1 / EXT_SOURCE_3 scores are absent before the gaps
# are overwritten.
missing_flag_for = {
    'ext_source_1_missing': 'EXT_SOURCE_1',
    'ext_source_3_missing': 'EXT_SOURCE_3',
}
for flag_col, score_col in missing_flag_for.items():
    df[flag_col] = df[score_col].isna().astype(int)

# Fill every missing external score (including EXT_SOURCE_2) with -1.
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
df[ext_source_cols] = df[ext_source_cols].fillna(-1)

In [None]:
# Removed a leftover scratch cell. It re-read bureau.csv into a second,
# misspelled variable and ran `df.merge(...)` without an `on=` key and without
# keeping the result. With no key pandas joins on every shared column name
# (here both SK_ID_CURR and AMT_ANNUITY), so the join was costly and not
# meaningful. Bureau information already reaches `df` through the aggregated
# `bureau_agg` merge on SK_ID_CURR.

In [8]:
# Number of distinct non-null values in each object-dtype column.
cat_features = df.select_dtypes('object').nunique()

# Columns with at most two categories become integer codes; the -1 code that
# pandas uses for a missing category is turned back into NaN.
binary_columns = cat_features.index[cat_features <= 2]
for column in binary_columns:
    category_codes = df[column].astype('category').cat.codes
    df[column] = category_codes.replace(-1, np.nan)

# Every remaining object column is one-hot encoded.
df = pd.get_dummies(df)

Binary categorical columns are replaced with integer codes (0, 1), with NaN preserved. Categorical features with more than two classes are one-hot encoded.

In [9]:
# SK_ID_CURR was only needed as the merge key, so drop it from the feature
# table. Rebinding `df` (rather than using inplace=True) together with
# errors='ignore' lets the cell be re-run without raising KeyError.
df = df.drop(columns='SK_ID_CURR', errors='ignore')

Drop the `SK_ID_CURR` identifier column, which is not used as a model feature.

In [10]:
# df.to_csv(
#     PROCESSED_PATH,
#     index=False
#     )

In [11]:
# Summary statistics of the engineered frame, shown as a quick sanity check.
df.describe()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,...,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,days_employed_missing,credit_income_ratio,annuity_income_ratio,log_income,log_credit,has_many_children,ext_source_1_missing,ext_source_3_missing
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,307511.0,307511.0,104582.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307509.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,151450.0,127568.0,157504.0,103023.0,92646.0,143620.0,152683.0,154491.0,98869.0,124921.0,97312.0,153161.0,93997.0,137829.0,151450.0,127568.0,...,143620.0,152683.0,154491.0,98869.0,124921.0,97312.0,153161.0,93997.0,137829.0,159080.0,161756.0,306490.0,306490.0,306490.0,306490.0,307510.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0,307511.0,307511.0,307499.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,0.080729,0.095213,0.340108,0.693673,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,-1955.028194,-4986.120328,-2994.202373,12.061091,0.999997,0.819889,0.199368,0.998133,0.281066,0.05672,2.152665,2.052463,2.031521,12.063419,0.015144,0.050769,0.040659,0.078173,0.230454,0.179555,-0.344787,0.511142,0.211322,0.11744,0.088442,0.977735,0.752471,0.044621,0.078942,0.149725,0.226282,0.231894,0.066333,0.100775,0.107399,0.008809,0.028358,0.114231,0.087543,...,0.078078,0.149213,0.225897,0.231625,0.067169,0.101954,0.108607,0.008651,0.028236,0.102547,0.014392,1.422245,0.143421,1.405292,0.100049,-962.858788,4.2e-05,0.710023,8.1e-05,0.015115,0.088055,0.000192,0.081376,0.003896,2.3e-05,0.003912,7e-06,0.003525,0.002936,0.00121,0.009928,0.000267,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974,0.180072,3.95757,0.18093,11.909245,13.070108,0.013892,0.563811,0.198253
std,0.272419,0.293509,0.473746,0.460968,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,2306.916343,3522.886321,1509.450419,11.944812,0.001803,0.38428,0.399526,0.043164,0.449521,0.231307,0.910682,0.509034,0.502737,3.265832,0.122126,0.219526,0.197499,0.268444,0.421124,0.383817,0.757855,0.203316,0.627111,0.10824,0.082438,0.059223,0.11328,0.076036,0.134576,0.100049,0.144641,0.16138,0.081184,0.092576,0.110565,0.047732,0.069523,0.107936,0.084307,...,0.134467,0.100368,0.145067,0.161934,0.082167,0.093642,0.11226,0.047415,0.070166,0.107462,0.119101,2.400989,0.446698,2.379803,0.362291,826.808487,0.006502,0.453752,0.009016,0.12201,0.283376,0.01385,0.273412,0.062295,0.004771,0.062424,0.00255,0.059268,0.05411,0.03476,0.099144,0.016327,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295,0.384248,2.689728,0.094574,0.488906,0.715193,0.117044,0.495912,0.398684
min,0.0,0.0,0.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,-24672.0,-7197.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004808,0.000224,10.152338,10.71444,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,-7479.5,-4299.0,5.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.390782,0.17876,0.0577,0.0442,0.9767,0.6872,0.0078,0.0,0.069,0.1667,0.0833,0.0187,0.0504,0.0453,0.0,0.0,0.0525,0.0407,...,0.0,0.069,0.1667,0.0833,0.0187,0.0513,0.0457,0.0,0.0,0.0412,0.0,0.0,0.0,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.018667,0.114782,11.630717,12.506181,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,-4504.0,-3254.0,9.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.565467,0.45969,0.0876,0.0763,0.9816,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0481,0.0756,0.0745,0.0,0.0036,0.084,0.0746,...,0.0,0.1379,0.1667,0.2083,0.0487,0.0761,0.0749,0.0,0.0031,0.0688,0.0,0.0,0.0,0.0,0.0,-757.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.265067,0.162833,11.899215,13.149068,0.0,1.0,0.0
75%,0.0,0.0,1.0,1.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,-2010.0,-1720.0,15.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,2.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.456261,0.663422,0.636376,0.1485,0.1122,0.9866,0.8232,0.0515,0.12,0.2069,0.3333,0.375,0.0856,0.121,0.1299,0.0039,0.0277,0.1439,0.1124,...,0.12,0.2069,0.3333,0.375,0.0868,0.1231,0.1303,0.0039,0.0266,0.1276,0.0,2.0,0.0,2.0,0.0,-274.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.15988,0.229067,12.2185,13.603123,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,0.0,0.0,0.0,91.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,3.0,3.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0,0.962693,0.855,0.89601,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,348.0,34.0,344.0,24.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0,1.0,84.736842,1.875965,18.577685,15.214228,1.0,1.0,1.0
