# Appendix 1: Model Selection Process

Explore candidate models using only the features available in the application tables (`application_train.csv` / `application_test.csv`).

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.4f}'.format)

In [2]:
data_path = 'home-credit-default-risk/'

## Part 1: Feature Engineering

In [3]:
# Load the raw application tables from `data_path`; these two frames are the
# only inputs used by the rest of this notebook.
df_app_train = pd.read_csv(data_path + 'application_train.csv')
df_app_test = pd.read_csv(data_path + 'application_test.csv')

In [4]:
## identify fill type based on feature meaning

to_fill_zero_cols = [
    'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'DAYS_LAST_PHONE_CHANGE',
    ## How many observation of client's social surroundings
    'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    ## Number of enquiries to Credit Bureau about the client
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR'
]

to_fill_avg_cols = [
    ## Normalized score from external data source
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    ## Normalized information about building where the client lives
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG',
    'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG',
    'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG',
    'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG',
    'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE',
    'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE',
    'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE',
    'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
    'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
    'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
    'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI',
    'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI',
    'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
    'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE'
]

In [5]:
def gen_feature_dummies(df_raw, to_fill_zero_cols, to_fill_avg_cols, avg_fill_values=None):
    """One-hot encode ``df_raw`` and impute missing values in its numeric columns.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw frame to transform. It is not modified.
    to_fill_zero_cols : list of str
        Columns of ``df_raw`` whose missing values are replaced by 0.
    to_fill_avg_cols : list of str
        Columns of ``df_raw`` whose missing values are replaced by an average.
    avg_fill_values : pd.Series or dict, optional
        Per-column fill values for ``to_fill_avg_cols``. When omitted, the
        column means of ``df_raw`` itself are used. Pass the training means
        here to impute another frame with training statistics.

    Returns
    -------
    pd.DataFrame
        Dummy-encoded, imputed copy of ``df_raw`` whose column names are
        upper-cased with every character outside ``[A-Za-z0-9_]`` replaced
        by ``_``.
    """
    df = pd.get_dummies(df_raw, dtype=int)

    # Impute from df_raw itself. Reading these columns from a different frame
    # would be aligned on the row index and would silently overwrite df_raw's
    # own values with that other frame's rows.
    df[to_fill_zero_cols] = df_raw[to_fill_zero_cols].fillna(0)

    ## mean imputation over the whole frame is forward-looking with respect to
    ## cross-validation folds; acceptable for exploration only
    if avg_fill_values is None:
        avg_fill_values = df_raw[to_fill_avg_cols].mean()
    df[to_fill_avg_cols] = df_raw[to_fill_avg_cols].fillna(avg_fill_values)

    df.columns = df.columns.str.upper().str.replace('[^A-Za-z0-9_]', '_', regex=True)
    return df

In [6]:
# Build the dummy-encoded, imputed feature frames for the train and test
# application tables using the same column lists for both.
df_train = gen_feature_dummies(df_app_train, to_fill_zero_cols, to_fill_avg_cols)
df_test = gen_feature_dummies(df_app_test, to_fill_zero_cols, to_fill_avg_cols)

## Part 2: Model Selection

In [12]:
def prepare_train_test_set(df_train, df_test):
    """Return ``(X_train, y_train, X_test)`` with identical, identically ordered columns.

    ``SK_ID_CURR`` is dropped from both frames when it is present in
    ``df_train``; otherwise both frames are copied unchanged. The target is
    not taken from the arguments: it is read from the ``TARGET`` column of
    the notebook-level ``df_app_train``.

    Columns that exist in the training features but not in the test features
    (e.g. dummy levels unseen in the test set) are added to the test features
    as all-zero columns, and the test features are then restricted to, and
    ordered like, the training columns.
    """
    id_col = 'SK_ID_CURR'
    if id_col not in df_train.columns:
        X_train, X_test = df_train.copy(), df_test.copy()
    else:
        X_train = df_train.drop(columns=[id_col])
        X_test = df_test.drop(columns=[id_col])
    y_train = df_app_train['TARGET']

    ## add any training-only columns to the test features as zeros
    absent_in_test = X_train.columns.difference(X_test.columns, sort=False)
    X_test = X_test.assign(**{col: 0 for col in absent_in_test})
    X_test = X_test.loc[:, X_train.columns]
    return X_train, y_train, X_test

def collect_model_result(oof_preds, auc_score, model_name):
    """Store one model's results in the notebook-level collectors.

    Writes ``oof_preds`` as column ``model_name`` of the global ``df_pred``
    and records ``auc_score`` under key ``model_name`` in the global
    ``dict_auc``. An existing entry with the same name is overwritten.
    Returns nothing.
    """
    df_pred[model_name] = oof_preds
    dict_auc.update({model_name: auc_score})

In [8]:
# Collectors filled by collect_model_result(): df_pred starts with the TARGET
# column and gains one prediction column per model; dict_auc maps model name
# to its AUC score.
df_pred = df_train[['TARGET']].copy()
dict_auc = dict()

### 2.1) LogisticRegression

In [9]:
X_train, y_train, X_test = prepare_train_test_set(df_train.drop(['TARGET'], axis=1), df_test)
X_train.shape, X_test.shape

((307511, 244), (48744, 244))

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# 5-fold stratified CV of a standardised logistic regression.
# oof_preds collects each row's prediction from the fold where it was held out;
# test_preds is the average of the per-fold test predictions.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
oof_preds = np.zeros(n_train_rows)
test_preds = np.zeros(n_test_rows)

for train_idx, val_idx in folds.split(X_train, y_train):
    X_tr = X_train.iloc[train_idx]
    y_tr = y_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # the scaler is part of the pipeline, so it is fit on the training fold only
    model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    model.fit(X_tr, y_tr)

    # column 1 of predict_proba is the second class's probability
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    fold_test_preds = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_preds / folds.n_splits

cv_auc = roc_auc_score(y_train, oof_preds)
print('CV AUC:', cv_auc)

CV AUC: 0.7467989136495174


In [13]:
collect_model_result(oof_preds, roc_auc_score(y_train, oof_preds), 'logisticregression')

In [16]:
# Coefficients of the logistic-regression step, sorted descending.
# `model` is whatever pipeline the CV loop assigned last, i.e. the final fold's
# fit only; the coefficients apply to the StandardScaler-transformed features.
coef = pd.Series(model.named_steps['logisticregression'].coef_[0], index=X_train.columns).sort_values(ascending=False)
coef

DAYS_EMPLOYED                               5.4955
AMT_CREDIT                                  0.9682
NAME_INCOME_TYPE_WORKING                    0.8490
NAME_INCOME_TYPE_COMMERCIAL_ASSOCIATE       0.6809
ORGANIZATION_TYPE_BUSINESS_ENTITY_TYPE_3    0.4677
                                             ...  
EXT_SOURCE_2                               -0.3916
EXT_SOURCE_3                               -0.4826
AMT_GOODS_PRICE                            -1.0650
ORGANIZATION_TYPE_XNA                      -2.0765
NAME_INCOME_TYPE_PENSIONER                 -2.1200
Length: 244, dtype: float64

In [17]:
# Keep the features whose absolute coefficient exceeds 0.1.
# NOTE(review): the selection is based on a model fit on part of the same
# training data that is cross-validated again below, so the follow-up CV AUC
# may be slightly optimistic -- confirm this is acceptable for exploration.
selected_feature = coef[coef.abs() > 0.1].index
len(selected_feature)

47

In [18]:
# Same 5-fold CV as for the full logistic regression, restricted to the
# coefficient-selected feature subset.
train_selected = df_train[selected_feature]
test_selected = df_test[selected_feature]
X_train, y_train, X_test = prepare_train_test_set(train_selected, test_selected)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
oof_preds = np.zeros(n_train_rows)
test_preds = np.zeros(n_test_rows)

for train_idx, val_idx in folds.split(X_train, y_train):
    X_tr = X_train.iloc[train_idx]
    y_tr = y_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # the scaler is part of the pipeline, so it is fit on the training fold only
    model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    model.fit(X_tr, y_tr)

    # out-of-fold predictions for the held-out rows; test predictions are fold-averaged
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    fold_test_preds = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_preds / folds.n_splits

cv_auc = roc_auc_score(y_train, oof_preds)
print('CV AUC:', cv_auc)

CV AUC: 0.734724215515129


In [19]:
collect_model_result(oof_preds, roc_auc_score(y_train, oof_preds), 'logisticregression_top')

In [29]:
# Coefficients of the reduced-feature logistic regression, sorted descending.
# As before, `model` is the pipeline from the last CV fold only, and `coef`
# from the full-feature model is overwritten here.
coef = pd.Series(model.named_steps['logisticregression'].coef_[0], index=X_train.columns).sort_values(ascending=False)
coef

DAYS_EMPLOYED                               6.2319
FLAG_EMP_PHONE                              1.1220
AMT_CREDIT                                  1.0260
NAME_INCOME_TYPE_Working                    0.7257
NAME_INCOME_TYPE_Commercial associate       0.5446
YEARS_BUILD_AVG                             0.3140
NAME_INCOME_TYPE_State servant              0.3070
ENTRANCES_AVG                               0.2242
BASEMENTAREA_MEDI                           0.1887
AMT_ANNUITY                                 0.1476
OBS_30_CNT_SOCIAL_CIRCLE                    0.1145
FLAG_DOCUMENT_3                             0.1026
ORGANIZATION_TYPE_Self-employed             0.0959
ORGANIZATION_TYPE_Business Entity Type 3    0.0858
REGION_RATING_CLIENT_W_CITY                 0.0741
LIVINGAPARTMENTS_MEDI                       0.0642
ORGANIZATION_TYPE_Construction              0.0620
ORGANIZATION_TYPE_Transport: type 3         0.0517
NAME_TYPE_SUITE_Unaccompanied               0.0375
ORGANIZATION_TYPE_Business Enti

### 2.2) XGBClassifier

In [20]:
X_train, y_train, X_test = prepare_train_test_set(df_train.drop(['TARGET'], axis=1), df_test)
X_train.shape, X_test.shape

((307511, 244), (48744, 244))

In [21]:
from xgboost import XGBClassifier

# 5-fold stratified CV of an XGBoost classifier with default hyper-parameters.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
oof_preds = np.zeros(n_train_rows)
test_preds = np.zeros(n_test_rows)

for train_idx, val_idx in folds.split(X_train, y_train):
    X_tr = X_train.iloc[train_idx]
    y_tr = y_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # the sklearn metric is passed as a callable and evaluated on the
    # validation fold supplied through eval_set
    model = XGBClassifier(eval_metric=roc_auc_score, random_state=123)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])

    # out-of-fold predictions for the held-out rows; test predictions are fold-averaged
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    fold_test_preds = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_preds / folds.n_splits

cv_auc = roc_auc_score(y_train, oof_preds)
print('CV AUC:', cv_auc)

[0]	validation_0-logloss:0.28725	validation_0-roc_auc_score:0.70859
[1]	validation_0-logloss:0.27489	validation_0-roc_auc_score:0.71850
[2]	validation_0-logloss:0.26710	validation_0-roc_auc_score:0.72305
[3]	validation_0-logloss:0.26196	validation_0-roc_auc_score:0.72566
[4]	validation_0-logloss:0.25857	validation_0-roc_auc_score:0.72824
[5]	validation_0-logloss:0.25632	validation_0-roc_auc_score:0.73002
[6]	validation_0-logloss:0.25450	validation_0-roc_auc_score:0.73393
[7]	validation_0-logloss:0.25354	validation_0-roc_auc_score:0.73488
[8]	validation_0-logloss:0.25266	validation_0-roc_auc_score:0.73753
[9]	validation_0-logloss:0.25198	validation_0-roc_auc_score:0.73961
[10]	validation_0-logloss:0.25133	validation_0-roc_auc_score:0.74135
[11]	validation_0-logloss:0.25084	validation_0-roc_auc_score:0.74262
[12]	validation_0-logloss:0.25050	validation_0-roc_auc_score:0.74347
[13]	validation_0-logloss:0.25015	validation_0-roc_auc_score:0.74474
[14]	validation_0-logloss:0.25001	validation

In [22]:
collect_model_result(oof_preds, roc_auc_score(y_train, oof_preds), 'xgb')

In [24]:
import shap
# Explain the XGBoost model on the full training feature matrix.
# `model` is the classifier left over from the last CV fold, so the SHAP
# values describe that single fold's fit.
explainer = shap.Explainer(model)
shap_values = explainer(X_train)

In [37]:
shap_values.values.shape

(307511, 244)

In [40]:
# Global importance = mean absolute SHAP value per feature, largest first.
abs_shap = np.abs(shap_values.values)
shap_importance = abs_shap.mean(axis=0)
features = pd.Series(shap_importance, index=X_train.columns).sort_values(ascending=False)
top_features = features.loc[features > 0.01]
print('# Top features:', len(top_features))
print(top_features.head(50))

# Top features: 51
EXT_SOURCE_3                                        0.3841
EXT_SOURCE_2                                        0.3450
AMT_GOODS_PRICE                                     0.2528
AMT_CREDIT                                          0.2232
EXT_SOURCE_1                                        0.1585
DAYS_EMPLOYED                                       0.1092
DAYS_BIRTH                                          0.1001
AMT_ANNUITY                                         0.0975
CODE_GENDER_F                                       0.0825
FLAG_OWN_CAR_N                                      0.0800
NAME_EDUCATION_TYPE_Higher education                0.0779
DAYS_ID_PUBLISH                                     0.0754
DAYS_LAST_PHONE_CHANGE                              0.0592
NAME_FAMILY_STATUS_Married                          0.0546
FLAG_DOCUMENT_3                                     0.0486
NAME_INCOME_TYPE_Working                            0.0421
DAYS_REGISTRATION                    

### 2.3) LGBMClassifier

In [25]:
import shap

def cal_shap(model, X):
    """Build a ``shap.Explainer`` for ``model`` and return its explanation of ``X``."""
    return shap.Explainer(model)(X)

def cal_top_features(shap_values, X, threshold=0.01):
    """Rank features by mean absolute SHAP value and print the strongest ones.

    Parameters
    ----------
    shap_values : object with a ``.values`` array
        SHAP explanation. ``.values`` may be 2-D ``(rows, features)`` or 3-D
        ``(rows, features, outputs)``; for 3-D input the slice at output
        index 1 is used.
    X : pd.DataFrame
        Frame whose columns name the features, in the same order as the
        feature axis of ``shap_values``.
    threshold : float, default 0.01
        Features with mean |SHAP| above this value are counted and printed
        as "top features".

    Returns
    -------
    pd.Series
        Mean |SHAP| for *all* features (not only the top ones), indexed by
        feature name and sorted in descending order.
    """
    values = np.asarray(shap_values.values)
    # Some explainers return one SHAP matrix per output (3-D) and others a
    # single matrix (2-D); accept both instead of always indexing a third axis.
    if values.ndim == 3:
        values = values[:, :, 1]

    shap_importance = np.abs(values).mean(axis=0)
    features = pd.Series(shap_importance, index=X.columns).sort_values(ascending=False)
    top_features = features[features > threshold]
    print('# Top features:', len(top_features))
    print(top_features.head(50))
    return features

def get_shap_features(model, X):
    """Compute SHAP values for ``model`` on ``X`` and the resulting feature ranking.

    Returns a ``(shap_values, features)`` pair, where ``shap_values`` comes
    from ``cal_shap`` and ``features`` from ``cal_top_features`` (which also
    prints the top features).
    """
    shap_values = cal_shap(model, X)
    return shap_values, cal_top_features(shap_values, X)

In [26]:
X_train, y_train, X_test = prepare_train_test_set(df_train.drop(['TARGET'], axis=1), df_test)
X_train.shape, X_test.shape

((307511, 244), (48744, 244))

In [27]:
import lightgbm as lgb

# 5-fold stratified CV of a LightGBM classifier on the dummy-encoded features.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
oof_preds = np.zeros(n_train_rows)
test_preds = np.zeros(n_test_rows)

for train_idx, val_idx in folds.split(X_train, y_train):
    X_tr = X_train.iloc[train_idx]
    y_tr = y_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # NOTE(review): eval_set is supplied but no early-stopping callback is
    # passed, so it looks like all 1000 estimators are always fit -- confirm
    # this is intended.
    model = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, random_state=123, force_col_wise=True)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='auc')

    # out-of-fold predictions for the held-out rows; test predictions are fold-averaged
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    fold_test_preds = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_preds / folds.n_splits

cv_auc = roc_auc_score(y_train, oof_preds)
print('CV AUC:', cv_auc)

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Total Bins 11410
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 234
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Total Bins 11334
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 234
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Total Bins 11347
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 233
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2

In [28]:
collect_model_result(oof_preds, roc_auc_score(y_train, oof_preds), 'lgb')

In [29]:
shap_values, features = get_shap_features(model, X_train)

# Top features: 45
EXT_SOURCE_3                                        0.3810
EXT_SOURCE_2                                        0.3332
AMT_GOODS_PRICE                                     0.2164
AMT_CREDIT                                          0.1937
EXT_SOURCE_1                                        0.1610
DAYS_EMPLOYED                                       0.1018
DAYS_BIRTH                                          0.0854
CODE_GENDER_F                                       0.0824
NAME_EDUCATION_TYPE_HIGHER_EDUCATION                0.0822
FLAG_OWN_CAR_N                                      0.0778
AMT_ANNUITY                                         0.0778
DAYS_ID_PUBLISH                                     0.0719
NAME_FAMILY_STATUS_MARRIED                          0.0517
NAME_CONTRACT_TYPE_CASH_LOANS                       0.0466
DAYS_LAST_PHONE_CHANGE                              0.0464
FLAG_DOCUMENT_3                                     0.0402
NAME_INCOME_TYPE_WORKING             

### 2.4) LGBMClassifier with category-type features

In [30]:
def gen_feature_lgb(df_raw, to_fill_zero_cols, to_fill_avg_cols, avg_fill_values=None):
    """Prepare ``df_raw`` for LightGBM: category dtype for text columns plus imputation.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw frame to transform. It is not modified.
    to_fill_zero_cols : list of str
        Columns of ``df_raw`` whose missing values are replaced by 0.
    to_fill_avg_cols : list of str
        Columns of ``df_raw`` whose missing values are replaced by an average.
    avg_fill_values : pd.Series or dict, optional
        Per-column fill values for ``to_fill_avg_cols``. When omitted, the
        column means of ``df_raw`` itself are used. Pass the training means
        here to impute another frame with training statistics.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_raw`` with object columns converted to ``category``
        dtype and the listed numeric columns imputed. Column names are kept
        as they are.
    """
    df_object = df_raw.select_dtypes(include='object')

    df = df_raw.copy()
    df[df_object.columns] = df_object.astype('category')

    # Impute from df_raw itself. Reading these columns from a different frame
    # would be aligned on the row index and would silently overwrite df_raw's
    # own values with that other frame's rows.
    df[to_fill_zero_cols] = df_raw[to_fill_zero_cols].fillna(0)

    ## mean imputation over the whole frame is forward-looking with respect to
    ## cross-validation folds; acceptable for exploration only
    if avg_fill_values is None:
        avg_fill_values = df_raw[to_fill_avg_cols].mean()
    df[to_fill_avg_cols] = df_raw[to_fill_avg_cols].fillna(avg_fill_values)
    return df

In [31]:
# Rebuild the feature frames with category-typed (not dummy-encoded) columns.
# This rebinds df_train / df_test, replacing the dummy-encoded frames used by
# the earlier sections.
df_train = gen_feature_lgb(df_app_train, to_fill_zero_cols, to_fill_avg_cols)
df_test = gen_feature_lgb(df_app_test, to_fill_zero_cols, to_fill_avg_cols)

In [32]:
X_train, y_train, X_test = prepare_train_test_set(df_train.drop(['TARGET'], axis=1), df_test)
X_train.shape, X_test.shape

((307511, 120), (48744, 120))

In [33]:
import lightgbm as lgb

# 5-fold stratified CV of a LightGBM classifier on the category-typed features.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
oof_preds = np.zeros(n_train_rows)
test_preds = np.zeros(n_test_rows)

for train_idx, val_idx in folds.split(X_train, y_train):
    X_tr = X_train.iloc[train_idx]
    y_tr = y_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # NOTE(review): eval_set is supplied but no early-stopping callback is
    # passed, so it looks like all 1000 estimators are always fit -- confirm
    # this is intended.
    model = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, random_state=123, force_col_wise=True)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='auc')

    # out-of-fold predictions for the held-out rows; test predictions are fold-averaged
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    fold_test_preds = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_preds / folds.n_splits

cv_auc = roc_auc_score(y_train, oof_preds)
print('CV AUC:', cv_auc)

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Total Bins 11292
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Total Bins 11214
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Total Bins 11231
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2

In [34]:
collect_model_result(oof_preds, roc_auc_score(y_train, oof_preds), 'lgb_category')

In [35]:
shap_values, features = get_shap_features(model, X_train)

# Top features: 42
EXT_SOURCE_3                   0.3784
EXT_SOURCE_2                   0.3346
AMT_GOODS_PRICE                0.2099
AMT_CREDIT                     0.1835
EXT_SOURCE_1                   0.1595
DAYS_EMPLOYED                  0.1241
ORGANIZATION_TYPE              0.1148
CODE_GENDER                    0.0985
NAME_EDUCATION_TYPE            0.0848
AMT_ANNUITY                    0.0826
DAYS_BIRTH                     0.0799
DAYS_ID_PUBLISH                0.0742
FLAG_OWN_CAR                   0.0739
OCCUPATION_TYPE                0.0602
NAME_FAMILY_STATUS             0.0516
NAME_INCOME_TYPE               0.0514
DAYS_LAST_PHONE_CHANGE         0.0465
FLAG_DOCUMENT_3                0.0435
NAME_CONTRACT_TYPE             0.0402
AMT_REQ_CREDIT_BUREAU_QRT      0.0346
WEEKDAY_APPR_PROCESS_START     0.0333
REGION_POPULATION_RELATIVE     0.0314
REGION_RATING_CLIENT_W_CITY    0.0313
DAYS_REGISTRATION              0.0296
OWN_CAR_AGE                    0.0296
FLAG_WORK_PHONE                

## Part 3: Model Comparison

In [14]:
dict_auc

{'logisticregression': 0.7467989136495174}

In [15]:
df_pred

Unnamed: 0,TARGET,logisticregression
0,1,0.6105
1,0,0.0319
2,0,0.0317
3,0,0.0691
4,0,0.0912
...,...,...
307506,0,0.0851
307507,0,0.1180
307508,0,0.0543
307509,1,0.0577
