# Load data and library

In [1]:
# import basic library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
pd.options.display.max_columns = None
plt.rcParams.update(plt.rcParamsDefault)
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
import sys
import ast
def check_memory_usage(var):
    print('%s KB'%(sys.getsizeof(var)/1000))

In [2]:
%%time
# load the final train and test dataset
app_train = pd.read_csv('final train and test dataset/app_train_final.csv')
app_test = pd.read_csv('final train and test dataset/app_test_final.csv')

# load the column names
column_names = pd.read_csv('final train and test dataset/column_names.csv')

num_columns = ast.literal_eval(column_names[column_names['variable'] == 'num_columns']['list'].tolist()[0])
cat_columns = ast.literal_eval(column_names[column_names['variable'] == 'cat_columns']['list'].tolist()[0])
nom_columns = ast.literal_eval(column_names[column_names['variable'] == 'nom_columns']['list'].tolist()[0])
ord_columns = ast.literal_eval(column_names[column_names['variable'] == 'ord_columns']['list'].tolist()[0])
poly_columns = ast.literal_eval(column_names[column_names['variable'] == 'poly_columns']['list'].tolist()[0])
creation_columns = ast.literal_eval(column_names[column_names['variable'] == 'creation_columns']['list'].tolist()[0])
bureau_app_columns = ast.literal_eval(column_names[column_names['variable'] == 'bureau_app_columns']['list'].tolist()[0])
previous_app_columns = ast.literal_eval(column_names[column_names['variable'] == 'previous_app_columns']['list'].tolist()[0])

Wall time: 27.1 s


In [3]:
print('app_train shape:', app_train.shape)
print('app_test shape:', app_test.shape)

app_train shape: (307511, 709)
app_test shape: (48744, 708)


In [4]:
app_train['CODE_GENDER'] = pd.Categorical(app_train['CODE_GENDER'], ordered = True, categories = ['F', 'M'])
app_test['CODE_GENDER'] = pd.Categorical(app_test['CODE_GENDER'], ordered = True, categories = ['F', 'M'])

app_train['FLAG_OWN_CAR'] = pd.Categorical(app_train['FLAG_OWN_CAR'], ordered = True, categories = ['N', 'Y'])
app_test['FLAG_OWN_CAR'] = pd.Categorical(app_test['FLAG_OWN_CAR'], ordered = True, categories = ['N', 'Y'])

app_train['FLAG_OWN_REALTY'] = pd.Categorical(app_train['FLAG_OWN_REALTY'], ordered = True, categories = ['N', 'Y'])
app_test['FLAG_OWN_REALTY'] = pd.Categorical(app_test['FLAG_OWN_REALTY'], ordered = True, categories = ['N', 'Y'])

app_train['YEARS_BIRTH_SEGMENT'] = pd.Categorical(app_train['YEARS_BIRTH_SEGMENT'], ordered = True)
app_test['YEARS_BIRTH_SEGMENT'] = pd.Categorical(app_test['YEARS_BIRTH_SEGMENT'], ordered = True)

# Modeling

## Baseline score

The evaluation metric score to use for this project is `ROC AUC` score since the class is imbalance. Also this dataset is a part of [Kaggle Competition](https://www.kaggle.com/competitions/home-credit-default-risk/overview) which ask us to use `ROC AUC score` as the evaluation metric score. As there aren't any `TARGET` variable in `app_test`, we will have to create and submit the submission file which contains the `SK_ID_CURR` and `TARGET` (in the form of predicted probabilities) in order to evaluate the `app_test` using `ROC AUC`. The goal here is to get `ROC AUC` score higher than `0.5` or we can call this as random guessing model with 50% probability.

## Data preprocessing

Before we start the modeling process, we need to preprocess the data first. Here are some crucial preprocessing steps we need to conduct:
1. Impute missing values using median/mean. (only for Logistic Regression)
    - Using `SimpleImputer` from `sklearn`.
2. Scale the values of numerical features using standardization or normalization. (only for Logistic Regression)
    - Using `StandardScaler` or `MinMaxScaler`  from `sklearn`.
3. Encode the values of categorical features using one-hot-encoding for nominal unique values and ordinal encoding for ordinal unique values.
    - Using `OneHotEncoder` or `OrdinalEncoder` from `sklearn`.

All those preprocessing steps will be handled implicitly in the `Pipeline` object from `sklearn`.

## Build, train, and evaluate model with no hyperparameter tuning

Two machine learning algorithms are used:
- Logistic Regression (no hyperparameter tuning).
- LightGBM (no hyperparameter tuning). 

Below are to sum up the features names:
- `num_columns`: contains numerical features from original source `application_train.csv`/`application_test.csv` after feature selection.
- `cat_columns`: contains categorical features from original source `application_train.csv`/`application_test.csv` after feature selection in which divided into: 
    - `nom_columns`: contains nominal features.
    - `ord_columns`: contains ordinal features.
- `poly_columns`: contains polynomial features from `EXT_SOURCE_1`, `EXT_SOURCE_2`, and `EXT_SOURCE_3` features with degrees of 3.
- `creation_columns`: contains domain knowledge features engineered from `application_train.csv`/`application_test.csv`.
- `bureau_app_columns`: contains aggregated features from `bureau.csv` and `bureau_balance.csv`.
- `previous_app_columns`: contains aggregated features from `previous_application.csv`, `credit_card_balance.csv`, `installments_payments.csv`, and `POS_CASH_balance.csv`.

The code below are some functions to create feature `Pipeline` of data preprocessing step and other helpful function.

In [5]:
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, RandomizedSearchCV
from lightgbm import plot_importance

# helper function to get the feature preprocessing pipeline
def get_feature_pipeline(numerical, nominal, ordinal, algorithm):
    preprocess_numerical = FunctionTransformer(lambda x: x[numerical], validate = False)
    preprocess_nominal = FunctionTransformer(lambda x: x[nominal], validate = False)
    preprocess_ordinal = FunctionTransformer(lambda x: x[ordinal], validate = False)
    if algorithm == 'Logistic Regression':
        pl_numerical = Pipeline([('selector_numerical', preprocess_numerical),
                                 ('imputer', SimpleImputer(strategy = 'median')),
                                 ('scaler', MinMaxScaler())])                    
        pl_nominal = Pipeline([('selector_nominal', preprocess_nominal),
                               ('imputer', SimpleImputer(strategy = 'most_frequent')),
                               ('encoder', OneHotEncoder())])
        pl_ordinal = Pipeline([('selector_ordinal', preprocess_ordinal),
                               ('imputer', SimpleImputer(strategy = 'most_frequent')),
                               ('encoder', OrdinalEncoder())])
        feature_pipeline = FeatureUnion([('pipeline_numerical', pl_numerical),
                                         ('pipeline_nominal', pl_nominal),
                                         ('pipeline_ordinal', pl_ordinal)])
        return feature_pipeline
    
    elif algorithm == 'GBM':
        pl_numerical = Pipeline([('selector_numerical', preprocess_numerical)])
        pl_nominal = Pipeline([('selector_nominal', preprocess_nominal),
                               ('encoder', OneHotEncoder())])
        pl_ordinal = Pipeline([('selector_ordinal', preprocess_ordinal),
                               ('encoder', OrdinalEncoder())])
        feature_pipeline = FeatureUnion([('pipeline_numerical', pl_numerical),
                                         ('pipeline_nominal', pl_nominal),
                                         ('pipeline_ordinal', pl_ordinal)])
        return feature_pipeline
    else:
        print('algorithm argument is wrong. Try "Logistic Regression" or "GBM"!')
        return None

# helper function to evaluate the model performance
def model_evaluation(model, tuning, train, test, name):
    
    if tuning == True:
        CV_score = model.best_score_
        print('best CV ROC AUC scores:', round(CV_score, 3))
    
    # get the X feature matrix and y target vector of app_train and app_test
    X_train = train.drop(['TARGET'], axis = 1); y_train = train[['TARGET']]
    X_test = test
    
    # predict the model
    y_pred_train = model.predict(X_train)
    y_pred_proba_train = model.predict_proba(X_train)[:, 1]
    y_pred_proba_test = model.predict_proba(X_test)[:, 1]
    
    # training model performance
    accuracy_train = metrics.accuracy_score(y_train, y_pred_train)
    roc_auc_train = metrics.roc_auc_score(y_train, y_pred_proba_train)
    roc_auc_cv = cross_validate(estimator = model, X = X_train, y = y_train, cv = 5, scoring = 'roc_auc', return_train_score = True)
    dict_model_performance = {'Accuracy train':accuracy_train.round(3),
                              'ROC AUC 5-CV train':'{} ± {}'.format(roc_auc_cv['train_score'].mean().round(3), roc_auc_cv['train_score'].std().round(3)),
                              'ROC AUC 5-CV validate':'{} ± {}'.format(roc_auc_cv['test_score'].mean().round(3), roc_auc_cv['test_score'].std().round(3)), 
                              'ROC AUC train':roc_auc_train.round(3)}
    print('\n')
    print ('======== model evaluation metrics "{}" ========'.format(name))
    print('confusion matrix and classification report "app_train": \n{0}\n{1}'.format(metrics.confusion_matrix(y_train, y_pred_train), 
                                                                                      metrics.classification_report(y_train, y_pred_train, target_names = ['repaid', 'not repaid'])))
    df = pd.DataFrame([dict_model_performance])
    df.rename(index = {0:name}, inplace = True)
    return df, y_pred_proba_test

def feature_importance(model,plot = True, max_num_features = 15, figsize = (6, 4)):
    numerical = list(model[0].transformer_list[0][1].steps[0][1].transform(app_train).columns)
    ordinal = list(model[0].transformer_list[2][1].steps[1][1].feature_names_in_)
    nominal = list(model[0].transformer_list[1][1].steps[1][1].get_feature_names_out())
    all_columns = numerical + ordinal + nominal
    df = pd.DataFrame({'Feature':all_columns, 'Number of Split':model[1].feature_importances_}).sort_values('Number of Split', ascending = True).set_index('Feature')
    if plot == True:
        fig, ax = plt.subplots(figsize = figsize)
        df.tail(max_num_features).plot(kind = 'barh', ax = ax)
        ax.set_title('Feature Importance LGBM')
        plt.show()
    return df

### Baseline model

In this section, we'll try to train the model without feature engineering. This allow us to create the baseline model in which later we can compared this baseline model with other model using feature engineering.

#### Logistic Regression

In [6]:
%%time
# create baseline model for Logistic Regression: LogReg_baseline
from sklearn.linear_model import LogisticRegression

LogReg_features_baseline = get_feature_pipeline(numerical = num_columns, 
                                                nominal = nom_columns, 
                                                ordinal = ord_columns, 
                                                algorithm = 'Logistic Regression')
LogReg_baseline = Pipeline([('preprocessing_features', LogReg_features_baseline),
                            ('LogReg', LogisticRegression(n_jobs = -1, class_weight = 'balanced'))], verbose = True)
LogReg_baseline.fit(app_train.drop('TARGET', axis = 1),
                    app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.8s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  12.7s
done...
Wall time: 21.3 s


In [7]:
%%time
# evaluate LogReg_baseline
df_LogReg_baseline, y_pred_proba_test = model_evaluation(model = LogReg_baseline, 
                                                         tuning = False, 
                                                         train = app_train, 
                                                         test = app_test, 
                                                         name = 'LogReg_baseline')
display(df_LogReg_baseline)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.8s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   8.9s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.3s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   7.8s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   5.7s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   8.6s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.9s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   8.6s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   5.6s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   9.1s


confusion matrix and classification report "app_train": 
[[194163  88523]
 [  8082  16743]]
              precision    recall  f1-score   support

      repaid       0.96      0.69      0.80    282686
  not repaid       0.16      0.67      0.26     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LogReg_baseline,0.686,0.746 ± 0.001,0.743 ± 0.003,0.746


Wall time: 1min 46s


In [8]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LogReg_baseline.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.385175
1,100005,0.754684
2,100013,0.277328
3,100028,0.351405
4,100038,0.650782


In [9]:
df_LogReg_baseline['ROC AUC test'] = round(0.72998, 3)
df_LogReg_baseline

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LogReg_baseline,0.686,0.746 ± 0.001,0.743 ± 0.003,0.746,0.73


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.72998`

#### LightGBM

In [6]:
%%time
# create baseline model for LGBM: LGBM_baseline 
from lightgbm import LGBMClassifier

LGBM_features_baseline = get_feature_pipeline(numerical = num_columns, 
                                              nominal = nom_columns, 
                                              ordinal = ord_columns, 
                                              algorithm = 'GBM')
LGBM_baseline = Pipeline([('preprocessing_features', LGBM_features_baseline),
                          ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                                  objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                                  random_state = 42, subsample = 0.9))], verbose = True)
LGBM_baseline.fit(app_train.drop('TARGET', axis = 1),
                  app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   4.7s
done...
Wall time: 8.95 s


In [7]:
%%time
# evaluate LGBM_baseline
df_LGBM_baseline, y_pred_proba_test = model_evaluation(model = LGBM_baseline, 
                                                       tuning = False, 
                                                       train = app_train, 
                                                       test = app_test, 
                                                       name = 'LGBM_baseline')
display(df_LGBM_baseline)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   2.9s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   4.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   2.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   4.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   2.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   5.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   2.8s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   4.4s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   2.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   4.1s


confusion matrix and classification report "app_train": 
[[201199  81487]
 [  7070  17755]]
              precision    recall  f1-score   support

      repaid       0.97      0.71      0.82    282686
  not repaid       0.18      0.72      0.29     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_baseline,0.712,0.797 ± 0.0,0.757 ± 0.004,0.79


Wall time: 1min 9s


In [11]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_baseline.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.316399
1,100005,0.584404
2,100013,0.20788
3,100028,0.276125
4,100038,0.661614


In [12]:
df_LGBM_baseline['ROC AUC test'] = round(0.73993, 3)
df_LGBM_baseline

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_baseline,0.712,0.797 ± 0.0,0.757 ± 0.004,0.79,0.74


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.73993`

### Using polynomial features

#### Logistic Regression

In [14]:
%%time
# create model for Logistic Regression: LogReg_poly
from sklearn.linear_model import LogisticRegression

LogReg_features_poly = get_feature_pipeline(numerical = num_columns + poly_columns, 
                                            nominal = nom_columns, 
                                            ordinal = ord_columns, 
                                            algorithm = 'Logistic Regression')
LogReg_poly = Pipeline([('preprocessing_features', LogReg_features_poly),
                        ('LogReg', LogisticRegression(n_jobs = -1, class_weight = 'balanced'))], verbose = True)
LogReg_poly.fit(app_train.drop('TARGET', axis = 1),
                app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   9.8s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  13.7s
done...
Wall time: 25.2 s


In [15]:
%%time
# evaluate LogReg_poly
df_LogReg_poly, y_pred_proba_test = model_evaluation(model = LogReg_poly, 
                                                     tuning = False, 
                                                     train = app_train, 
                                                     test = app_test, 
                                                     name = 'LogReg_poly')
display(df_LogReg_poly)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  10.1s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  12.0s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.7s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   9.5s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  11.1s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  12.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.5s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  10.9s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.2s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   9.7s


confusion matrix and classification report "app_train": 
[[193705  88981]
 [  8048  16777]]
              precision    recall  f1-score   support

      repaid       0.96      0.69      0.80    282686
  not repaid       0.16      0.68      0.26     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LogReg_poly,0.684,0.746 ± 0.001,0.744 ± 0.003,0.746


Wall time: 2min 16s


In [16]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LogReg_poly.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.392892
1,100005,0.744332
2,100013,0.255074
3,100028,0.354664
4,100038,0.629516


In [17]:
df_LogReg_poly['ROC AUC test'] = round(0.73014, 3)
df_LogReg_poly

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LogReg_poly,0.684,0.746 ± 0.001,0.744 ± 0.003,0.746,0.73


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.73014`

#### LightGBM

In [18]:
%%time
# create model for LGBM: LGBM_poly 
from lightgbm import LGBMClassifier

LGBM_features_poly = get_feature_pipeline(numerical = num_columns + poly_columns, 
                                          nominal = nom_columns, 
                                          ordinal = ord_columns, 
                                          algorithm = 'GBM')
LGBM_poly = Pipeline([('preprocessing_features', LGBM_features_poly),
                      ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                              objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                              random_state = 42, subsample = 0.9))], verbose = True)
LGBM_poly.fit(app_train.drop('TARGET', axis = 1),
              app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   5.3s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   8.6s
done...
Wall time: 17.6 s


In [19]:
%%time
# evaluate LGBM_poly
df_LGBM_poly, y_pred_proba_test = model_evaluation(model = LGBM_poly, 
                                                   tuning = False, 
                                                   train = app_train, 
                                                   test = app_test, 
                                                   name = 'LGBM_poly')
display(df_LGBM_poly)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.1s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   7.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   7.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.6s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   8.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.6s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   7.5s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   4.1s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   9.2s


confusion matrix and classification report "app_train": 
[[199958  82728]
 [  6965  17860]]
              precision    recall  f1-score   support

      repaid       0.97      0.71      0.82    282686
  not repaid       0.18      0.72      0.28     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_poly,0.708,0.797 ± 0.001,0.757 ± 0.003,0.79


Wall time: 1min 51s


In [20]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_poly.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.303771
1,100005,0.583877
2,100013,0.16108
3,100028,0.252024
4,100038,0.684486


In [21]:
df_LGBM_poly['ROC AUC test'] = round(0.74071, 3)
df_LGBM_poly

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_poly,0.708,0.797 ± 0.001,0.757 ± 0.003,0.79,0.741


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.74071`

### Using domain knowledge features

#### Logistic Regression

In [22]:
%%time
# create model for Logistic Regression: LogReg_domain
from sklearn.linear_model import LogisticRegression

LogReg_features_domain = get_feature_pipeline(numerical = num_columns + creation_columns, 
                                              nominal = nom_columns, 
                                              ordinal = ord_columns, 
                                              algorithm = 'Logistic Regression')
LogReg_domain = Pipeline([('preprocessing_features', LogReg_features_domain),
                          ('LogReg', LogisticRegression(n_jobs = -1, class_weight = 'balanced'))], verbose = True)
LogReg_domain.fit(app_train.drop('TARGET', axis = 1),
                  app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  10.1s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  13.5s
done...
Wall time: 26.3 s


In [23]:
%%time
# evaluate LogReg_domain
df_LogReg_domain, y_pred_proba_test = model_evaluation(model = LogReg_domain, 
                                                       tuning = False, 
                                                       train = app_train, 
                                                       test = app_test, 
                                                       name = 'LogReg_domain')
display(df_LogReg_domain)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   9.4s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  12.6s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.6s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   9.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.5s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   8.8s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.3s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   9.4s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   5.9s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   8.4s


confusion matrix and classification report "app_train": 
[[194289  88397]
 [  7891  16934]]
              precision    recall  f1-score   support

      repaid       0.96      0.69      0.80    282686
  not repaid       0.16      0.68      0.26     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LogReg_domain,0.687,0.751 ± 0.001,0.748 ± 0.003,0.751


Wall time: 1min 58s


In [24]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LogReg_domain.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.379465
1,100005,0.70102
2,100013,0.289044
3,100028,0.240779
4,100038,0.64169


In [25]:
df_LogReg_domain['ROC AUC test'] = round(0.73602, 3)
df_LogReg_domain

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LogReg_domain,0.687,0.751 ± 0.001,0.748 ± 0.003,0.751,0.736


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.73602`

#### LightGBM

In [26]:
%%time
# create model for LGBM: LGBM_domain 
from lightgbm import LGBMClassifier

LGBM_features_domain = get_feature_pipeline(numerical = num_columns + creation_columns, 
                                            nominal = nom_columns, 
                                            ordinal = ord_columns, 
                                            algorithm = 'GBM')
LGBM_domain = Pipeline([('preprocessing_features', LGBM_features_domain),
                        ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                                objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                                random_state = 42, subsample = 0.9))], verbose = True)
LGBM_domain.fit(app_train.drop('TARGET', axis = 1),
                app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   4.6s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   8.7s
done...
Wall time: 17.4 s


In [27]:
%%time
# evaluate LGBM_domain
df_LGBM_domain, y_pred_proba_test = model_evaluation(model = LGBM_domain, 
                                                     tuning = False, 
                                                     train = app_train, 
                                                     test = app_test, 
                                                     name = 'LGBM_domain')
display(df_LGBM_domain)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   4.3s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   7.3s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.3s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   7.2s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.1s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   7.0s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.0s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   6.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   3.1s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   6.9s


confusion matrix and classification report "app_train": 
[[202441  80245]
 [  6800  18025]]
              precision    recall  f1-score   support

      repaid       0.97      0.72      0.82    282686
  not repaid       0.18      0.73      0.29     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_domain,0.717,0.803 ± 0.001,0.764 ± 0.003,0.798


Wall time: 1min 33s


In [28]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_domain.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.23134
1,100005,0.494834
2,100013,0.131039
3,100028,0.25627
4,100038,0.646026


In [29]:
df_LGBM_domain['ROC AUC test'] = round(0.75632, 3)
df_LGBM_domain

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_domain,0.717,0.803 ± 0.001,0.764 ± 0.003,0.798,0.756


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.75632`

### Using other sources: bureau and burueau_balance

#### Logistic Regression

In [30]:
%%time
# create model for Logistic Regression: LogReg_bureau
from sklearn.linear_model import LogisticRegression

LogReg_features_bureau = get_feature_pipeline(numerical = num_columns + bureau_app_columns, 
                                              nominal = nom_columns, 
                                              ordinal = ord_columns, 
                                              algorithm = 'Logistic Regression')
LogReg_bureau = Pipeline([('preprocessing_features', LogReg_features_bureau),
                          ('LogReg', LogisticRegression(n_jobs = -1, class_weight = 'balanced'))], verbose = True)
LogReg_bureau.fit(app_train.drop('TARGET', axis = 1),
                  app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  17.1s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  19.9s
done...
Wall time: 37.8 s


In [31]:
%%time
# evaluate LogReg_bureau
df_LogReg_bureau, y_pred_proba_test = model_evaluation(model = LogReg_bureau, 
                                                       tuning = False, 
                                                       train = app_train, 
                                                       test = app_test, 
                                                       name = 'LogReg_bureau')
display(df_LogReg_bureau)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  14.3s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  13.3s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  11.9s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  12.6s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  11.9s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  12.3s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  12.2s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  11.9s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  12.3s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  11.3s


confusion matrix and classification report "app_train": 
[[194681  88005]
 [  7953  16872]]
              precision    recall  f1-score   support

      repaid       0.96      0.69      0.80    282686
  not repaid       0.16      0.68      0.26     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LogReg_bureau,0.688,0.75 ± 0.001,0.747 ± 0.002,0.75


Wall time: 2min 54s


In [32]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LogReg_bureau.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.404736
1,100005,0.727163
2,100013,0.177393
3,100028,0.341355
4,100038,0.646729


In [33]:
df_LogReg_bureau['ROC AUC test'] = round(0.73644, 3)
df_LogReg_bureau

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LogReg_bureau,0.688,0.75 ± 0.001,0.747 ± 0.002,0.75,0.736


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.73644`

#### LightGBM

In [34]:
%%time
# create model for LGBM: LGBM_bureau
from lightgbm import LGBMClassifier

LGBM_features_bureau = get_feature_pipeline(numerical = num_columns + bureau_app_columns, 
                                            nominal = nom_columns, 
                                            ordinal = ord_columns, 
                                            algorithm = 'GBM')
LGBM_bureau = Pipeline([('preprocessing_features', LGBM_features_bureau),
                        ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                                objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                                random_state = 42, subsample = 0.9))], verbose = True)
LGBM_bureau.fit(app_train.drop('TARGET', axis = 1),
                app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.5s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  12.3s
done...
Wall time: 20.4 s


In [35]:
%%time
# evaluate LGBM_bureau
df_LGBM_bureau, y_pred_proba_test = model_evaluation(model = LGBM_bureau, 
                                                     tuning = False, 
                                                     train = app_train, 
                                                     test = app_test, 
                                                     name = 'LGBM_bureau')
display(df_LGBM_bureau)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   5.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   9.0s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.0s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  11.4s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   5.9s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  13.3s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   5.8s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  10.6s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   6.1s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  14.5s


confusion matrix and classification report "app_train": 
[[203240  79446]
 [  6793  18032]]
              precision    recall  f1-score   support

      repaid       0.97      0.72      0.82    282686
  not repaid       0.18      0.73      0.29     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_bureau,0.72,0.807 ± 0.001,0.763 ± 0.002,0.8


Wall time: 2min 35s


In [36]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_bureau.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.285444
1,100005,0.603741
2,100013,0.183332
3,100028,0.285263
4,100038,0.671786


In [37]:
df_LGBM_bureau['ROC AUC test'] = round(0.75027, 3)
df_LGBM_bureau

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_bureau,0.72,0.807 ± 0.001,0.763 ± 0.002,0.8,0.75


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.75027`

### Using other sources: previous_app, credit_card_balance, installments_payments, and POS_CASH_balance

#### Logistic Regression

In [38]:
%%time
# create model for Logistic Regression: LogReg_previous
from sklearn.linear_model import LogisticRegression

LogReg_features_previous = get_feature_pipeline(numerical = num_columns + previous_app_columns, 
                                                nominal = nom_columns, 
                                                ordinal = ord_columns, 
                                                algorithm = 'Logistic Regression')
LogReg_previous = Pipeline([('preprocessing_features', LogReg_features_previous),
                            ('LogReg', LogisticRegression(n_jobs = -1, class_weight = 'balanced'))], verbose = True)
LogReg_previous.fit(app_train.drop('TARGET', axis = 1),
                    app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  47.8s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  28.5s
done...
Wall time: 1min 18s


In [39]:
%%time
# evaluate LogReg_previous
df_LogReg_previous, y_pred_proba_test = model_evaluation(model = LogReg_previous, 
                                                         tuning = False, 
                                                         train = app_train, 
                                                         test = app_test, 
                                                         name = 'LogReg_previous')
display(df_LogReg_previous)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  33.8s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  24.4s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  24.8s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  24.9s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  25.1s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  23.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  23.5s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  23.4s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  23.8s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=  22.9s


confusion matrix and classification report "app_train": 
[[198250  84436]
 [  7592  17233]]
              precision    recall  f1-score   support

      repaid       0.96      0.70      0.81    282686
  not repaid       0.17      0.69      0.27     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LogReg_previous,0.701,0.765 ± 0.001,0.761 ± 0.003,0.766


Wall time: 7min 32s


In [40]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LogReg_previous.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.451521
1,100005,0.723502
2,100013,0.270515
3,100028,0.357498
4,100038,0.636697


In [41]:
df_LogReg_previous['ROC AUC test'] = round(0.75188, 3)
df_LogReg_previous

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LogReg_previous,0.701,0.765 ± 0.001,0.761 ± 0.003,0.766,0.752


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.75188`

#### LightGBM

In [42]:
%%time
# create model for LGBM: LGBM_previous
from lightgbm import LGBMClassifier

LGBM_features_previous = get_feature_pipeline(numerical = num_columns + previous_app_columns, 
                                              nominal = nom_columns, 
                                              ordinal = ord_columns, 
                                              algorithm = 'GBM')
LGBM_previous = Pipeline([('preprocessing_features', LGBM_features_previous),
                          ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                                  objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                                  random_state = 42, subsample = 0.9))], verbose = True)
LGBM_previous.fit(app_train.drop('TARGET', axis = 1),
                  app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  33.9s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  26.4s
done...
Wall time: 2min 38s


In [43]:
%%time
# evaluate LGBM_bureau
df_LGBM_previous, y_pred_proba_test = model_evaluation(model = LGBM_previous, 
                                                       tuning = False, 
                                                       train = app_train, 
                                                       test = app_test, 
                                                       name = 'LGBM_previous')
display(df_LGBM_previous)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  13.4s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  24.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  13.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  25.2s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  12.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  23.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  11.3s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  22.2s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  10.5s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  20.7s


confusion matrix and classification report "app_train": 
[[205321  77365]
 [  6354  18471]]
              precision    recall  f1-score   support

      repaid       0.97      0.73      0.83    282686
  not repaid       0.19      0.74      0.31     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_previous,0.728,0.822 ± 0.001,0.775 ± 0.004,0.814


Wall time: 6min 39s


In [44]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_previous.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.229201
1,100005,0.624715
2,100013,0.201213
3,100028,0.205693
4,100038,0.685995


In [45]:
df_LGBM_previous['ROC AUC test'] = round(0.76260, 3)
df_LGBM_previous

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_previous,0.728,0.822 ± 0.001,0.775 ± 0.004,0.814,0.763


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.76260`

### Combine all the features and perform feature selection

Overall, we see that after some feature engineering are conducted, the `ROC AUC test` score increases. Particulary when we use engineered features from domain knowledge (`creation_columns`), bureau app (`bureau_app_columns`), and previous app (`previous_app_columns`). We will try to select all these features and train the machine learning model with these features.

We will use `LGBM` model for now since this model outperforms `Logistic Regression`.

As we do so, we also want to perform feature selection of all these features and compare them all in order to obtain the one has the best performance. The feature selection method we are used:
- Remove features with missing values more than 50%.
- Remove features with all missing values.
- Remove any features with a zero importance as determined by a gradient boosting machine (`LGBM`).

In [46]:
all_columns = num_columns + nom_columns + ord_columns + creation_columns + bureau_app_columns + previous_app_columns
print('Total features:', len(all_columns))

Total features: 691


In [47]:
def filter_missing_values(df, threshold):
    percent_missing = df.isnull().sum() / df.shape[0] * 100
    percent_missing = percent_missing.sort_values(ascending = False)
    percent_missing = percent_missing[percent_missing < threshold]
    selected_columns = list(percent_missing.index)
    return selected_columns

#### Combine all features

In [6]:
%%time
# create model for LGBM: LGBM_total
from lightgbm import LGBMClassifier

LGBM_features_total = get_feature_pipeline(numerical = num_columns + creation_columns + bureau_app_columns + previous_app_columns, 
                                           nominal = nom_columns, 
                                           ordinal = ord_columns, 
                                           algorithm = 'GBM')
LGBM_total = Pipeline([('preprocessing_features', LGBM_features_total),
                       ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                               objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                               random_state = 42, subsample = 0.9))], verbose = True)
LGBM_total.fit(app_train.drop('TARGET', axis = 1),
               app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  52.1s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  36.2s
done...
Wall time: 1min 28s


In [7]:
%%time
# evaluate LGBM_total
df_LGBM_total, y_pred_proba_test = model_evaluation(model = LGBM_total, 
                                                    tuning = False, 
                                                    train = app_train, 
                                                    test = app_test, 
                                                    name = 'LGBM_total')
display(df_LGBM_total)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  22.6s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  32.5s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  25.8s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  33.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  23.3s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  31.4s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  23.7s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  31.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  29.5s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  31.7s


confusion matrix and classification report "app_train": 
[[207873  74813]
 [  6228  18597]]
              precision    recall  f1-score   support

      repaid       0.97      0.74      0.84    282686
  not repaid       0.20      0.75      0.31     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_total,0.736,0.829 ± 0.0,0.781 ± 0.003,0.821


Wall time: 8min 30s


In [8]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_total.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.306396
1,100005,0.598016
2,100013,0.184054
3,100028,0.264778
4,100038,0.691508


In [9]:
df_LGBM_total['ROC AUC test'] = round(0.77693, 3)
df_LGBM_total

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_total,0.736,0.829 ± 0.0,0.781 ± 0.003,0.821,0.777


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.77693`

#### Remove missing values more than 50%

In [52]:
# select all columns with less than 50% of missing values
select_columns = filter_missing_values(df = app_train[all_columns], 
                                       threshold = 50)
num_columns_1 = [col for col in num_columns if col in select_columns]
nom_columns_1 = [col for col in nom_columns if col in select_columns]
ord_columns_1 = [col for col in ord_columns if col in select_columns]
creation_columns_1 = [col for col in creation_columns if col in select_columns]
bureau_app_columns_1 = [col for col in bureau_app_columns if col in select_columns]
previous_app_columns_1 = [col for col in previous_app_columns if col in select_columns]
print('Check length:', len(num_columns_1) + len(nom_columns_1) + len(ord_columns_1) + len(creation_columns_1) + len(bureau_app_columns_1) + len(previous_app_columns_1) == len(select_columns))
print('From {} total features... we have selected {} features'.format(len(all_columns), len(select_columns)))

Check length: True
From 691 total features... we have selected 527 features


In [53]:
%%time
# create model for LGBM: LGBM_missing50
from lightgbm import LGBMClassifier

LGBM_features_missing50 = get_feature_pipeline(numerical = num_columns_1 + creation_columns_1 + bureau_app_columns_1 + previous_app_columns_1, 
                                               nominal = nom_columns_1, 
                                               ordinal = ord_columns_1, 
                                               algorithm = 'GBM')
LGBM_missing50 = Pipeline([('preprocessing_features', LGBM_features_missing50),
                           ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                                   objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                                   random_state = 42, subsample = 0.9))], verbose = True)
LGBM_missing50.fit(app_train.drop('TARGET', axis = 1),
                   app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  23.6s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  29.4s
done...
Wall time: 56 s


In [54]:
%%time
# evaluate LGBM_total
df_LGBM_missing50, y_pred_proba_test = model_evaluation(model = LGBM_missing50, 
                                                        tuning = False, 
                                                        train = app_train, 
                                                        test = app_test, 
                                                        name = 'LGBM_missing50')
display(df_LGBM_missing50)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  13.0s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  26.1s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  15.8s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  23.6s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  11.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  22.5s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  11.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  25.0s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  12.9s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  25.9s


confusion matrix and classification report "app_train": 
[[207414  75272]
 [  6346  18479]]
              precision    recall  f1-score   support

      repaid       0.97      0.73      0.84    282686
  not repaid       0.20      0.74      0.31     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_missing50,0.735,0.825 ± 0.0,0.778 ± 0.003,0.818


Wall time: 6min 56s


In [55]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_missing50.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.270755
1,100005,0.542408
2,100013,0.149065
3,100028,0.229825
4,100038,0.554794


In [56]:
df_LGBM_missing50['ROC AUC test'] = round(0.77154, 3)
df_LGBM_missing50

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_missing50,0.735,0.825 ± 0.0,0.778 ± 0.003,0.818,0.772


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.77154`

#### Remove all missing values

In [57]:
# select all columns with no missing values
select_columns = filter_missing_values(df = app_train[all_columns], 
                                       threshold = 0.0000001)
num_columns_2 = [col for col in num_columns if col in select_columns]
nom_columns_2 = [col for col in nom_columns if col in select_columns]
ord_columns_2 = [col for col in ord_columns if col in select_columns]
creation_columns_2 = [col for col in creation_columns if col in select_columns]
bureau_app_columns_2 = [col for col in bureau_app_columns if col in select_columns]
previous_app_columns_2 = [col for col in previous_app_columns if col in select_columns]
print('Check length:', len(num_columns_2) + len(nom_columns_2) + len(ord_columns_2) + len(creation_columns_2) + len(bureau_app_columns_2) + len(previous_app_columns_2) == len(select_columns))
print('From {} total features... we have selected {} features'.format(len(all_columns), len(select_columns)))

Check length: True
From 691 total features... we have selected 44 features


In [58]:
%%time
# create model for LGBM: LGBM_missing0
from lightgbm import LGBMClassifier

LGBM_features_missing0 = get_feature_pipeline(numerical = num_columns_2 + creation_columns_2 + bureau_app_columns_2 + previous_app_columns_2, 
                                              nominal = nom_columns_2, 
                                              ordinal = ord_columns_2, 
                                              algorithm = 'GBM')
LGBM_missing0 = Pipeline([('preprocessing_features', LGBM_features_missing0),
                          ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                                  objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                                  random_state = 42, subsample = 0.9))], verbose = True)
LGBM_missing0.fit(app_train.drop('TARGET', axis = 1),
                  app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   1.5s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   2.1s
done...
Wall time: 4.53 s


In [59]:
%%time
# evaluate LGBM_total
df_LGBM_missing0, y_pred_proba_test = model_evaluation(model = LGBM_missing0, 
                                                        tuning = False, 
                                                        train = app_train, 
                                                        test = app_test, 
                                                        name = 'LGBM_missing0')
display(df_LGBM_missing0)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   1.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   1.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   1.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   1.8s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   1.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   1.8s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   1.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   1.8s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   1.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=   1.8s


confusion matrix and classification report "app_train": 
[[192382  90304]
 [  7610  17215]]
              precision    recall  f1-score   support

      repaid       0.96      0.68      0.80    282686
  not repaid       0.16      0.69      0.26     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_missing0,0.682,0.761 ± 0.001,0.725 ± 0.003,0.755


Wall time: 35 s


In [60]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_missing0.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.252077
1,100005,0.38909
2,100013,0.25611
3,100028,0.319569
4,100038,0.627514


In [61]:
df_LGBM_missing0['ROC AUC test'] = round(0.70272, 3)
df_LGBM_missing0

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_missing0,0.682,0.761 ± 0.001,0.725 ± 0.003,0.755,0.703


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.70272`

#### Remove any features with zero importances from LGBM model

In [62]:
# select only non zero importance features from 'LGBM_total'
feature_importance_LGBM_total = feature_importance(LGBM_total, 
                                                   plot = False)
select_columns = list(feature_importance_LGBM_total[feature_importance_LGBM_total['Number of Split'] > 0].index)
num_columns_3 = [col for col in select_columns if col in num_columns]
nom_columns_3 = [col for col in select_columns if col in nom_columns]
ord_columns_3 = [col for col in select_columns if col in ord_columns]
creation_columns_3 = [col for col in select_columns if col in creation_columns]
bureau_app_columns_3 = [col for col in select_columns if col in bureau_app_columns]
previous_app_columns_3 = [col for col in select_columns if col in previous_app_columns]

print('From {} total features... we have selected {} features'.format(len(all_columns), len(select_columns)))

From 691 total features... we have selected 400 features


In [63]:
%%time
# create model for LGBM: LGBM_importance
from lightgbm import LGBMClassifier

LGBM_features_importance = get_feature_pipeline(numerical = num_columns_3 + creation_columns_3 + bureau_app_columns_3 + previous_app_columns_3, 
                                                nominal = nom_columns_3, 
                                                ordinal = ord_columns_3, 
                                                algorithm = 'GBM')
LGBM_importance = Pipeline([('preprocessing_features', LGBM_features_importance),
                            ('LGBM', LGBMClassifier(n_jobs = -1, class_weight = 'balanced', 
                                                    objective = 'binary', reg_alpha = 0.1, reg_lambda = 0.1,
                                                    random_state = 42, subsample = 0.9))], verbose = True)
LGBM_importance.fit(app_train.drop('TARGET', axis = 1),
                    app_train[['TARGET']])
print('done...')

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=  10.0s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  23.0s
done...
Wall time: 33.6 s


In [64]:
%%time
# evaluate LGBM_importance
df_LGBM_importance, y_pred_proba_test = model_evaluation(model = LGBM_importance, 
                                                         tuning = False, 
                                                         train = app_train, 
                                                         test = app_test, 
                                                         name = 'LGBM_importance')
display(df_LGBM_importance)

[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.3s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  18.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   8.2s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  19.7s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.9s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  19.9s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   8.0s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  21.0s
[Pipeline]  (step 1 of 2) Processing preprocessing_features, total=   7.4s
[Pipeline] .............. (step 2 of 2) Processing LGBM, total=  18.6s


confusion matrix and classification report "app_train": 
[[207093  75593]
 [  6341  18484]]
              precision    recall  f1-score   support

      repaid       0.97      0.73      0.83    282686
  not repaid       0.20      0.74      0.31     24825

    accuracy

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
LGBM_importance,0.734,0.825 ± 0.0,0.777 ± 0.002,0.817


Wall time: 3min 42s


In [65]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/LGBM_importance.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.362727
1,100005,0.546602
2,100013,0.167219
3,100028,0.228348
4,100038,0.653252


In [66]:
df_LGBM_importance['ROC AUC test'] = round(0.77201, 3)
df_LGBM_importance

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_importance,0.734,0.825 ± 0.0,0.777 ± 0.002,0.817,0.772


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.77201`

# Model performances summary

From all the trained models using various of feature engineering techniques, the best performed model is `LGBM_total`.

`LGBM_total` is trained using
- all significant features from `app_train` (in `num_columns`, `nom_columns`, and `ord_columns`), 
- domain knowledge features (`creation_columns`), 
- bureau application dataset (`bureau_app_columns`), and 
- previous application dataset (`previous_app_columns`).

Hence we will use this model and perform hyperparameter tuning in the next notebook, `Part 3 - Hyperparameter Tuning and Interpretation`.

In [71]:
# sum up all the model performance into a single dataframe
df_model_performance_LogReg = pd.concat([df_LogReg_baseline, df_LogReg_poly, df_LogReg_domain, df_LogReg_bureau, df_LogReg_previous], axis = 0)
display(df_model_performance_LogReg)

df_model_performance_LGBM = pd.concat([df_LGBM_baseline, df_LGBM_poly, df_LGBM_domain, df_LGBM_bureau, df_LGBM_previous,
                                       df_LGBM_total, df_LGBM_missing50, df_LGBM_missing0, df_LGBM_importance], axis = 0)
display(df_model_performance_LGBM.style.highlight_max(color = '#F7DC6F').set_precision(3))

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LogReg_baseline,0.686,0.746 ± 0.001,0.743 ± 0.003,0.746,0.73
LogReg_poly,0.684,0.746 ± 0.001,0.744 ± 0.003,0.746,0.73
LogReg_domain,0.687,0.751 ± 0.001,0.748 ± 0.003,0.751,0.736
LogReg_bureau,0.688,0.75 ± 0.001,0.747 ± 0.002,0.75,0.736
LogReg_previous,0.701,0.765 ± 0.001,0.761 ± 0.003,0.766,0.752


Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
LGBM_baseline,0.712,0.797 ± 0.0,0.757 ± 0.004,0.79,0.74
LGBM_poly,0.708,0.797 ± 0.001,0.757 ± 0.003,0.79,0.741
LGBM_domain,0.717,0.803 ± 0.001,0.764 ± 0.003,0.798,0.756
LGBM_bureau,0.72,0.807 ± 0.001,0.763 ± 0.002,0.8,0.75
LGBM_previous,0.728,0.822 ± 0.001,0.775 ± 0.004,0.814,0.763
LGBM_total,0.736,0.829 ± 0.0,0.781 ± 0.003,0.821,0.777
LGBM_missing50,0.735,0.825 ± 0.0,0.778 ± 0.003,0.818,0.772
LGBM_missing0,0.682,0.761 ± 0.001,0.725 ± 0.003,0.755,0.703
LGBM_importance,0.734,0.825 ± 0.0,0.777 ± 0.002,0.817,0.772


# Save the best model

In [14]:
from sklearn.linear_model import LogisticRegression
def check_fitted(clf): 
    return hasattr(clf, "classes_")

no_trained_model_1 = LGBMClassifier()
no_trained_model_2 = LogisticRegression()
check_fitted(no_trained_model_2)

False

In [16]:
import pickle
# save the model to disk
filename = 'LGBM_total.sav'
pickle.dump(LGBM_total[1], open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model)
print(check_fitted(loaded_model))

In [19]:
import pickle
# save the model to disk
filename = 'LGBM_baseline.sav'
pickle.dump(LGBM_baseline[1], open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model)
print(check_fitted(loaded_model))

LGBMClassifier(class_weight='balanced', objective='binary', random_state=42,
               reg_alpha=0.1, reg_lambda=0.1, subsample=0.9)
True
