# Load data and library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
pd.options.display.max_columns = None
plt.rcParams.update(plt.rcParamsDefault)
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
import sys
import ast

from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.model_selection import cross_validate, RandomizedSearchCV

from lightgbm import LGBMClassifier

In [2]:
# helper function to get the feature preprocessing pipeline
def get_feature_pipeline(numerical, nominal, ordinal, algorithm):
    preprocess_numerical = FunctionTransformer(lambda x: x[numerical], validate = False)
    preprocess_nominal = FunctionTransformer(lambda x: x[nominal], validate = False)
    preprocess_ordinal = FunctionTransformer(lambda x: x[ordinal], validate = False)
    if algorithm == 'Logistic Regression':
        pl_numerical = Pipeline([('selector_numerical', preprocess_numerical),
                                 ('imputer', SimpleImputer(strategy = 'median')),
                                 ('scaler', MinMaxScaler())])                    
        pl_nominal = Pipeline([('selector_nominal', preprocess_nominal),
                               ('imputer', SimpleImputer(strategy = 'most_frequent')),
                               ('encoder', OneHotEncoder())])
        pl_ordinal = Pipeline([('selector_ordinal', preprocess_ordinal),
                               ('imputer', SimpleImputer(strategy = 'most_frequent')),
                               ('encoder', OrdinalEncoder())])
        feature_pipeline = FeatureUnion([('pipeline_numerical', pl_numerical),
                                         ('pipeline_nominal', pl_nominal),
                                         ('pipeline_ordinal', pl_ordinal)])
        return feature_pipeline
    
    elif algorithm == 'GBM':
        pl_numerical = Pipeline([('selector_numerical', preprocess_numerical)])
        pl_nominal = Pipeline([('selector_nominal', preprocess_nominal),
                               ('encoder', OneHotEncoder())])
        pl_ordinal = Pipeline([('selector_ordinal', preprocess_ordinal),
                               ('encoder', OrdinalEncoder())])
        feature_pipeline = FeatureUnion([('pipeline_numerical', pl_numerical),
                                         ('pipeline_nominal', pl_nominal),
                                         ('pipeline_ordinal', pl_ordinal)])
        return feature_pipeline
    else:
        print('algorithm argument is wrong. Try "Logistic Regression" or "GBM"!')
        return None

# helper function to evaluate the model performance
def model_evaluation(model, train, test, name):
    """ Parameter:
            model - trained model
            train - list of train feature and train target ---> [X_train, y_train]
            test - test feature ---> X_test
            name - trained model's name
    """
    X_train = train[0]; y_train = train[1]
    X_test = test
    
    # predict the model
    y_pred_train = model.predict(X_train)
    y_pred_proba_train = model.predict_proba(X_train)[:, 1]
    y_pred_proba_test = model.predict_proba(X_test)[:, 1]
    
    # training model performance
    accuracy_train = metrics.accuracy_score(y_train, y_pred_train)
    roc_auc_train = metrics.roc_auc_score(y_train, y_pred_proba_train)
    roc_auc_cv = cross_validate(estimator = model, X = X_train, y = y_train, cv = 5, scoring = 'roc_auc', return_train_score = True)
    dict_model_performance = {'Accuracy train':accuracy_train.round(3),
                              'ROC AUC 5-CV train':'{} ± {}'.format(roc_auc_cv['train_score'].mean().round(3), roc_auc_cv['train_score'].std().round(3)),
                              'ROC AUC 5-CV validate':'{} ± {}'.format(roc_auc_cv['test_score'].mean().round(3), roc_auc_cv['test_score'].std().round(3)), 
                              'ROC AUC train':roc_auc_train.round(3)}
    print('\n')
    print ('======== model evaluation metrics "{}" ========'.format(name))
    print('confusion matrix and classification report "app_train": \n{0}\n{1}'.format(metrics.confusion_matrix(y_train, y_pred_train), 
                                                                                      metrics.classification_report(y_train, y_pred_train, target_names = ['repaid', 'not repaid'])))
    df = pd.DataFrame([dict_model_performance])
    df.rename(index = {0:name}, inplace = True)
    return df, y_pred_proba_test

In [3]:
%%time
# load the final train and test dataset
app_train = pd.read_csv('final train and test dataset/app_train_final.csv')
app_test = pd.read_csv('final train and test dataset/app_test_final.csv')

# load the column names
column_names = pd.read_csv('final train and test dataset/column_names.csv')

num_columns = ast.literal_eval(column_names[column_names['variable'] == 'num_columns']['list'].tolist()[0])
cat_columns = ast.literal_eval(column_names[column_names['variable'] == 'cat_columns']['list'].tolist()[0])
nom_columns = ast.literal_eval(column_names[column_names['variable'] == 'nom_columns']['list'].tolist()[0])
ord_columns = ast.literal_eval(column_names[column_names['variable'] == 'ord_columns']['list'].tolist()[0])
poly_columns = ast.literal_eval(column_names[column_names['variable'] == 'poly_columns']['list'].tolist()[0])
creation_columns = ast.literal_eval(column_names[column_names['variable'] == 'creation_columns']['list'].tolist()[0])
bureau_app_columns = ast.literal_eval(column_names[column_names['variable'] == 'bureau_app_columns']['list'].tolist()[0])
previous_app_columns = ast.literal_eval(column_names[column_names['variable'] == 'previous_app_columns']['list'].tolist()[0])

Wall time: 29.4 s


In [4]:
print('app_train shape:', app_train.shape)
print('app_test shape:', app_test.shape)

app_train shape: (307511, 709)
app_test shape: (48744, 708)


In [5]:
app_train['CODE_GENDER'] = pd.Categorical(app_train['CODE_GENDER'], ordered = True, categories = ['F', 'M'])
app_test['CODE_GENDER'] = pd.Categorical(app_test['CODE_GENDER'], ordered = True, categories = ['F', 'M'])

app_train['FLAG_OWN_CAR'] = pd.Categorical(app_train['FLAG_OWN_CAR'], ordered = True, categories = ['N', 'Y'])
app_test['FLAG_OWN_CAR'] = pd.Categorical(app_test['FLAG_OWN_CAR'], ordered = True, categories = ['N', 'Y'])

app_train['FLAG_OWN_REALTY'] = pd.Categorical(app_train['FLAG_OWN_REALTY'], ordered = True, categories = ['N', 'Y'])
app_test['FLAG_OWN_REALTY'] = pd.Categorical(app_test['FLAG_OWN_REALTY'], ordered = True, categories = ['N', 'Y'])

app_train['YEARS_BIRTH_SEGMENT'] = pd.Categorical(app_train['YEARS_BIRTH_SEGMENT'], ordered = True)
app_test['YEARS_BIRTH_SEGMENT'] = pd.Categorical(app_test['YEARS_BIRTH_SEGMENT'], ordered = True)

# Hyperparameter tuning

The best model we see from previous notebook `Part 2 - Modeling` is LGBM model using features from `num_columns`, `nom_columns`, `ord_columns`, `creation_columns`, `bureau_app_columns`, and `previous_app_columns`. Therefore we will try to tune LGBM's hyperparameter model using those features in order to get the better performance.

In [6]:
%%time
# preprocessing the features

# set the feature pipeline
LGBM_features_total = get_feature_pipeline(numerical = num_columns + creation_columns + bureau_app_columns + previous_app_columns, 
                                           nominal = nom_columns, 
                                           ordinal = ord_columns, 
                                           algorithm = 'GBM')
# train data and test data
X_train = LGBM_features_total.fit_transform(app_train)
y_train = app_train[['TARGET']]
X_test = LGBM_features_total.transform(app_test)
display(X_train)
display(X_test)

# extract the features from preprocessing pipeline
num_after = LGBM_features_total.transformer_list[0][1].steps[0][1].transform(app_train).columns.tolist()
nom_after = LGBM_features_total.transformer_list[1][1].steps[1][1].get_feature_names_out().tolist()
ord_after = LGBM_features_total.transformer_list[2][1].steps[1][1].feature_names_in_.tolist()
all_columns_after = num_after + nom_after + ord_after
print('Total features after preprocessing in Pipeline:', len(all_columns_after))

<307511x793 sparse matrix of type '<class 'numpy.float64'>'
	with 132771518 stored elements in Compressed Sparse Row format>

<48744x793 sparse matrix of type '<class 'numpy.float64'>'
	with 20640285 stored elements in Compressed Sparse Row format>

Total features after preprocessing in Pipeline: 793
Wall time: 1min 21s


## Random search

In [7]:
# hyperparameter grid
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 50)),
    'max_depth': list(np.arange(1, 10, 1)),
    'reg_alpha': list(np.linspace(0, 1, 15)),
    'reg_lambda': list(np.linspace(0, 1, 15)),
    'colsample_bytree': list(np.linspace(0.5, 1, 15)),
    'subsample': list(np.linspace(0.5, 1, 15)),
    'n_estimators': list(np.arange(100, 751, 50))}

# set the classifier pipeline
model = LGBMClassifier(class_weight = 'balanced', 
                       objective = 'binary', random_state = 42, n_jobs = -1)

# instantiate RandomizedSearchCV object
random_LGBM_total = RandomizedSearchCV(model, param_grid, scoring = 'roc_auc', cv = 5, n_iter = 100, 
                                       n_jobs = -1, random_state = 42, verbose = 3)

In [8]:
%%time
random_LGBM_total.fit(X_train, y_train)
print('done...')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
done...
Wall time: 9h 45min 31s


In [9]:
tuned_LGBM_1 = random_LGBM_total.best_estimator_
tuned_LGBM_1

LGBMClassifier(boosting_type='goss', class_weight='balanced',
               learning_rate=0.020474575311902114, max_depth=9,
               n_estimators=750, objective='binary', random_state=42,
               reg_alpha=0.7142857142857142, reg_lambda=0.5714285714285714,
               subsample=0.5714285714285714)

In [10]:
import pickle
# save the model to disk
filename = 'LGBM_total - Object RandomizedSearachCV (tuned_LGBM_1).sav'
pickle.dump(random_LGBM_total, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model

RandomizedSearchCV(cv=5,
                   estimator=LGBMClassifier(class_weight='balanced',
                                            objective='binary',
                                            random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt', 'goss',
                                                          'dart'],
                                        'colsample_bytree': [0.5,
                                                             0.5357142857142857,
                                                             0.5714285714285714,
                                                             0.6071428571428571,
                                                             0.6428571428571428,
                                                             0.6785714285714286,
                                                             0.7142857142857143,
                                             

In [11]:
import pickle
# save the model to disk
filename = 'LGBM_total - Best Params (tuned_LGBM_1).sav'
pickle.dump(tuned_LGBM_1, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model

LGBMClassifier(boosting_type='goss', class_weight='balanced',
               learning_rate=0.020474575311902114, max_depth=9,
               n_estimators=750, objective='binary', random_state=42,
               reg_alpha=0.7142857142857142, reg_lambda=0.5714285714285714,
               subsample=0.5714285714285714)

In [12]:
%%time
# evaluate tuned_LGBM_1
df_tuned_LGBM_1, y_pred_proba_test = model_evaluation(model = tuned_LGBM_1, 
                                                      train = [X_train, y_train], 
                                                      test = X_test, 
                                                      name = 'tuned_LGBM_1')
display(df_tuned_LGBM_1)



confusion matrix and classification report "app_train": 
[[212535  70151]
 [  5755  19070]]
              precision    recall  f1-score   support

      repaid       0.97      0.75      0.85    282686
  not repaid       0.21      0.77      0.33     24825

    accuracy                           0.75    307511
   macro avg       0.59      0.76      0.59    307511
weighted avg       0.91      0.75      0.81    307511



Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train
tuned_LGBM_1,0.753,0.852 ± 0.0,0.786 ± 0.002,0.841


Wall time: 13min 44s


In [13]:
# create submission dataframe and save the submission to a csv file
df_submit = app_test[['SK_ID_CURR']].copy()
df_submit['TARGET'] = y_pred_proba_test
df_submit.to_csv('submission/tuned_LGBM_1.csv', index = False)
df_submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.283439
1,100005,0.610327
2,100013,0.175288
3,100028,0.22277
4,100038,0.665523


In [27]:
df_tuned_LGBM_1['ROC AUC test'] = round(0.78048, 3)
df_tuned_LGBM_1

Unnamed: 0,Accuracy train,ROC AUC 5-CV train,ROC AUC 5-CV validate,ROC AUC train,ROC AUC test
tuned_LGBM_1,0.753,0.852 ± 0.0,0.786 ± 0.002,0.841,0.78


- The `ROC AUC test` score is obtained after submit the submission file to [Kaggle](https://www.kaggle.com/competitions/home-credit-default-risk/overview) that is `0.78048`.

In [28]:
random_search_resulst = pd.DataFrame(random_LGBM_total.cv_results_)
random_search_resulst['mean_fit_time (minutes)'] = random_search_resulst['mean_fit_time']/60
random_search_resulst.to_csv('LGBM_total - RandomizedSearchCV (tuned_LGBM_1).csv')
random_search_resulst

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_reg_lambda,param_reg_alpha,param_n_estimators,param_max_depth,param_learning_rate,param_colsample_bytree,param_boosting_type,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time (minutes)
0,744.148437,7.330370,19.053357,7.037863,0.785714,0.285714,0.642857,450,8,0.009653,0.857143,goss,"{'subsample': 0.7857142857142857, 'reg_lambda'...",0.773538,0.773068,0.766720,0.775092,0.775634,0.772810,0.003190,43,12.402474
1,357.579878,90.849441,4.104028,1.028200,0.678571,0.071429,0.857143,550,1,0.032756,0.821429,gbdt,"{'subsample': 0.6785714285714286, 'reg_lambda'...",0.757020,0.754884,0.750114,0.757907,0.758295,0.755644,0.003007,64,5.959665
2,543.128824,14.856230,6.517208,2.275551,0.642857,0.0,0.642857,550,2,0.063243,0.857143,dart,"{'subsample': 0.6428571428571428, 'reg_lambda'...",0.757976,0.756721,0.751446,0.759887,0.760124,0.757231,0.003153,61,9.052147
3,1698.702279,41.183955,15.092955,3.990965,0.642857,0.642857,1.0,750,6,0.006034,0.75,dart,"{'subsample': 0.6428571428571428, 'reg_lambda'...",0.746761,0.748014,0.741231,0.752003,0.750606,0.747723,0.003736,75,28.311705
4,462.684289,14.108441,7.816407,0.669004,0.785714,0.857143,0.214286,700,8,0.5,0.678571,gbdt,"{'subsample': 0.7857142857142857, 'reg_lambda'...",0.706534,0.714582,0.708144,0.708753,0.717848,0.711172,0.004306,96,7.711405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,640.211760,11.580540,6.536031,0.506789,0.535714,0.357143,0.714286,350,5,0.024709,0.857143,dart,"{'subsample': 0.5357142857142857, 'reg_lambda'...",0.753752,0.754013,0.747058,0.757978,0.756928,0.753946,0.003812,65,10.670196
96,512.287591,6.786105,6.915830,2.320722,0.857143,0.642857,0.071429,250,9,0.016966,0.785714,dart,"{'subsample': 0.8571428571428571, 'reg_lambda'...",0.747115,0.748587,0.740854,0.751561,0.750553,0.747734,0.003769,74,8.538127
97,851.689922,22.787378,20.386395,2.408458,0.571429,0.571429,0.714286,750,9,0.020475,1.0,goss,"{'subsample': 0.5714285714285714, 'reg_lambda'...",0.786194,0.787451,0.780053,0.787047,0.788140,0.785777,0.002930,1,14.194832
98,525.575991,70.886081,7.945759,2.497636,0.607143,0.142857,0.857143,500,7,0.015444,1.0,goss,"{'subsample': 0.6071428571428571, 'reg_lambda'...",0.781007,0.780949,0.774990,0.781768,0.783310,0.780405,0.002838,18,8.759600


In [29]:
print('Total times after Random Search 5-CV with 20 candidates params grid: {} minutes'.format(random_search_resulst['mean_fit_time (minutes)'].sum()))

Total times after Random Search 5-CV with 20 candidates params grid: 909.6163997324309 minutes
