In [6]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25ldone
[?25h  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=9ce4dcb9d96e21662e6bbcc262a95298c60a107b182f3f49434400ef9c2627ad
  Stored in directory: /home/notebook/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 2.3 MB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.11.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb

from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import display
from bayes_opt import BayesianOptimization



from sklearn import metrics
from sklearn.model_selection import train_test_split


pd.set_option('display.max_columns', 200)

# pd.options.display.max_rows = 10000
# pd.options.display.max_columns = 1000
# pd.options.display.max_colwidth = 1000

In [8]:
def seed_everything(seed=0):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    writes the seed into the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


SEED = 42
seed_everything(seed=SEED)

In [10]:
### Load Data
# NOTE(review): absolute, machine-specific path; pickle files should only be
# loaded from a trusted source.
DATA_PATH = '/home/work/toy-project/data/df_preprocessesd.pkl'
df = pd.read_pickle(DATA_PATH)


In [11]:
# Parse the issue date and derive the issue year from it.
df['issue_d'] = pd.to_datetime(df['issue_d'])
df['issue_year'] = df['issue_d'].dt.year

# Collapse the years 2007-2012 into a single bucket labelled 201200.
early_years = df['issue_year'].isin(list(range(2007, 2013)))
df.loc[early_years, 'issue_year'] = 201200


In [12]:
# Rows issued in 2018 form the hold-out test set; all other rows are training data.
is_holdout = df['issue_year'] == 2018
train = df[~is_holdout]
test = df[is_holdout]

In [13]:
# Keep the hold-out ids together with their true labels for the final scoring step.
answer = test.loc[:, ['id', 'loan_status']]

In [14]:
# Split the training frame into labels (y) and everything else (X).
y = train['loan_status']
X = train.drop(columns=['loan_status'])

# Report the resulting shapes.
for part in (X, y):
    print(part.shape)

(1314290, 104)
(1314290,)


In [15]:
# Within the training data, hold out the 2017 rows as a validation split
# for hyper-parameter tuning.
is_2017 = train['issue_year'] == 2017
ttrain = train[~is_2017]
ttest = train[is_2017]

In [16]:
# Tuning split: separate the label column from the remaining columns.
y_train = ttrain['loan_status']
X_train = ttrain.drop(columns=['loan_status'])

y_test = ttest['loan_status']
X_test = ttest.drop(columns=['loan_status'])

In [17]:
# Columns that must not be fed to the model (includes the id, the label and
# the year used for splitting).
remove_features = [
    'earliest_cr_line', 'issue_d', 'last_credit_pull_d', 'last_pymnt_d', 'next_pymnt_d',
    'initial_list_status',
    'out_prncp', 'out_prncp_inv',
    'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee',
    'last_pymnt_amnt', 'last_fico_range_high', 'last_fico_range_low',
    'id', 'loan_status', 'issue_year',
]

# Every remaining column of `train`, in its original order.
features = [col for col in train.columns if col not in remove_features]

In [18]:
# Restrict both tuning frames to the model features.
X_train, X_test = X_train[features], X_test[features]

## Bayesian Optimization

In [19]:
# Search space for the Bayesian optimizer: (lower, upper) per hyper-parameter.
bounds = dict(
    num_leaves=(50, 70),
    subsample=(0.5, 1),
    colsample_bytree=(0.5, 1),
    max_bin=(49, 69),
)

In [20]:
def train_model(num_leaves, colsample_bytree, subsample, max_bin):
    """Objective function for the Bayesian search.

    Fits one LightGBM model on the module-level ``X_train`` / ``y_train`` with
    the given hyper-parameters and scores it on ``X_test`` / ``y_test``.

    Parameters
    ----------
    num_leaves, max_bin : float
        Truncated to ``int`` before being passed to LightGBM.
    colsample_bytree, subsample : float
        Passed to LightGBM unchanged.

    Returns
    -------
    float
        Best AUC reached on the validation set (``valid_1``). The optimizer
        maximizes this value, so the AUC is returned as-is.
    """
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': -1,
        'learning_rate': 0.05,
        'num_leaves': int(num_leaves),
        'max_depth': -1,
        'tree_learner': 'serial',
        'colsample_bytree': colsample_bytree,
        'subsample_freq': 1,
        'subsample': subsample,
        'max_bin': int(max_bin),
        'verbose': -1,
        'seed': SEED,
    }

    trn_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_test, y_test)

    # The round limit and early-stopping patience are each given exactly once.
    # Previously `n_estimators: 10000` sat in params next to a positional 5000;
    # the params alias takes precedence, so 10000 was always the effective cap
    # and is kept here.
    # NOTE: the `verbose_eval` / `early_stopping_rounds` keyword arguments were
    # removed in LightGBM 4.0; on 4.x use callbacks instead.
    model = lgb.train(
        params,
        trn_data,
        num_boost_round=10000,
        valid_sets=[trn_data, val_data],
        verbose_eval=0,
        early_stopping_rounds=50,
    )
    return model.best_score['valid_1']['auc']

In [None]:
# Search `bounds` for the hyper-parameters that maximize train_model's return value.
optimizer = BayesianOptimization(
    f=train_model,
    pbounds=bounds,
    random_state=2021,
)

# 10 random starting points followed by 10 guided iterations.
optimizer.maximize(init_points=10, n_iter=10)

|   iter    |  target   | colsam... |  max_bin  | num_le... | subsample |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7508  [0m | [0m 0.803   [0m | [0m 63.67   [0m | [0m 52.78   [0m | [0m 0.6563  [0m |
| [0m 2       [0m | [0m 0.7503  [0m | [0m 0.9986  [0m | [0m 51.56   [0m | [0m 53.58   [0m | [0m 0.8765  [0m |
| [0m 3       [0m | [0m 0.75    [0m | [0m 0.8311  [0m | [0m 64.69   [0m | [0m 51.94   [0m | [0m 0.5293  [0m |
| [0m 4       [0m | [0m 0.7506  [0m | [0m 0.9812  [0m | [0m 61.33   [0m | [0m 51.73   [0m | [0m 0.7806  [0m |
| [0m 5       [0m | [0m 0.7506  [0m | [0m 0.8083  [0m | [0m 68.28   [0m | [0m 61.49   [0m | [0m 0.6856  [0m |
| [0m 6       [0m | [0m 0.7504  [0m | [0m 0.7261  [0m | [0m 53.04   [0m | [0m 61.39   [0m | [0m 0.5975  [0m |
| [0m 7       [0m | [0m 0.75    [0m | [0m 0.7919  [0m | [0m 58.53   [0m | [0m 60.36   [0m | [0m 0.9115  [0m 

In [None]:
# Show the hyper-parameter values of the best-scoring run found by the optimizer.
optimizer.max['params']

In [None]:
# Final LightGBM parameters: fixed settings plus the tuned values from the optimizer.
# NOTE(review): early stopping patience is 100 here but was 50 during tuning - confirm intended.
best_params = optimizer.max['params']

lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.05,
    'num_leaves': int(best_params['num_leaves']),
    'max_depth': -1,
    'tree_learner': 'serial',
    'colsample_bytree': best_params['colsample_bytree'],
    'subsample_freq': 1,
    'subsample': best_params['subsample'],
    'n_estimators': 10000,
    'max_bin': int(best_params['max_bin']),
    'verbose': -1,
    'seed': SEED,
    'early_stopping_rounds': 100,
}

In [None]:
# Move the label column out of both frames and renumber rows 0..n-1 so that
# features and labels stay aligned by position.
# Note: this cell is not re-runnable - a second run fails because the label
# column has already been popped.
y_train = train.pop('loan_status').reset_index(drop=True)
y_test = test.pop('loan_status').reset_index(drop=True)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Columns excluded from the final model (same list as used for tuning).
remove_features = [
    'earliest_cr_line',
    'issue_d',
    'last_credit_pull_d',
    'last_pymnt_d',
    'next_pymnt_d',
    'initial_list_status',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_amnt',
    'last_fico_range_high',
    'last_fico_range_low',
    'id',
    'loan_status',
    'issue_year',
]

# Keep the columns of `train` that are not excluded, preserving column order.
features = list(filter(lambda col: col not in remove_features, train))

In [None]:
def make_lgb_prediction(train, y, test, features, categorical_features='auto', model_params=None, folds=5):
    """Train LightGBM with shuffled K-fold CV and predict on ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain every column named in ``features``.
    y : array-like
        Labels, aligned by position with the rows of ``train``.
    test : pandas.DataFrame
        Rows to predict; must contain every column named in ``features``.
    features : list of str
        Columns used as model inputs.
    categorical_features : list or 'auto'
        Forwarded to ``lgb.train(categorical_feature=...)``.
    model_params : dict
        LightGBM parameters. Required despite the ``None`` default.
    folds : int
        Number of K-fold splits.

    Returns
    -------
    y_oof : numpy.ndarray
        Out-of-fold prediction for every training row.
    y_preds : numpy.ndarray
        Test predictions averaged over the folds.
    feature_importance : pandas.DataFrame
        Columns ``Feature``, ``importance`` and ``fold``; one row per
        feature per fold.

    Raises
    ------
    ValueError
        If ``model_params`` is None.
    """
    if model_params is None:
        raise ValueError("model_params must be a dict of LightGBM parameters")

    # `folds` is honoured as passed; it is also the divisor for the averages below.
    kfold = KFold(n_splits=folds, random_state=SEED, shuffle=True)
    x_train = train[features]
    x_test = test[features]
    # Positional array, so fold indices are valid whatever index `y` carries.
    y_arr = np.asarray(y)

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0
    fold_importances = []

    for fold, (tr_idx, val_idx) in enumerate(kfold.split(x_train, y_arr)):
        print(f'Fold: {fold+1}')

        # KFold yields positional indices, hence iloc rather than loc.
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_arr[tr_idx], y_arr[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            categorical_feature=categorical_features,
            verbose_eval=200,
            early_stopping_rounds=100,
        )

        fold_importances.append(pd.DataFrame({
            "Feature": features,
            "importance": clf.feature_importance(),
            "fold": fold + 1,
        }))

        y_pred_val = clf.predict(x_val)
        y_oof[val_idx] = y_pred_val

        fold_auc = roc_auc_score(y_val, y_pred_val)
        print(f"Fold {fold + 1} | AUC Score: {fold_auc}")

        score += fold_auc / folds
        y_preds += clf.predict(x_test) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    # Concatenate once instead of growing a frame inside the loop.
    feature_importance = pd.concat(fold_importances, axis=0)

    print(f"\nMean AUC score = {score}")
    print(f"OOF AUC score = {roc_auc_score(y_arr, y_oof)}")

    return y_oof, y_preds, feature_importance

In [None]:
# Cross-validated training on `train` with the tuned parameters; also produces
# fold-averaged predictions for the 2018 hold-out in `test`.
y_oof_lgb, y_preds_lgb, fi_lgb = make_lgb_prediction(train, y_train, test, features, model_params=lgb_params)

In [None]:
# Score the fold-averaged hold-out predictions against the labels saved in `answer`.
print(f"TEST AUC score = {roc_auc_score(answer['loan_status'], y_preds_lgb)}")

## Feature Importance

In [None]:
# Rank features by their mean importance across folds and keep at most the top 150.
mean_importance = fi_lgb[["Feature", "importance"]].groupby("Feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False).head(150).index
best_features = fi_lgb.loc[fi_lgb.Feature.isin(cols)]

# One bar per feature, drawn from the per-fold importances, most important first.
fig, ax = plt.subplots(figsize=(14, 28))
sns.barplot(
    x="importance",
    y="Feature",
    data=best_features.sort_values(by="importance", ascending=False),
    ax=ax,
)
ax.set_title('Features importance (averaged/folds)')
fig.tight_layout()
fig.savefig('FI.png')