In [4]:
# ##import
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb

from IPython.display import display
from bayes_opt import BayesianOptimization



from sklearn import metrics
from sklearn.model_selection import train_test_split


pd.set_option('display.max_columns', 200)

In [1]:
###load data

In [5]:
# Load Data with selected features
X = pd.read_csv('./data/lgb_selected_default.csv')
# Targets
y = X['loan_status']

# Remove Labels from Dataframe
X.drop(['loan_status'], axis = 1, inplace = True)
# Final Data Shapes
print(X.shape)
print(y.shape)

(2260668, 41)
(2260668,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 2020)

## Bayesian Optimization

In [10]:
bounds = {
    'learning_rate': (0.002, 0.2),
    'num_leaves': (50, 500), 
    'bagging_fraction' : (0.1, 1),
    'feature_fraction' : (0.1, 1),
    'min_child_weight': (0.001, 0.5),   
    'min_data_in_leaf': (20, 170),
    'max_depth': (15, 20),
    'reg_alpha': (0.1, 2), 
    'reg_lambda': (0.1, 2)
}

In [11]:
def train_model(learning_rate, 
                num_leaves,
                bagging_fraction, 
                feature_fraction, 
                min_child_weight,
                min_data_in_leaf,
                max_depth,
                reg_alpha,
                reg_lambda):
    
    params = {'learning_rate': learning_rate,
              'num_leaves': int(num_leaves), 
              'bagging_fraction' : bagging_fraction,
              'feature_fraction' : feature_fraction,
              'min_child_weight': min_child_weight,   
              'min_data_in_leaf': int(min_data_in_leaf),
              'max_depth': int(max_depth),
              'reg_alpha': reg_alpha, 
              'reg_lambda': reg_lambda,
              'objective': 'binary',
              'boosting_type': 'gbdt',
              'random_state': 2021,
              'verbosity': -1,
              'metric': 'auc'}
    
    trn_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_test, y_test)
    model = lgb.train(params, trn_data, 5000, valid_sets = [trn_data, val_data], verbose_eval=0, early_stopping_rounds=50)
    # Returning auc score
    return model.best_score['valid_1']['auc']

In [12]:
optimizer = BayesianOptimization(f=train_model, pbounds=bounds, random_state=2021)


optimizer.maximize(init_points=10, n_iter=10)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_da... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9968  [0m | [0m 0.6454  [0m | [0m 0.76    [0m | [0m 0.02951 [0m | [0m 16.56   [0m | [0m 0.4986  [0m | [0m 39.22   [0m | [0m 130.5   [0m | [0m 1.531   [0m | [0m 1.358   [0m |
| [0m 2       [0m | [0m 0.9929  [0m | [0m 0.8059  [0m | [0m 0.1872  [0m | [0m 0.0136  [0m | [0m 19.81   [0m | [0m 0.3087  [0m | [0m 32.99   [0m | [0m 302.6   [0m | [0m 1.271   [0m | [0m 1.931   [0m |
| [0m 3       [0m | [0m 0.9967  [0m | [0m 0.6169  [0m | [0m 0.434   [0m | [0m 0.09152 [0m | [0m 16.01   [0m | [0m 0.2851  [0m | [0m 49.26   [0m | [0m 312.7   [0m | [0m 1.005   [0m | [0m 1.084   [0m |
| [95m 4       [0m | [95m 0.9968  [0m | [95m 0.8408  [0m | 

In [13]:
optimizer.max['params']

{'bagging_fraction': 0.1352596638277066,
 'feature_fraction': 0.7469934533565048,
 'learning_rate': 0.01084443459402704,
 'max_depth': 19.635409113512498,
 'min_child_weight': 0.30275249989439823,
 'min_data_in_leaf': 159.1573696978958,
 'num_leaves': 130.56800786083414,
 'reg_alpha': 0.7309683176011929,
 'reg_lambda': 1.3332030058536595}

In [14]:
### Use optimized parameters as parameter for model
lgb_params = {'num_leaves': int(optimizer.max['params']['num_leaves']),
              'bagging_fraction' : optimizer.max['params']['bagging_fraction'],
              'feature_fraction' : optimizer.max['params']['feature_fraction'],
              'learning_rate' :  optimizer.max['params']['learning_rate'],
              'max_depth' : int(optimizer.max['params']['max_depth']),
              'min_child_weight' :  optimizer.max['params']['min_child_weight'],
              'min_data_in_leaf' : int(optimizer.max['params']['min_data_in_leaf']),
              'num_leaves' : int(optimizer.max['params']['num_leaves']),
              'reg_alpha': optimizer.max['params']['reg_alpha'],
              'reg_lambda' : optimizer.max['params']['reg_lambda'],
            'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 10,
        'metric': 'auc'
              
    
}

## Make Real Prediction (Params from Bayesian Optimization)

In [15]:
df = pd.read_pickle('../input/lending-club-feature-selection/lgb_selected.pkl')

train , test = train_test_split(df,test_size = 0.2, random_state = 2020)
y_train = train.pop('loan_status')
y_test = test.pop('loan_status')
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

remove_features = ['id','loan_status']
features = [col for col in list(train) if col not in remove_features]

In [None]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 42
seed_everything(SEED)

In [16]:
def make_lgb_prediction(train, y, test, features, categorical_features='auto', model_params=None, folds=5):
    def lgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) 
        return 'f1', f1_score(y_true, y_hat, average='binary'), True
    
    skf = KFold(n_splits=folds, random_state=SEED, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = lgb.Dataset(x_tr, label=y_tr)
        dvalid = lgb.Dataset(x_val, label=y_val)

        clf = lgb.train(
            model_params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            categorical_feature=categorical_features,
            verbose_eval=200,
            early_stopping_rounds=100,
            feval=lgb_f1_score
        )

        feature_importance[f'fold_{fold+1}'] = clf.feature_importance()

        y_pred_val = clf.predict(x_val)

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | F1 Score: {f1_score(y_val, np.round(y_pred_val), average='micro')}")

        score += f1_score(y_val, np.round(y_pred_val), average='micro') / folds
        y_preds += clf.predict(x_test) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y, np.round(y_oof), average='micro')}")
    
    return y_oof, y_preds, feature_importance

In [17]:
y_oof_lgb, y_preds_lgb, fi_lgb = make_lgb_prediction(train, y_train, test, features, model_params=lgb_params)

Fold: 1
(1446827, 40) (361707, 40)
[LightGBM] [Info] Number of positive: 189108, number of negative: 1257719
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.915726
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.228427
[LightGBM] [Debug] init for col-wise cost 0.095866 seconds, init for row-wise cost 0.815909 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6678
[LightGBM] [Info] Number of data points in the train set: 1446827, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.130705 -> initscore=-1.894737
[LightGBM] [Info] Start training from score -1.894737
[LightGBM] [Debug] Trained a tree with leaves = 130 and max_depth = 14
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 130 and max_depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 130 and max_depth = 17
[LightGBM] [Debug] Trained a tree with