In [40]:
import datetime
import gc
import joblib
import json
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import Counter
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [19]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
plt.rcParams['text.color'] = 'w'

In [20]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric,
    TreeExperiment
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [21]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read Data

In [5]:
%%time
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v6/train_agg.parquet")

Shape of data: (458913, 5064)
CPU times: user 17.5 s, sys: 19 s, total: 36.5 s
Wall time: 26.1 s


In [6]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")
target = labels["target"].values

Shape of data: (458913, 2)


In [7]:
%%time
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
gc.collect()

CPU times: user 444 ms, sys: 2.33 s, total: 2.78 s
Wall time: 3.9 s


0

In [8]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_68_last', 'D_116_last', 'D_64_last', 'D_120_last', 'B_38_first', 'D_114_third_last', 'D_116_second_last', 'D_63_third_last', 'D_92_second_last', 'D_114_first', 'D_68_second_last', 'D_120_first', 'B_38_last', 'D_117_last', 'D_126_last', 'D_120_third_last', 'D_63_first', 'D_116_first', 'D_92_first', 'D_117_first', 'D_126_second_last', 'B_30_first', 'D_63_second_last', 'D_120_second_last', 'B_30_second_last', 'D_64_second_last', 'B_38_third_last', 'B_30_last', 'D_63_last', 'B_38_second_last', 'D_116_third_last', 'D_126_first', 'D_117_third_last', 'B_30_third_last', 'D_92_third_last', 'D_114_second_last', 'D_114_last', 'D_68_third_last', 'D_92_last', 'D_64_first', 'D_117_second_last', 'D_126_third_last', 'D_64_third_last', 'D_68_first']


In [9]:
train_agg.shape, target.shape

((458913, 5062), (458913,))

In [10]:
seed = 7788

### Use short experiment to determine suitable features

In [11]:
params = {
    'objective': 'binary',
    'metric': "binary_logloss",
    'boosting': 'gbdt',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 80,
    'learning_rate': 0.02,
    'feature_fraction': 0.20,
    'bagging_freq': 10,
    'bagging_fraction': 0.5,
    'n_jobs': -1,
    'lambda_l2': 10,
    'min_data_in_leaf': 100,
    'scale_pos_weight': 1.3,
    'max_bins': 127
}

In [12]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [13]:
n_est = [1800] * 5

In [14]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 200,
        verbose_eval = 100,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./gbdt_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 0 with 5062 features...
--------------------------------------------------
Start Training fold 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 350388
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 5022
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.256362	training's amex: 0.775292	valid_1's binary_logloss: 0.262517	valid_1's amex: 0.762277
[200]	training's binary_logloss: 0.222455	training's amex: 0.794466	valid_1's binary_logloss: 0.233136	valid_1's amex: 0.775488
[300]	training's binary_logloss: 0.210922	training's amex:

### Feature selection

In [24]:
%%time
lgbm_gbdt = TreeExperiment(
    exp_full_path="../../experiments/11.lgbm_dart_round_clip_7788",
    seed=7788, 
    model_path="gbdt_models"
)

CPU times: user 432 ms, sys: 204 ms, total: 636 ms
Wall time: 128 ms


In [26]:
fi = lgbm_gbdt.feature_imp_df

In [71]:
master = []
for i in range(5):
    master.extend(fi.nsmallest(1700, f"importance{i}")["feature"].tolist())

In [72]:
fi_dict = dict(Counter(master))

In [75]:
col_to_drop = [k for k, v in fi_dict.items() if v >= 5]

In [76]:
len(col_to_drop)

1079

In [74]:
train_agg.shape

(458913, 5062)

In [77]:
%%time
train_agg = train_agg.drop(columns=col_to_drop)

CPU times: user 403 ms, sys: 3.14 s, total: 3.54 s
Wall time: 8.97 s


In [80]:
train_agg.shape

(458913, 3983)

In [85]:
len(cat_features)

44

In [89]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_68_last', 'D_64_last', 'D_120_last', 'B_38_first', 'D_114_third_last', 'D_63_third_last', 'D_114_first', 'D_68_second_last', 'D_120_first', 'B_38_last', 'D_117_last', 'D_126_last', 'D_120_third_last', 'D_63_first', 'D_92_first', 'D_117_first', 'D_126_second_last', 'D_63_second_last', 'D_120_second_last', 'B_30_second_last', 'D_64_second_last', 'B_38_third_last', 'B_30_last', 'D_63_last', 'B_38_second_last', 'D_126_first', 'D_117_third_last', 'B_30_third_last', 'D_114_second_last', 'D_114_last', 'D_68_third_last', 'D_64_first', 'D_117_second_last', 'D_126_third_last', 'D_64_third_last', 'D_68_first']


In [90]:
len(cat_features)

36

### Train LGBM using pre-set hyperparams

In [120]:
params = {
    'objective': 'binary',
    'first_metric_only': True,
    'metric': "binary_logloss",
    'boosting': 'dart',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 93,
    'learning_rate': 0.0115,
    'feature_fraction': 0.195,
    'bagging_freq': 10,
    'bagging_fraction': 0.5,
    'n_jobs': -1,
    'lambda_l2': 5,
    'min_data_in_leaf': 110,
    'scale_pos_weight': 1.3,
    'max_bins': 255,
    'feature_fraction_bynode': 0.9,
    'drop_rate': 0.075,
    'skip_drop': 0.6
}

In [121]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [122]:
n_est = [6000, 6500, 5500, 6000, 5500]
# n_est = [9500] * 5

In [123]:
gc.collect()

2431

In [None]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    if fold < 4:
        continue
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./dart_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 4 with 3983 features...
--------------------------------------------------
Start Training fold 4
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 553010
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 3983
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258935 -> initscore=-1.051512
[LightGBM] [Info] Start training from score -1.051512
[500]	training's binary_logloss: 0.262141	training's amex: 0.786064	valid_1's binary_logloss: 0.26857	valid_1's amex: 0.769614


In [96]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    if fold == 0:
        continue
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 0 with 3983 features...
--------------------------------------------------
Start Training fold 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 553126
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 3983
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.38076	training's amex: 0.769573	valid_1's binary_logloss: 0.384457	valid_1's amex: 0.756352
[1000]	training's binary_logloss: 0.278647	training's amex: 0.783586	valid_1's binary_logloss: 0.286052	valid_1's amex: 0.766755
[1500]	training's binary_logloss: 0.242993	training's amex: 0.797213	valid_1's binary_logloss: 0.254021	valid_1's amex: 

FileNotFoundError: [Errno 2] No such file or directory: './models/model_fold0_seed7788.pkl'

In [None]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 300,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()