In [1]:
import datetime
import gc
import joblib
import json
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from imblearn.over_sampling import SMOTE
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [2]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
plt.rcParams['text.color'] = 'w'

In [3]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [4]:
%load_ext autoreload
%autoreload

### Read Data

In [5]:
%%time
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v4/final_train_agg.parquet")

Shape of data: (458913, 2906)
CPU times: user 10.7 s, sys: 10 s, total: 20.8 s
Wall time: 12.7 s


In [6]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")
target = labels["target"].values

Shape of data: (458913, 2)


In [7]:
%%time
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
gc.collect()

CPU times: user 298 ms, sys: 1.6 s, total: 1.9 s
Wall time: 2.77 s


36

In [8]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['B_binaries_last', 'D_92_first', 'D_114_third_last', 'R_binaries_second_last', 'B_binaries_second_last', 'S_binaries_first', 'D_64_last', 'D_116_last', 'D_68_third_last', 'B_38_second_last', 'S_binaries_third_last', 'D_116_third_last', 'D_92_second_last', 'D_116_second_last', 'D_117_last', 'B_30_third_last', 'B_38_third_last', 'D_117_second_last', 'D_114_second_last', 'S_binaries_last', 'R_binaries_last', 'B_38_first', 'R_binaries_third_last', 'B_38_last', 'D_126_third_last', 'D_68_last', 'D_68_second_last', 'D_114_last', 'D_120_last', 'D_120_third_last', 'D_64_second_last', 'D_116_first', 'R_binaries_first', 'D_68_first', 'D_117_first', 'B_binaries_third_last', 'D_63_third_last', 'D_92_third_last', 'D_64_third_last', 'S_binaries_second_last', 'D_92_last', 'D_117_third_last', 'B_binaries_first']


In [9]:
train_agg.shape, target.shape

((458913, 2905), (458913,))

### Train LGBM using pre-set hyperparams

In [10]:
seed = 1020

In [11]:
params = {
    'objective': 'binary',
    'metric': "binary_logloss",
    'boosting': 'dart',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 83,
    'learning_rate': 0.012,
    'feature_fraction': 0.19,
    'bagging_freq': 9,
    'bagging_fraction': 0.55,
    'n_jobs': -1,
    'lambda_l2': 15,
    'min_data_in_leaf': 100,
    'scale_pos_weight': 1.4,
    'max_bins': 255
}

In [12]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [13]:
n_est = [9000, 9000, 6500, 7500, 7000]

In [15]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    n_estimator = n_est[fold]
    if fold < 3:
        continue
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 300,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 3 with 2905 features...
--------------------------------------------------
Start Training fold 3
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 533589
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 2902
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258935 -> initscore=-1.051512
[LightGBM] [Info] Start training from score -1.051512
[500]	training's binary_logloss: 0.291784	training's amex: 0.775727	valid_1's binary_logloss: 0.295307	valid_1's amex: 0.765145
[1000]	training's binary_logloss: 0.24252	training's amex: 0.790861	valid_1's binary_logloss: 0.249404	valid_1's amex: 0.775686
[1500]	training's binary_logloss: 0.221334	training's amex: 0.80462	valid_1's binary_logloss: 0.232239	valid_1's amex: 0

In [15]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    n_estimator = n_est[fold]
    if fold == 0:
        continue
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 300,
        verbose_eval = 1000,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 1 with 2905 features...
--------------------------------------------------
Start Training fold 1
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905033
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 2904
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[1000]	training's binary_logloss: 0.24773	training's amex: 0.788517	valid_1's binary_logloss: 0.254957	valid_1's amex: 0.775052
[2000]	training's binary_logloss: 0.21364	training's amex: 0.814003	valid_1's binary_logloss: 0.228886	valid_1's amex: 0.785646
[3000]	training's binary_logloss: 0.199275	training's amex: 0.83222	valid_1's binary_logloss: 0.223007	valid_1's amex: 0

KeyboardInterrupt: 

In [24]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = 6000,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 300,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    # # Add to out of folds array
    # oof_predictions[val_ind] = val_pred
    # Predict the test set
    # test_pred = model.predict(test[features])
    # test_predictions += test_pred / CFG.n_folds
    # Compute fold metric
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 0 with 2273 features...
--------------------------------------------------
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 484494
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 2273
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.247401	training's amex: 0.784738	valid_1's binary_logloss: 0.256278	valid_1's amex: 0.771125
[1000]	training's binary_logloss: 0.219938	training's amex: 0.804609	valid_1's binary_logloss: 0.23433	valid_1's amex: 0.781003
[1500]	training's binary_logloss: 0.206257	training's amex: 0.821338	valid_1's binary_logloss: 0.2271	valid_1's amex: 0.788323
[2000]	training

Exception ignored in: <function Booster.__del__ at 0x14f9de170>
Traceback (most recent call last):
  File "/Users/wklee/miniconda3/envs/amex/lib/python3.10/site-packages/lightgbm/basic.py", line 2664, in __del__
    _safe_call(_LIB.LGBM_BoosterFree(self.handle))
KeyboardInterrupt: 

KeyboardInterrupt

