In [1]:
import datetime
import gc
import joblib
import json
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from imblearn.over_sampling import SMOTE
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [2]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
plt.rcParams['text.color'] = 'w'

In [3]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [4]:
%load_ext autoreload
%autoreload

### Read Data

In [5]:
%%time
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v5/final_train_agg_v5.5.parquet")

Shape of data: (458913, 4015)
CPU times: user 13.7 s, sys: 12.9 s, total: 26.6 s
Wall time: 14.7 s


In [6]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")
target = labels["target"].values

Shape of data: (458913, 2)


In [7]:
%%time
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
gc.collect()

CPU times: user 381 ms, sys: 2.16 s, total: 2.54 s
Wall time: 3.98 s


36

In [8]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_68_third_last', 'D_120_first', 'D_117_first', 'D_92_first', 'D_68_second_last', 'D_63_second_last', 'D_116_second_last', 'D_63_first', 'B_30_last', 'D_126_first', 'D_120_last', 'D_64_second_last', 'B_38_third_last', 'D_117_second_last', 'D_68_first', 'D_64_last', 'B_38_last', 'D_64_first', 'D_92_third_last', 'D_120_third_last', 'B_38_first', 'D_114_first', 'D_63_last', 'D_114_second_last', 'B_30_third_last', 'D_68_last', 'D_117_third_last', 'D_92_last', 'D_120_second_last', 'D_116_third_last', 'D_64_third_last', 'D_116_last', 'D_117_last', 'D_116_first', 'B_38_second_last', 'D_126_third_last', 'D_63_third_last', 'D_114_third_last', 'D_114_last', 'D_92_second_last']


In [9]:
train_agg.shape, target.shape

((458913, 4014), (458913,))

### Train LGBM using pre-set hyperparams

In [10]:
seed = 9

In [11]:
params = {
    'objective': 'binary',
    'metric': "binary_logloss",
    'boosting': 'dart',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 85,
    'learning_rate': 0.011,
    'feature_fraction': 0.195,
    'bagging_freq': 8,
    'bagging_fraction': 0.55,
    'n_jobs': -1,
    'lambda_l2': 15,
    'min_data_in_leaf': 75,
    'scale_pos_weight': 1.28,
    'max_bins': 255
}

In [12]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [14]:
# n_est = [9000, 9000, 6500, 7500, 7000]
n_est = [8500] * 5

In [15]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    if fold <= 2:
        continue
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 300,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 3 with 4014 features...
--------------------------------------------------
Start Training fold 3
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 513689
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 4000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258935 -> initscore=-1.051512
[LightGBM] [Info] Start training from score -1.051512
[500]	training's binary_logloss: 0.303624	training's amex: 0.773844	valid_1's binary_logloss: 0.30703	valid_1's amex: 0.765619
[1000]	training's binary_logloss: 0.249238	training's amex: 0.788008	valid_1's binary_logloss: 0.255561	valid_1's amex: 0.776167
[1500]	training's binary_logloss: 0.224394	training's amex: 0.801604	valid_1's binary_logloss: 0.234497	valid_1's amex: 

In [15]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 300,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 0 with 4014 features...
--------------------------------------------------
Start Training fold 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 513544
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 4001
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.303691	training's amex: 0.774226	valid_1's binary_logloss: 0.30712	valid_1's amex: 0.766694
[1000]	training's binary_logloss: 0.249276	training's amex: 0.78819	valid_1's binary_logloss: 0.255459	valid_1's amex: 0.776585
[1500]	training's binary_logloss: 0.22447	training's amex: 0.800549	valid_1's binary_logloss: 0.234297	valid_1's amex: 0.

KeyboardInterrupt: 