In [1]:
import datetime
import gc
import joblib
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
import optuna
import os
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import Counter
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric,
    TreeExperiment
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [3]:
%load_ext autoreload
%autoreload

### Read Data

In [4]:
%%time
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v6/train_agg.parquet")

Shape of data: (458913, 5064)
CPU times: user 17.4 s, sys: 22.5 s, total: 39.9 s
Wall time: 26.7 s


In [5]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")
target = labels["target"].values

Shape of data: (458913, 2)


In [6]:
# %%time
# train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
# gc.collect()

In [6]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['B_30_last', 'D_63_first', 'D_116_second_last', 'D_64_second_last', 'D_68_second_last', 'D_117_third_last', 'D_68_last', 'D_92_last', 'B_30_second_last', 'D_126_second_last', 'D_120_first', 'D_116_first', 'B_38_last', 'D_117_second_last', 'D_92_second_last', 'B_30_first', 'D_114_first', 'B_38_third_last', 'B_38_second_last', 'D_63_last', 'D_64_last', 'D_126_third_last', 'B_30_third_last', 'D_117_first', 'D_114_third_last', 'D_68_first', 'D_114_second_last', 'D_92_third_last', 'D_126_last', 'D_68_third_last', 'D_120_last', 'D_116_last', 'D_63_third_last', 'D_64_third_last', 'D_116_third_last', 'D_63_second_last', 'B_38_first', 'D_64_first', 'D_120_third_last', 'D_120_second_last', 'D_117_last', 'D_126_first', 'D_92_first', 'D_114_last']


In [7]:
# train_agg = pd.concat([train_agg, labels], axis=1)

In [8]:
train_agg.shape, target.shape

((458913, 5064), (458913,))

In [85]:
normal_predict_group_df = pd.read_csv(f"normal_predict_group.csv").drop(columns="target")
default_predict_group_df = pd.read_csv(f"default_predict_group.csv").drop(columns="target")

### Feature selection

In [21]:
%%time
lgbm_gbdt = TreeExperiment(
    exp_full_path="../../experiments/11.lgbm_dart_round_clip_7788",
    seed=7788, 
    model_path="gbdt_models"
)

CPU times: user 499 ms, sys: 276 ms, total: 775 ms
Wall time: 154 ms


In [22]:
fi = lgbm_gbdt.feature_imp_df

In [56]:
master = []
for i in range(5):
    master.extend(fi.nsmallest(1167, f"importance{i}")["feature"].tolist())

In [57]:
fi_dict = dict(Counter(master))

In [58]:
col_to_drop = [k for k, v in fi_dict.items() if v >= 5]

In [59]:
len(col_to_drop)

564

In [60]:
train_agg.shape

(458913, 5064)

In [61]:
%%time
train_agg = train_agg.drop(columns=col_to_drop)

CPU times: user 427 ms, sys: 3.36 s, total: 3.79 s
Wall time: 10 s


In [63]:
train_agg.shape

(458913, 4500)

In [64]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['B_30_last', 'D_63_first', 'D_64_second_last', 'D_68_second_last', 'D_117_third_last', 'D_68_last', 'D_92_last', 'B_30_second_last', 'D_126_second_last', 'D_120_first', 'B_38_last', 'D_117_second_last', 'D_92_second_last', 'D_114_first', 'B_38_third_last', 'D_63_last', 'B_38_second_last', 'D_64_last', 'D_126_third_last', 'B_30_third_last', 'D_117_first', 'D_114_third_last', 'D_68_first', 'D_114_second_last', 'D_126_last', 'D_68_third_last', 'D_120_last', 'D_63_third_last', 'D_64_third_last', 'D_63_second_last', 'B_38_first', 'D_64_first', 'D_120_third_last', 'D_120_second_last', 'D_117_last', 'D_126_first', 'D_92_first', 'D_114_last']


In [65]:
len(cat_features)

38

### REAL Stratify Split

In [86]:
%%time
normal_train_agg = train_agg.loc[labels["target"] == 0]
default_train_agg = train_agg.loc[labels["target"] == 1]

CPU times: user 1.35 s, sys: 4.13 s, total: 5.48 s
Wall time: 7.8 s


In [87]:
normal_train_agg = normal_train_agg.merge(normal_predict_group_df, on="customer_ID", how="left")
default_train_agg = default_train_agg.merge(default_predict_group_df, on="customer_ID", how="left")

In [88]:
# normal_train_agg = normal_train_agg.drop(columns=get_cols(normal_train_agg, "target"))
# default_train_agg = default_train_agg.drop(columns=get_cols(default_train_agg, "target"))

In [89]:
seed = 42

In [90]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [91]:
normal_indices = {}
for fold, (trn_ind, val_ind) in enumerate(kfold.split(normal_train_agg, normal_train_agg["group"])):
    normal_indices[fold] = (trn_ind, val_ind)

In [92]:
default_indices = {}
for fold, (trn_ind, val_ind) in enumerate(kfold.split(default_train_agg, default_train_agg["group"])):
    default_indices[fold] = (trn_ind, val_ind)

In [121]:
exp12_indices = {"normal": normal_indices, "default": default_indices}

In [130]:
kfold_indices = {}
for fold in range(5):
    a = normal_train_agg.loc[normal_indices[fold][1], "customer_ID"].tolist()
    b = default_train_agg.loc[default_indices[fold][1], "customer_ID"].tolist()
    kfold_indices[fold] = a + b

In [133]:
joblib.dump(kfold_indices, "./5fold_val_cid.pkl")

['./5fold_val_cid.pkl']

### Hyperparams Setting

In [114]:
params = {
    'objective': 'binary',
    'first_metric_only': True,
    'metric': "binary_logloss",
    'boosting': 'dart',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 96,
    'learning_rate': 0.01,
    'feature_fraction': 0.195,
    'bagging_freq': 10,
    'bagging_fraction': 0.5,
    'n_jobs': -1,
    'lambda_l2': 5,
    'min_data_in_leaf': 120,
    'scale_pos_weight': 1.28,
    'max_bins': 255,
    'feature_fraction_bynode': 0.9,
    'drop_rate': 0.09,
    'skip_drop': 0.55
}

In [115]:
n_est = [8000, 8000, 8000, 8000, 8000]

In [116]:
# train_ = train_.drop(columns=["customer_ID", "group"], errors="ignore")

In [117]:
not_in_x_columns = ['customer_ID', 'target', 'group']

In [118]:
for fold in range(5):
    
    n_estimator = n_est[fold]
    print(f"Number of estimator: {n_estimator}")
    xn = normal_train_agg.loc[normal_indices[fold][0]].drop(columns=not_in_x_columns)
    xd = default_train_agg.loc[default_indices[fold][0]].drop(columns=not_in_x_columns)
    x_train = pd.concat([xn, xd], ignore_index=True)
    print(f"X_train: {x_train.shape[0]}")
    xn = normal_train_agg.loc[normal_indices[fold][1]].drop(columns=not_in_x_columns)
    xd = default_train_agg.loc[default_indices[fold][1]].drop(columns=not_in_x_columns)
    x_val = pd.concat([xn, xd], ignore_index=True)
    print(f"X_val: {x_val.shape[0]}")
    
    yn = normal_train_agg.loc[normal_indices[fold][0], "target"]
    yd = default_train_agg.loc[default_indices[fold][0], "target"]
    y_train = pd.concat([yn, yd], ignore_index=True)
    print(f"Y_train: {y_train.shape[0]}")
    
    yn = normal_train_agg.loc[normal_indices[fold][1], "target"]
    yd = default_train_agg.loc[default_indices[fold][1], "target"]
    y_val = pd.concat([yn, yd], ignore_index=True)
    print(f"Y_validation: {y_val.shape[0]}")
    
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./dart_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

Number of estimator: 8000
X_train: 367130
X_val: 91783
Y_train: 367130
Y_validation: 91783
Start Training fold 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 566106
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 4487
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.288424	training's amex: 0.780779	valid_1's binary_logloss: 0.294466	valid_1's amex: 0.764587
[1000]	training's binary_logloss: 0.230785	training's amex: 0.798886	valid_1's binary_logloss: 0.241922	valid_1's amex: 0.776767
[1500]	training's binary_logloss: 0.215633	training's amex: 0.813358	valid_1's binary_logloss: 0.231175	valid_1's amex: 0.782011
[2000]	training's binary_logloss: 0.20512

In [None]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    if fold < 3:
        continue
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./dart_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 3 with 3983 features...
--------------------------------------------------
Start Training fold 3
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 553242
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 3983
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258935 -> initscore=-1.051512
[LightGBM] [Info] Start training from score -1.051512
[500]	training's binary_logloss: 0.265894	training's amex: 0.784579	valid_1's binary_logloss: 0.270852	valid_1's amex: 0.776311
[1000]	training's binary_logloss: 0.220583	training's amex: 0.804852	valid_1's binary_logloss: 0.231342	valid_1's amex: 0.786518
[1500]	training's binary_logloss: 0.205965	training's amex: 0.821195	valid_1's binary_logloss: 0.22299	valid_1's amex: 