In [None]:
import datetime
import gc
import joblib
import json
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import Counter
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [None]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
plt.rcParams['text.color'] = 'w'

In [None]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric,
    TreeExperiment
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [None]:
%load_ext autoreload
%autoreload

### Read Data

In [35]:
%%time
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v7/train_agg.parquet")
train_agg_diff = read_file(f"../{PROCESSED_DATA_PATH}/v7/train_agg_diff_selected.parquet")

Shape of data: (458913, 5064)
Shape of data: (458913, 438)
CPU times: user 1min, sys: 2min 56s, total: 3min 57s
Wall time: 2min 48s


In [36]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")
target = labels["target"].values

Shape of data: (458913, 2)


In [37]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_117_third_last', 'D_116_third_last', 'D_63_third_last', 'D_117_first', 'D_116_first', 'D_92_first', 'D_114_third_last', 'D_116_second_last', 'B_30_last', 'D_120_third_last', 'D_126_first', 'B_38_first', 'D_64_second_last', 'D_120_second_last', 'D_63_first', 'D_114_second_last', 'D_116_last', 'D_64_first', 'B_38_second_last', 'D_92_second_last', 'D_63_second_last', 'D_63_last', 'B_38_last', 'B_30_second_last', 'D_64_last', 'D_120_first', 'D_92_last', 'D_114_last', 'D_92_third_last', 'D_126_third_last', 'D_68_first', 'D_117_second_last', 'D_64_third_last', 'D_120_last', 'D_126_second_last', 'D_68_second_last', 'B_38_third_last', 'B_30_first', 'D_117_last', 'B_30_third_last', 'D_114_first', 'D_68_third_last', 'D_68_last', 'D_126_last']


In [38]:
train_agg_diff = train_agg_diff.rename(columns={k: k + "_diff" for k in train_agg_diff.columns})

In [39]:
train_agg.shape, train_agg_diff.shape, target.shape

((458913, 5064), (458913, 438), (458913,))

In [40]:
train_agg = pd.concat([train_agg, train_agg_diff], axis=1)

In [41]:
train_agg.shape

(458913, 5502)

In [42]:
# train_agg = train_agg.rename(columns={k: k.replace("_ori", "") for k in train_agg.columns})

In [43]:
# train_agg_diff.iloc[:, 1:] = train_agg_diff.iloc[:, 1:].rename(columns={k: k + "_diff" for k in train_agg_diff.columns})

In [44]:
model = joblib.load("model_fold0_seed7788_diff_only.pkl")

In [45]:
imp_df = pd.DataFrame(dict(feature=model.feature_name(), imp=model.feature_importance()))
imp_df = imp_df.sort_values(by="imp", ascending=False)

In [76]:
dropped_features = imp_df.loc[imp_df["imp"] < 20].feature.tolist()

In [77]:
len(dropped_features)

1363

In [79]:
train_agg = train_agg.drop(columns=dropped_features, errors="ignore")

In [80]:
train_agg.shape, target.shape

((458913, 4137), (458913,))

In [81]:
seed = 7788

### Feature selection

In [13]:
# %%time
# lgbm_gbdt = TreeExperiment(
#     exp_full_path="../../experiments/11.lgbm_dart_round_clip_7788",
#     seed=7788, 
#     model_path="gbdt_models"
# )

In [14]:
# fi = lgbm_gbdt.feature_imp_df

In [None]:
master = []
for i in range(5):
    master.extend(fi.nsmallest(1700, f"importance{i}")["feature"].tolist())

In [None]:
fi_dict = dict(Counter(master))

In [None]:
col_to_drop = [k for k, v in fi_dict.items() if v >= 5]

In [None]:
len(col_to_drop)

In [None]:
train_agg.shape

In [None]:
%%time
train_agg = train_agg.drop(columns=col_to_drop)

In [15]:
train_agg.shape

(458913, 3461)

In [16]:
len(cat_features)

0

### Train LGBM using pre-set hyperparams

In [83]:
params = {
    'objective': 'binary',
    'metric': "binary_logloss",
    'boosting': 'dart',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 90,
    'learning_rate': 0.011,
    'feature_fraction': 0.19,
    'bagging_freq': 8,
    'bagging_fraction': 0.55,
    'n_jobs': -1,
    'lambda_l2': 15,
    'min_data_in_leaf': 75,
    'scale_pos_weight': 1.4,
    'max_bins': 255,
    'feature_fraction_bynode': 0.9,
    'drop_rate': 0.1,
    'skip_drop': 0.55
}

In [84]:
# params = {
#     'objective': 'binary',
#     'first_metric_only': True,
#     'metric': "binary_logloss",
#     'boosting': 'dart',
#     'device': "cpu",
#     'seed': seed,
#     'num_leaves': 100,
#     'learning_rate': 0.01,
#     'feature_fraction': 0.19,
#     'bagging_freq': 10,
#     'bagging_fraction': 0.5,
#     'n_jobs': -1,
#     'lambda_l2': 5,
#     'min_data_in_leaf': 125,
#     'scale_pos_weight': 1.3,
#     'max_bins': 255,
#     'feature_fraction_bynode': 0.95,
#     'drop_rate': 0.11,
#     'skip_drop': 0.6
# }

In [85]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [86]:
n_est = [7500] * 5
# n_est = [9500] * 5

In [87]:
gc.collect()

1572

In [88]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_117_third_last', 'D_116_third_last', 'D_63_third_last', 'D_117_first', 'D_116_first', 'D_92_first', 'D_114_third_last', 'D_116_second_last', 'B_30_last', 'D_120_third_last', 'D_126_first', 'B_38_first', 'D_64_second_last', 'D_120_second_last', 'D_63_first', 'D_114_second_last', 'D_116_last', 'D_64_first', 'B_38_second_last', 'D_92_second_last', 'D_63_second_last', 'D_63_last', 'B_38_last', 'B_30_second_last', 'D_64_last', 'D_120_first', 'D_92_last', 'D_114_last', 'D_92_third_last', 'D_126_third_last', 'D_68_first', 'D_117_second_last', 'D_64_third_last', 'D_120_last', 'D_126_second_last', 'D_68_second_last', 'B_38_third_last', 'B_30_first', 'D_117_last', 'B_30_third_last', 'D_114_first', 'D_68_third_last', 'D_68_last', 'D_126_last']


In [89]:
len(cat_features)

44

In [90]:
train_agg.shape

(458913, 4137)

In [91]:
%%time
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
gc.collect()

CPU times: user 4.05 s, sys: 18 s, total: 22.1 s
Wall time: 31.8 s


72

In [95]:
train_agg.to_parquet(f"../{EXP_PATH}/13.lgbm_dart_round_clip_diff_7788/final_train.parquet")

In [92]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./dart_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 0 with 4137 features...
--------------------------------------------------
Start Training fold 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 617831
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 4129
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.31147	training's amex: 0.774339	valid_1's binary_logloss: 0.316384	valid_1's amex: 0.760052
[1000]	training's binary_logloss: 0.230813	training's amex: 0.796697	valid_1's binary_logloss: 0.241597	valid_1's amex: 0.776241
[1500]	training's binary_logloss: 0.21517	training's amex: 0.811194	valid_1's binary_logloss: 0.230816	valid_1's amex: 0.782153
[2000]	training's binary_logloss: 0.205994	training's am

### Process Test

In [None]:
%%time
test_agg = read_file(f"../{PROCESSED_DATA_PATH}/v7/test_agg.parquet")
test_agg_diff = read_file(f"../{PROCESSED_DATA_PATH}/v7/test_agg_diff_selected.parquet")

In [None]:
test_agg_diff = test_agg_diff.rename(columns={k: k + "_diff" for k in test_agg_diff.columns})

In [None]:
test_agg.shape, test_agg_diff.shape

In [None]:
test_agg = pd.concat([test_agg, test_agg_diff], axis=1)

In [None]:
del test_agg_diff

In [None]:
test_agg.shape

In [None]:
model = joblib.load("model_fold0_seed7788_diff_only.pkl")

In [None]:
imp_df = pd.DataFrame(dict(feature=model.feature_name(), imp=model.feature_importance()))
imp_df = imp_df.sort_values(by="imp", ascending=False)

In [None]:
dropped_features = imp_df.loc[imp_df["imp"] < 20].feature.tolist()

In [None]:
len(dropped_features)

In [None]:
test_agg = test_agg.drop(columns=dropped_features, errors="ignore")

In [None]:
test_agg.shape

In [None]:
seed = 7788

In [None]:
test_agg.to_parquet(f"../{EXP_PATH}/13.lgbm_dart_round_clip_diff_7788/final_test.parquet")

### Feature selection

In [None]:
# %%time
# lgbm_gbdt = TreeExperiment(
#     exp_full_path="../../experiments/11.lgbm_dart_round_clip_7788",
#     seed=7788, 
#     model_path="gbdt_models"
# )

In [None]:
# fi = lgbm_gbdt.feature_imp_df

In [None]:
master = []
for i in range(5):
    master.extend(fi.nsmallest(1700, f"importance{i}")["feature"].tolist())

In [None]:
fi_dict = dict(Counter(master))

In [None]:
col_to_drop = [k for k, v in fi_dict.items() if v >= 5]

In [None]:
len(col_to_drop)

In [None]:
train_agg.shape

In [None]:
%%time
train_agg = train_agg.drop(columns=col_to_drop)

In [15]:
train_agg.shape

(458913, 3461)

In [16]:
len(cat_features)

0

### Train LGBM using pre-set hyperparams

In [83]:
params = {
    'objective': 'binary',
    'metric': "binary_logloss",
    'boosting': 'dart',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 90,
    'learning_rate': 0.011,
    'feature_fraction': 0.19,
    'bagging_freq': 8,
    'bagging_fraction': 0.55,
    'n_jobs': -1,
    'lambda_l2': 15,
    'min_data_in_leaf': 75,
    'scale_pos_weight': 1.4,
    'max_bins': 255,
    'feature_fraction_bynode': 0.9,
    'drop_rate': 0.1,
    'skip_drop': 0.55
}

In [84]:
# params = {
#     'objective': 'binary',
#     'first_metric_only': True,
#     'metric': "binary_logloss",
#     'boosting': 'dart',
#     'device': "cpu",
#     'seed': seed,
#     'num_leaves': 100,
#     'learning_rate': 0.01,
#     'feature_fraction': 0.19,
#     'bagging_freq': 10,
#     'bagging_fraction': 0.5,
#     'n_jobs': -1,
#     'lambda_l2': 5,
#     'min_data_in_leaf': 125,
#     'scale_pos_weight': 1.3,
#     'max_bins': 255,
#     'feature_fraction_bynode': 0.95,
#     'drop_rate': 0.11,
#     'skip_drop': 0.6
# }

In [85]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [86]:
n_est = [7500] * 5
# n_est = [9500] * 5

In [87]:
gc.collect()

1572

In [88]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_117_third_last', 'D_116_third_last', 'D_63_third_last', 'D_117_first', 'D_116_first', 'D_92_first', 'D_114_third_last', 'D_116_second_last', 'B_30_last', 'D_120_third_last', 'D_126_first', 'B_38_first', 'D_64_second_last', 'D_120_second_last', 'D_63_first', 'D_114_second_last', 'D_116_last', 'D_64_first', 'B_38_second_last', 'D_92_second_last', 'D_63_second_last', 'D_63_last', 'B_38_last', 'B_30_second_last', 'D_64_last', 'D_120_first', 'D_92_last', 'D_114_last', 'D_92_third_last', 'D_126_third_last', 'D_68_first', 'D_117_second_last', 'D_64_third_last', 'D_120_last', 'D_126_second_last', 'D_68_second_last', 'B_38_third_last', 'B_30_first', 'D_117_last', 'B_30_third_last', 'D_114_first', 'D_68_third_last', 'D_68_last', 'D_126_last']


In [89]:
len(cat_features)

44

In [90]:
train_agg.shape

(458913, 4137)

In [91]:
%%time
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
gc.collect()

CPU times: user 4.05 s, sys: 18 s, total: 22.1 s
Wall time: 31.8 s


72

In [None]:
del 

In [95]:
train_agg.to_parquet(f"../{EXP_PATH}/13.lgbm_dart_round_clip_diff_7788/final_train.parquet")