In [1]:
import datetime
import gc
import joblib
import json
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import random
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [2]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
# Notebook-wide matplotlib theme: blue axes background, '#ffd700' as the first
# colour of the line cycle (the rest of the current cycle is kept), white text.
_cycle_tail = plt.rcParams['axes.prop_cycle'].by_key()['color'][1:]
plt.rcParams.update({
    'axes.facecolor': '#0057b8',  # blue
    'axes.prop_cycle': cycler(color=['#ffd700'] + _cycle_tail),
    'text.color': 'w',
})

In [3]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [4]:
%load_ext autoreload
%autoreload

### Read Data

In [5]:
%%time
# Load the five `validation_fold{i}.pkl` frames, keyed by fold index 0..4.
df_dict = {
    fold_idx: read_file(f"../{PROCESSED_DATA_PATH}/v10/validation_fold{fold_idx}.pkl")
    for fold_idx in tqdm(range(5))
}

 20%|███████████████████████████▏                                                                                                            | 1/5 [00:00<00:03,  1.03it/s]

Shape of data: (82603, 4083)


 40%|██████████████████████████████████████████████████████▍                                                                                 | 2/5 [00:01<00:02,  1.01it/s]

Shape of data: (82603, 4083)


 60%|█████████████████████████████████████████████████████████████████████████████████▌                                                      | 3/5 [00:03<00:02,  1.12s/it]

Shape of data: (82603, 4083)


 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4/5 [00:04<00:01,  1.20s/it]

Shape of data: (82602, 4083)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.21s/it]

Shape of data: (82602, 4083)
CPU times: user 2.02 s, sys: 3.33 s, total: 5.35 s
Wall time: 6.08 s





In [6]:
# Read the raw training labels and keep the "target" column as a plain array.
labels_path = f"../{RAW_DATA_PATH}/train_labels.csv"
labels = read_file(labels_path)
target = labels["target"].values

Shape of data: (458913, 2)


In [7]:
# Per-fold copy of the "target" column, taken before it is dropped from the
# feature frames.
y_dict = {fold_idx: df_dict[fold_idx]["target"].copy() for fold_idx in range(5)}

In [8]:
%%time
# Remove the non-feature columns and the target from every fold frame in place.
# errors="ignore" tolerates columns that a frame does not contain.
columns_to_drop = NON_FEATURE_COLUMNS + ["target"]
for fold_idx in tqdm(range(5)):
    df_dict[fold_idx].drop(columns=columns_to_drop, inplace=True, errors="ignore")
gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.62it/s]

CPU times: user 442 ms, sys: 1.99 s, total: 2.43 s
Wall time: 3.13 s





0

In [9]:
# Model from an earlier experiment; its feature list is used below to decide
# which columns to keep.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
prev_model_path = f"../{EXP_PATH}/1.lgbm_dart_923/models/model_fold2_seed923.pkl"
prev_model = joblib.load(prev_model_path)

In [10]:
# Number of feature names reported by the previously trained model.
len(prev_model.feature_name())

2526

In [11]:
%%time
# Restrict every fold frame to the columns the previous model was trained on.
# The loop variable is deliberately still called `df`, so the name remains
# bound after the loop exactly as before.
kept_features = set(prev_model.feature_name())
for df in tqdm(df_dict.values()):
    extra_columns = [col for col in df.columns if col not in kept_features]
    df.drop(columns=extra_columns, errors="ignore", inplace=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.94it/s]

CPU times: user 434 ms, sys: 1.38 s, total: 1.81 s
Wall time: 1.7 s





In [12]:
# read_file(f"../{EXP_PATH}/1.lgbm_dart_923/feature_importance.csv")

In [13]:
# Columns that get_cols matches for "D_87" and "S_9"; they are dropped from
# every fold in the next step. The lookup uses an explicit fold frame instead
# of the `df` name left over from an earlier for-loop, so this cell no longer
# depends on hidden loop state (all folds share the same columns).
drift_columns = get_cols(df_dict[0], ["D_87", "S_9"])

In [14]:
%%time
# Remove the drift columns from every fold frame in place; names that a frame
# does not contain are skipped because of errors="ignore".
for fold_idx in tqdm(df_dict):
    df_dict[fold_idx].drop(columns=drift_columns, inplace=True, errors="ignore")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.83it/s]

CPU times: user 215 ms, sys: 851 ms, total: 1.07 s
Wall time: 1.31 s





In [15]:
# Columns of fold 0 that get_cols matches against CATEGORY_COLUMNS; this list
# is later passed to LightGBM as `categorical_feature`.
cat_features = get_cols(df_dict[0], CATEGORY_COLUMNS)
print(cat_features)

['B_38_last', 'D_63_last', 'D_64_last', 'D_68_last', 'D_92_last', 'D_114_last', 'D_116_last', 'D_117_last', 'D_120_last', 'D_126_last', 'B_30_second_last', 'B_38_second_last', 'D_64_second_last', 'D_68_second_last', 'D_92_second_last', 'D_114_second_last', 'D_116_second_last', 'D_117_second_last', 'D_126_second_last', 'B_30_first', 'B_38_first', 'D_64_first', 'D_68_first', 'D_114_first', 'D_116_first', 'D_117_first', 'D_120_first']


In [16]:
# m = plot_missing_proportion_barchart(df_dict[0])

In [17]:
# high_missing_columns = m.loc[m["missing_proportion"] > 99.9]["column"].tolist()
# len(high_missing_columns)

In [18]:
# %%time
# for df in tqdm(df_dict.values()):
#     df = df.drop(columns=high_missing_columns, errors="ignore")

In [19]:
%%time
# Same non-feature/target drop as the one applied right after loading; with
# errors="ignore" it is harmless when those columns are already gone.
non_feature_and_target = NON_FEATURE_COLUMNS + ["target"]
for fold_idx in tqdm(df_dict):
    df_dict[fold_idx].drop(columns=non_feature_and_target, inplace=True, errors="ignore")
gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.66it/s]


CPU times: user 284 ms, sys: 621 ms, total: 905 ms
Wall time: 930 ms


159

In [20]:
# Recompute the categorical column list from fold 0 after the column drops
# above; this is the value the training loop passes to lgb.Dataset.
cat_features = get_cols(df_dict[0], CATEGORY_COLUMNS)

In [21]:
# Show the categorical feature names that will be handed to LightGBM.
print(cat_features)

['B_38_last', 'D_63_last', 'D_64_last', 'D_68_last', 'D_92_last', 'D_114_last', 'D_116_last', 'D_117_last', 'D_120_last', 'D_126_last', 'B_30_second_last', 'B_38_second_last', 'D_64_second_last', 'D_68_second_last', 'D_92_second_last', 'D_114_second_last', 'D_116_second_last', 'D_117_second_last', 'D_126_second_last', 'B_30_first', 'B_38_first', 'D_64_first', 'D_68_first', 'D_114_first', 'D_116_first', 'D_117_first', 'D_120_first']


In [22]:
# train_agg.shape, target.shape

### Train LGBM using pre-set hyperparams

In [31]:
# Random seed: fed to LightGBM through params['seed'] and embedded in the
# saved model file names.
seed = 923

In [32]:
# Pre-set LightGBM hyperparameters for the dart booster (key order unchanged).
params = dict(
    objective='binary',
    metric="binary_logloss",
    boosting='dart',
    seed=seed,
    num_leaves=89,
    learning_rate=0.0118,
    feature_fraction=0.195,
    bagging_freq=9,
    bagging_fraction=0.57,
    n_jobs=-1,
    lambda_l2=16,
    min_data_in_leaf=80,
    scale_pos_weight=1.4,
)

In [33]:
# Re-derive the label array from the raw labels frame (same expression as the
# assignment made right after `labels` was loaded).
target = labels["target"].values

In [34]:
# m = plot_feature_importance(model.feature_name(), model.feature_importance(), limit=70)

In [35]:
# noob_features1 = m.loc[m["feature_importance"] < 5]["feature"].tolist()

In [36]:
# %%time
# for df in tqdm(df_dict.values()):
#     df.drop(columns=noob_features1, errors="ignore", inplace=True)
# gc.collect()

In [37]:
# Sanity check: (rows, feature columns) of fold 0 after all column drops.
print(df_dict[0].shape)

(82603, 2485)


In [38]:
# 5-fold cross-validation: each fold in turn is held out for validation while
# the remaining folds are concatenated into the training set. One model per
# fold is written to ./models.
os.makedirs("./models", exist_ok=True)  # joblib.dump does not create directories

for fold in range(5):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {df_dict[0].shape[1]} features...')
    print('-'*50)
    # Build X and y from one shared, ordered list of fold ids so that their
    # rows are guaranteed to line up after concatenation.
    train_folds = [idx for idx in df_dict if idx != fold]
    x_train = pd.concat([df_dict[idx] for idx in train_folds], ignore_index=True)
    x_val = df_dict[fold]
    print("X shape: ", x_train.shape, x_val.shape)
    y_train = pd.concat([y_dict[idx] for idx in train_folds], ignore_index=True)
    y_val = y_dict[fold]
    print("Y shape: ", y_train.shape, y_val.shape)
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    # No early stopping is requested: LightGBM does not support it with
    # boosting='dart' (the request is ignored with a warning), so all
    # `num_boost_round` iterations run either way. Periodic logging goes
    # through the `log_evaluation` callback because the `verbose_eval` and
    # `early_stopping_rounds` keyword arguments were removed from lgb.train
    # in LightGBM 4.0.
    model = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=9000,
        valid_sets=[lgb_train, lgb_valid],
        feval=lgb_amex_metric,
        callbacks=[log_evaluation(period=1000)],
    )
    # Save the final model of this fold (dart has no separate "best" iteration).
    joblib.dump(model, f'./models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation. raw_score=True yields untransformed margins;
    # presumably fine because amex_metric is rank-based — TODO confirm.
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)
    print(f'Our fold {fold} CV score is {val_score}')
    # `del x_val` / `del y_val` only unbind the names; the fold data itself
    # stays alive inside df_dict / y_dict for the next iteration.
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 0 with 2485 features...
--------------------------------------------------
X shape:  (330410, 2485) (82603, 2485)
Y shape:  (330410,) (82603,)
[LightGBM] [Info] Number of positive: 85542, number of negative: 244868
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 394702
[LightGBM] [Info] Number of data points in the train set: 330410, number of used features: 2466
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258897 -> initscore=-1.051712
[LightGBM] [Info] Start training from score -1.051712
[1000]	training's binary_logloss: 0.241675	training's amex: 0.794774	valid_1's binary_logloss: 0.249609	valid_1's amex: 0.776322
[2000]	training's binary_logloss: 0.208224	training's amex: 0.822542	valid_1's binary_logloss: 0.226201	valid_1's amex: 0.78866
[3000]	training's binary_logloss: 0.193622	training's amex: 0.84316	valid