In [None]:
import datetime
import gc
import joblib
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
import optuna
import os
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import Counter
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [147]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric,
    TreeExperiment
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [148]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read Data

In [149]:
%%time
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v6/train_agg.parquet")

Shape of data: (458913, 5064)
CPU times: user 17.1 s, sys: 28 s, total: 45.1 s
Wall time: 41.2 s


In [150]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")
target = labels["target"].values

Shape of data: (458913, 2)


In [151]:
# %%time
# train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
# gc.collect()

In [152]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_92_first', 'D_68_first', 'B_30_last', 'D_120_second_last', 'D_64_first', 'B_38_first', 'B_38_last', 'D_114_last', 'D_63_first', 'B_38_second_last', 'D_126_last', 'D_117_second_last', 'D_126_second_last', 'D_64_second_last', 'D_92_third_last', 'D_64_last', 'D_114_third_last', 'B_30_second_last', 'D_116_first', 'D_120_last', 'D_114_first', 'D_117_last', 'B_30_third_last', 'D_126_third_last', 'D_63_last', 'D_68_second_last', 'D_116_last', 'D_63_third_last', 'D_68_last', 'D_92_last', 'D_120_first', 'D_114_second_last', 'D_68_third_last', 'B_30_first', 'D_117_third_last', 'D_116_third_last', 'D_117_first', 'D_63_second_last', 'D_92_second_last', 'D_120_third_last', 'D_116_second_last', 'B_38_third_last', 'D_64_third_last', 'D_126_first']


In [153]:
# train_agg = pd.concat([train_agg, labels], axis=1)

In [154]:
train_agg.shape, target.shape

((458913, 5064), (458913,))

### Feature selection

In [155]:
%%time
lgbm_dart = TreeExperiment(
    exp_full_path="../../experiments/12.lgbm_dart_manual_split_42",
    seed=42, 
    model_path="dart_models"
)

CPU times: user 3.23 s, sys: 1.98 s, total: 5.21 s
Wall time: 1.39 s


In [156]:
fi = lgbm_dart.feature_imp_df

In [157]:
to_drop = fi.loc[fi.iloc[:, 1:6].max(axis=1) < 22]["feature"].tolist()

In [158]:
fi = fi.loc[~fi["feature"].isin(to_drop)].reset_index(drop=True)

In [159]:
# fi.to_csv("./previous_feature_importance.csv")

In [160]:
len(to_drop)

1056

In [161]:
train_agg.shape

(458913, 5064)

In [162]:
%%time
train_agg = train_agg.drop(columns=to_drop, errors="ignore")

CPU times: user 351 ms, sys: 2.01 s, total: 2.36 s
Wall time: 3.47 s


In [163]:
train_agg.shape

(458913, 4008)

In [165]:
prev_sprint_noob_cols = fi.loc[fi["agg_type"] == "previous_sprint"].nsmallest(60, "average_importance")["feature"].tolist()
third_last_noob_cols = fi.loc[fi["agg_type"] == "third_last"].nsmallest(90, "average_importance")["feature"].tolist()

In [166]:
%%time
train_agg = train_agg.drop(columns=prev_sprint_noob_cols + third_last_noob_cols, errors="ignore")

CPU times: user 358 ms, sys: 2.12 s, total: 2.48 s
Wall time: 3.73 s


In [167]:
train_agg.shape

(458913, 3858)

In [168]:
# set(train_agg.columns) - set(fi["feature"])

#### Find Categorical Variables

In [169]:
cat_columns = get_cols(train_agg, CATEGORY_COLUMNS)
cat_features = list(set(cat_columns).intersection(train_agg.columns))
print(cat_features)

['D_68_first', 'B_30_last', 'D_120_second_last', 'D_64_first', 'B_38_first', 'B_38_last', 'D_114_last', 'B_38_second_last', 'D_126_last', 'D_117_second_last', 'D_126_second_last', 'D_64_second_last', 'D_92_third_last', 'D_64_last', 'B_30_second_last', 'D_116_first', 'D_120_last', 'D_114_first', 'D_117_last', 'D_63_last', 'D_68_second_last', 'D_116_last', 'D_68_last', 'D_120_first', 'D_114_second_last', 'B_30_first', 'D_117_third_last', 'D_116_third_last', 'D_117_first', 'D_63_second_last', 'D_116_second_last', 'D_126_first']


In [180]:
# pd.DataFrame(dict(cat_min=train_agg.loc[:, cat_features].min(), 
#                   cat_null=train_agg.loc[:, cat_features].isnull().sum()))

In [181]:
for cat_feature in tqdm(cat_features):
    if train_agg[cat_feature].isnull().sum() > 0:
        train_agg[cat_feature] = train_agg[cat_feature].fillna(-1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 46.02it/s]


In [188]:
train_agg[cat_features] = (train_agg[cat_features] + 1).astype(int)

### REAL Stratify Split

In [189]:
%%time
normal_train_agg = train_agg.loc[labels["target"] == 0]
default_train_agg = train_agg.loc[labels["target"] == 1]

CPU times: user 1.14 s, sys: 3.18 s, total: 4.32 s
Wall time: 6.1 s


In [190]:
normal_predict_group_df = pd.read_csv(f"normal_predict_group.csv").drop(columns="target")
default_predict_group_df = pd.read_csv(f"default_predict_group.csv").drop(columns="target")

In [191]:
normal_train_agg = normal_train_agg.merge(normal_predict_group_df, on="customer_ID", how="left")
default_train_agg = default_train_agg.merge(default_predict_group_df, on="customer_ID", how="left")

In [192]:
# normal_train_agg = normal_train_agg.drop(columns=get_cols(normal_train_agg, "target"))
# default_train_agg = default_train_agg.drop(columns=get_cols(default_train_agg, "target"))

In [193]:
seed = 42

In [194]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [195]:
normal_indices = {}
for fold, (trn_ind, val_ind) in enumerate(kfold.split(normal_train_agg, normal_train_agg["group"])):
    normal_indices[fold] = (trn_ind, val_ind)

In [196]:
default_indices = {}
for fold, (trn_ind, val_ind) in enumerate(kfold.split(default_train_agg, default_train_agg["group"])):
    default_indices[fold] = (trn_ind, val_ind)

In [197]:
# kfold_indices = {}
# for fold in range(5):
#     a = normal_train_agg.loc[normal_indices[fold][1], "customer_ID"].tolist()
#     b = default_train_agg.loc[default_indices[fold][1], "customer_ID"].tolist()
#     kfold_indices[fold] = a + b
# joblib.dump(kfold_indices, "./5fold_val_cid.pkl")

### Hyperparams Setting

In [198]:
params = {
    'objective': 'binary',
    'first_metric_only': True,
    'metric': "binary_logloss",
    'boosting': 'dart',
    'device': "cpu",
    'seed': seed,
    'num_leaves': 95,
    'learning_rate': 0.01,
    'feature_fraction': 0.19,
    'bagging_freq': 10,
    'bagging_fraction': 0.5,
    'n_jobs': -1,
    'lambda_l2': 10,
    'min_data_in_leaf': 90,
    'scale_pos_weight': 1.38,
    'max_bins': 255,
    'feature_fraction_bynode': 0.9,
    'drop_rate': 0.095,
    'skip_drop': 0.52
}

In [199]:
n_est = [10500] * 5

In [200]:
# train_ = train_.drop(columns=["customer_ID", "group"], errors="ignore")

In [201]:
not_in_x_columns = ['customer_ID', 'target', 'group']

In [202]:
for fold in range(5):
    if fold < 4:
        continue
    n_estimator = n_est[fold]
    print(f"Number of estimator: {n_estimator}")
    xn = normal_train_agg.loc[normal_indices[fold][0]].drop(columns=not_in_x_columns)
    xd = default_train_agg.loc[default_indices[fold][0]].drop(columns=not_in_x_columns)
    x_train = pd.concat([xn, xd], ignore_index=True)
    print(f"X_train: {x_train.shape[0]}")
    xn = normal_train_agg.loc[normal_indices[fold][1]].drop(columns=not_in_x_columns)
    xd = default_train_agg.loc[default_indices[fold][1]].drop(columns=not_in_x_columns)
    x_val = pd.concat([xn, xd], ignore_index=True)
    print(f"X_val: {x_val.shape[0]}")
    
    yn = normal_train_agg.loc[normal_indices[fold][0], "target"]
    yd = default_train_agg.loc[default_indices[fold][0], "target"]
    y_train = pd.concat([yn, yd], ignore_index=True)
    print(f"Y_train: {y_train.shape[0]}")
    
    yn = normal_train_agg.loc[normal_indices[fold][1], "target"]
    yd = default_train_agg.loc[default_indices[fold][1], "target"]
    y_val = pd.concat([yn, yd], ignore_index=True)
    print(f"Y_validation: {y_val.shape[0]}")
    
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./dart_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

Number of estimator: 10500
X_train: 367130
X_val: 91783
Y_train: 367130
Y_validation: 91783
Start Training fold 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 529303
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 3831
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.291038	training's amex: 0.777948	valid_1's binary_logloss: 0.29667	valid_1's amex: 0.763353
[1000]	training's binary_logloss: 0.244996	training's amex: 0.792334	valid_1's binary_logloss: 0.254078	valid_1's amex: 0.773342
[1500]	training's binary_logloss: 0.223039	training's amex: 0.804124	valid_1's binary_logloss: 0.236139	valid_1's amex: 0.779351
[2000]	training's binary_logloss: 0.21091

KeyboardInterrupt: 

In [None]:
xx

In [118]:
for fold in range(5):
    
    n_estimator = n_est[fold]
    print(f"Number of estimator: {n_estimator}")
    xn = normal_train_agg.loc[normal_indices[fold][0]].drop(columns=not_in_x_columns)
    xd = default_train_agg.loc[default_indices[fold][0]].drop(columns=not_in_x_columns)
    x_train = pd.concat([xn, xd], ignore_index=True)
    print(f"X_train: {x_train.shape[0]}")
    xn = normal_train_agg.loc[normal_indices[fold][1]].drop(columns=not_in_x_columns)
    xd = default_train_agg.loc[default_indices[fold][1]].drop(columns=not_in_x_columns)
    x_val = pd.concat([xn, xd], ignore_index=True)
    print(f"X_val: {x_val.shape[0]}")
    
    yn = normal_train_agg.loc[normal_indices[fold][0], "target"]
    yd = default_train_agg.loc[default_indices[fold][0], "target"]
    y_train = pd.concat([yn, yd], ignore_index=True)
    print(f"Y_train: {y_train.shape[0]}")
    
    yn = normal_train_agg.loc[normal_indices[fold][1], "target"]
    yd = default_train_agg.loc[default_indices[fold][1], "target"]
    y_val = pd.concat([yn, yd], ignore_index=True)
    print(f"Y_validation: {y_val.shape[0]}")
    
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./dart_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

Number of estimator: 8000
X_train: 367130
X_val: 91783
Y_train: 367130
Y_validation: 91783
Start Training fold 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 566106
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 4487
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.288424	training's amex: 0.780779	valid_1's binary_logloss: 0.294466	valid_1's amex: 0.764587
[1000]	training's binary_logloss: 0.230785	training's amex: 0.798886	valid_1's binary_logloss: 0.241922	valid_1's amex: 0.776767
[1500]	training's binary_logloss: 0.215633	training's amex: 0.813358	valid_1's binary_logloss: 0.231175	valid_1's amex: 0.782011
[2000]	training's binary_logloss: 0.20512

In [None]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_agg, target)):
    if fold < 3:
        continue
    n_estimator = n_est[fold]
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {train_agg.shape[1]} features...')
    print('-'*50)
    x_train, x_val = train_agg.iloc[trn_ind], train_agg.iloc[val_ind]
    y_train, y_val = target[trn_ind], target[val_ind]
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)
    print(f"Start Training fold {fold}")
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = n_estimator,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 600,
        verbose_eval = 500,
        feval = lgb_amex_metric
    )
    # Save best model
    joblib.dump(model, f'./dart_models/model_fold{fold}_seed{seed}.pkl')
    # Predict validation
    y_val_pred = model.predict(x_val, raw_score=True)
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)                                      
    print(f'Our fold {fold} CV score is {val_score}')
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 3 with 3983 features...
--------------------------------------------------
Start Training fold 3
[LightGBM] [Info] Number of positive: 95063, number of negative: 272068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 553242
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 3983
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258935 -> initscore=-1.051512
[LightGBM] [Info] Start training from score -1.051512
[500]	training's binary_logloss: 0.265894	training's amex: 0.784579	valid_1's binary_logloss: 0.270852	valid_1's amex: 0.776311
[1000]	training's binary_logloss: 0.220583	training's amex: 0.804852	valid_1's binary_logloss: 0.231342	valid_1's amex: 0.786518
[1500]	training's binary_logloss: 0.205965	training's amex: 0.821195	valid_1's binary_logloss: 0.22299	valid_1's amex: 