In [1]:
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from lightgbm import LGBMClassifier
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from skopt import BayesSearchCV
from tqdm import tqdm



In [2]:
from utils.eval_helpers import plot_roc_curves, plot_feature_importance, amex_metric, get_final_metric_df, amex_metric_np
from utils.eda_helpers import plot_missing_proportion_barchart, get_cols
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS, MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES

In [3]:
DATA_PATH = "../raw_data"
os.listdir(DATA_PATH)

['train_ftr',
 'train_labels.csv',
 '.DS_Store',
 'train_csv',
 'test_ftr',
 'test_csv',
 '.ipynb_checkpoints',
 'test_parquet',
 'train_parquet',
 'sample_submission.csv']

In [53]:
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"

In [5]:
%load_ext autoreload
%autoreload

In [6]:
RAW_TRAIN_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "train_parquet")

### Read Data

In [7]:
train_data = read_file(f"{PROCESSED_DATA_PATH}/experiment_train_agg_data.parquet")

Shape of data: (458913, 590)


In [None]:
# train_data = pd.read_parquet(f"{PROCESSED_DATA_PATH}/train_agg_with_ma.parquet")
# train_cluster_data = pd.read_parquet(f"{PROCESSED_DATA_PATH}/train_cluster_agg.parquet")
# test_data = pd.read_feather(f"{PROCESSED_DATA_PATH}/test_agg_data.ftr")

In [None]:
# train_labels = pd.read_csv(f"{DATA_PATH}/train_labels.csv")
# train_labels.shape
# train_labels.columns

### Copy from Kaggle

In [8]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats
import warnings
from colorama import Fore, Back, Style
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay
from lightgbm import log_evaluation

plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
plt.rcParams['text.color'] = 'w'

In [9]:
def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the competition metric: 0.5 * (normalized weighted Gini + capture rate at 4%).

    Args:
        y_true: Binary labels (1 = positive). Any array-like is accepted; it is
            flattened to a 1-D NumPy array so that pandas objects are indexed by
            position rather than by label.
        y_pred: One score per label, higher meaning "more likely positive". Only
            the ordering of the scores is used.

    Returns:
        The mean of ``g`` (weighted Gini divided by its maximum) and ``d`` (share
        of all positives found within the top 4% of cumulative row weight).
        The value is not meaningful when ``y_true`` has no positives, because
        both terms divide by the positive count.
    """
    # Coerce once so lists / Series / single-column frames behave like plain arrays.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sort labels by descending prediction value
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights (negatives weigh 20, positives 1)
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

In [10]:
def lgb_amex_metric(y_true, y_pred):
    """Expose ``amex_metric`` in the shape of a LightGBM custom eval metric.

    Returns a ``(metric_name, metric_value, is_higher_better)`` triple.
    """
    metric_value = amex_metric(y_true, y_pred)
    higher_is_better = True
    return 'amex', metric_value, higher_is_better

In [11]:
mean_features = [f"{col}_mean" for col in MEAN_FEATURES]
mean_features[:3]

['B_1_mean', 'B_2_mean', 'B_3_mean']

In [12]:
min_features = [f"{col}_min" for col in MIN_FEATURES]
min_features[:3]

['B_2_min', 'B_4_min', 'B_5_min']

In [13]:
max_features = [f"{col}_max" for col in MAX_FEATURES]
max_features[:3]

['B_1_max', 'B_2_max', 'B_3_max']

In [14]:
last_features = [f"{col}_last" for col in LAST_FEATURES]
last_features[:3]

['B_1_last', 'B_2_last', 'B_3_last']

In [15]:
ma_features = [col for col in train_data.columns if col.endswith("_ma")]
ma_features[:3]

['B_1_ma', 'B_2_ma', 'B_3_ma']

In [16]:
target = train_data["target"].values.astype(int)

In [17]:
%%time
# Assemble the feature list used for cross-validating the classifier
ONLY_FIRST_FOLD = False
excluded_columns = ['B_30_mean', 'B_38_mean', 'D_114_mean', 'D_117_mean', 'D_120_mean', 'D_126_mean', 
                    'B_30_max', 'B_38_max', 'D_63_max', 'D_64_max', 'D_116_max', 'D_117_max', 'D_126_max',
                    'D_68_last', 'D_117_last']
# min / max / last / moving-average aggregates, minus the exclusions (mean_features is currently not included)
features = [
    name
    for name in (min_features + max_features + last_features + ma_features)
    if name not in excluded_columns
]
print(f"{len(features)} features")

455 features
CPU times: user 1.06 ms, sys: 261 µs, total: 1.32 ms
Wall time: 1.16 ms


In [18]:
def my_booster(random_state=1, n_estimators=1400):
    """Build the unfitted LightGBM classifier used for each cross-validation fold.

    Args:
        random_state: Seed forwarded to ``LGBMClassifier``.
        n_estimators: Number of boosting rounds.

    Returns:
        An ``LGBMClassifier`` configured with this notebook's fixed hyperparameters.
    """
    return LGBMClassifier(n_estimators=n_estimators,
                          learning_rate=0.03, 
                          reg_alpha=10, 
                          reg_lambda=40,
                          min_child_samples=2200,
                          num_leaves=100,
                          colsample_bytree=0.18,
                          subsample=0.85,
                          max_bins=500, 
                          scale_pos_weight=2.5,
                          # honour the argument; it used to be hard-coded to 1 and silently ignored
                          random_state=random_state)

In [25]:
# train[["D_63_last", "D_64_last"]] = train[["D_63_last", "D_64_last"]].astype("category")

In [26]:
# train = train_data.copy()
# del train_data

In [27]:
n_est_list = [2200, 1500, 2000, 1000, 1600, 1800, 1800, 1200, 1400, 1000]

In [35]:
%%time
model_list, score_list, y_pred_list, held_out_index_list = [], [], [], []
model_dict = {}
X_val_dict = {}
y_val_dict = {}
y_score_dict = {}
# No shuffling is requested, so these are the same splits the training loop iterates over.
kf = StratifiedKFold(n_splits=10)
# Split train_data directly: the `train` copy is only created in commented-out code,
# so relying on it breaks this cell on a fresh kernel.
for fold, (idx_tr, idx_va) in enumerate(kf.split(train_data, target), start=1):
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load model files you trust.
    model_dict[fold] = joblib.load(f"{MODELS_PATH}/lgbm_models/model_{fold}.pkl")
    # Held-out rows of this fold, restricted to the model's feature columns
    X_val_dict[fold] = train_data.iloc[idx_va][features]
    y_val_dict[fold] = target[idx_va]
    # raw_score=True requests raw (untransformed) scores instead of probabilities
    y_score_dict[fold] = model_dict[fold].predict_proba(X_val_dict[fold], raw_score=True)

CPU times: user 2min 33s, sys: 5.95 s, total: 2min 39s
Wall time: 18 s


In [37]:
for i in range(1, 11):
    print(i, amex_metric_np(y_score_dict[i], y_val_dict[i]))

1 (0.7926485245718988, 0.9225014588046577, 0.6627955903391399)
2 (0.7964579447972834, 0.9232196849324444, 0.6696962046621223)
3 (0.7925930226971521, 0.9231478395540282, 0.6620382058402761)
4 (0.7926399569083695, 0.9236067948132042, 0.6616731190035348)
5 (0.7950098352708671, 0.9243909885016736, 0.6656286820400606)
6 (0.7910202913078714, 0.9238734531029936, 0.6581671295127494)
7 (0.7938510076983318, 0.9247381173911096, 0.6629638980055541)
8 (0.7992186922869893, 0.9268897955812998, 0.6715475889926786)
9 (0.798622045487242, 0.9257806558150126, 0.6714634351594715)
10 (0.7952555821646151, 0.9239286514957706, 0.6665825128334596)


In [48]:
full_train = pd.concat(list(X_val_dict.values()))
full_train.shape

(458913, 455)

In [47]:
full_train_gt = np.concatenate(list(y_val_dict.values()))
len(full_train_gt)

458913

In [49]:
full_train_scores = np.concatenate(list(y_score_dict.values()))
len(full_train_scores)

458913

In [50]:
full_train.loc[:, "target"] = full_train_gt
full_train.loc[:, "score"] = full_train_scores

In [54]:
# full_train.to_csv(f"{EVALUATION_DATA_PATH}/train_single_raw_score.csv", index=False)

In [55]:
full_train

Unnamed: 0,B_2_min,B_4_min,B_5_min,B_9_min,B_13_min,B_14_min,B_15_min,B_16_min,B_17_min,B_19_min,...,S_13_ma,S_15_ma,S_16_ma,S_18_ma,S_22_ma,S_23_ma,S_25_ma,S_26_ma,target,score
0,1.000242,0.000836,0.060492,0.000519,0.074886,0.009725,0.007219,0.000227,,0.000408,...,0.752799,0.226097,0.003458,0.004205,0.918648,0.134098,0.974310,0.008391,0,-6.496211
1,0.819772,0.001098,0.004075,0.001722,0.008499,0.001797,0.000095,0.002940,,0.001015,...,0.164745,0.317035,0.004295,0.007069,0.920174,0.135965,0.975657,0.003347,0,-6.449298
2,0.810796,0.013140,0.000215,0.000422,0.000427,0.000684,0.000019,0.001977,,0.000319,...,0.004189,0.505452,0.005355,0.002693,0.301263,0.134190,0.973330,0.005911,0,-5.601048
3,0.812053,0.003544,0.000228,0.001702,0.013755,0.006169,0.000218,0.004716,0.001711,0.001465,...,0.423689,0.467034,0.005881,0.004615,0.937522,0.135757,0.974665,0.019306,0,-4.880780
4,0.810670,0.041346,0.001201,0.002925,0.000626,0.000025,0.001513,0.000583,,0.000073,...,0.005394,0.490800,0.004895,0.002655,0.343495,0.134121,0.973398,0.004616,0,-5.740057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458908,1.000524,0.004972,0.040532,0.013571,0.141432,0.009684,0.000412,0.000289,,0.000197,...,0.440430,0.321818,0.005627,0.003546,0.952955,0.137854,0.974315,0.106441,0,-5.373140
458909,0.030446,0.197449,0.013285,0.143207,0.004136,0.068728,0.000122,1.000218,0.793540,0.750326,...,0.003654,0.501712,0.006267,0.023204,0.373902,0.135836,0.974862,0.003836,0,-3.536383
458910,0.004150,0.016421,0.016552,0.020363,0.036302,0.011283,0.000144,0.167052,0.000600,0.000187,...,0.375937,0.175208,0.005876,0.002034,0.945600,0.137512,0.974249,0.006207,0,-6.167538
458911,0.182818,0.132518,0.001633,0.001419,0.006190,0.001695,0.001029,0.423637,0.005333,0.000595,...,0.006610,0.535439,0.005432,0.003920,0.914448,0.134605,0.974570,0.004380,1,-1.637748


In [115]:
%%time
model_list, score_list, y_pred_list, held_out_index_list = [], [], [], []
kf = StratifiedKFold(n_splits=10)
# Split train_data directly: the `train` copy is only created in commented-out code,
# so relying on it breaks this cell on a fresh kernel.
# Each fold gets its own number of boosting rounds from n_est_list.
for fold, (idx_tr, idx_va), n_est in zip(range(1, 10+1), kf.split(train_data, target), n_est_list):
    # Drop references to the previous fold's objects before building the new ones
    X_train, X_val, y_train, y_val, model = None, None, None, None, None
    start_time = datetime.datetime.now()
    X_train = train_data.iloc[idx_tr][features]
    X_val = train_data.iloc[idx_va][features]
    y_train = target[idx_tr]
    y_val = target[idx_va]
    
    model = my_booster(n_estimators=n_est)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(X_train, 
                  y_train,
                  eval_set=[(X_val, y_val)], 
                  eval_metric=[lgb_amex_metric],
                  callbacks=[log_evaluation(200)])
    # The training frame is no longer needed once the model is fitted
    X_train, y_train = None, None
    # The metric only uses the ordering of the scores, so raw scores are sufficient
    y_val_pred = model.predict_proba(X_val, raw_score=True)
    score = amex_metric(y_val, y_val_pred)
    n_trees = model.best_iteration_
    if n_trees is None: 
        # fall back to the configured number of rounds when no best iteration is recorded
        n_trees = model.n_estimators
    print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | {str(datetime.datetime.now() - start_time)[-12:-7]} |"
          f" {n_trees:5} trees |"
          f"                Score = {score:.5f}{Style.RESET_ALL}")
    score_list.append(score)
    model_list.append(model)
    held_out_index_list.append(idx_va)
    # if INFERENCE:
    #     y_pred_list.append(model.predict_proba(test[features], raw_score=True))
        
    # if ONLY_FIRST_FOLD:
    #     break # we only want the first fold
    
# Reported value is the mean of the per-fold validation scores
print(f"{Fore.GREEN}{Style.BRIGHT}OOF Score:                       {np.mean(score_list):.5f}{Style.RESET_ALL}")

[200]	valid_0's binary_logloss: 0.257673	valid_0's amex: 0.779884
[400]	valid_0's binary_logloss: 0.248665	valid_0's amex: 0.786612
[600]	valid_0's binary_logloss: 0.245106	valid_0's amex: 0.78903
[800]	valid_0's binary_logloss: 0.242709	valid_0's amex: 0.789847
[1000]	valid_0's binary_logloss: 0.240813	valid_0's amex: 0.790951
[1200]	valid_0's binary_logloss: 0.239233	valid_0's amex: 0.791413
[1400]	valid_0's binary_logloss: 0.238026	valid_0's amex: 0.792171
[1600]	valid_0's binary_logloss: 0.236806	valid_0's amex: 0.79263
[32m[1mFold 1 | 02:19 |  1600 trees |                Score = 0.79265[0m
[200]	valid_0's binary_logloss: 0.253284	valid_0's amex: 0.780862
[400]	valid_0's binary_logloss: 0.244367	valid_0's amex: 0.789513
[600]	valid_0's binary_logloss: 0.241008	valid_0's amex: 0.793303
[800]	valid_0's binary_logloss: 0.238839	valid_0's amex: 0.794986
[1000]	valid_0's binary_logloss: 0.237113	valid_0's amex: 0.795514
[1200]	valid_0's binary_logloss: 0.235597	valid_0's amex: 0.7958

In [118]:
# Persist one model per fold as model_1.pkl, model_2.pkl, ... under MODELS_PATH,
# the same location the fold-scoring cell loads them from.
for idx, model in enumerate(model_list):
    joblib.dump(model, f"{MODELS_PATH}/lgbm_models/model_{idx+1}.pkl")

### Train Val Split

In [90]:
train, val = train_test_split(train_data, test_size=0.175, random_state=923, stratify=train_data["target"])

NameError: name 'train_data' is not defined

In [None]:
train["target"].mean(), val["target"].mean()

In [None]:
special_columns = ['customer_ID', 'S_2', 'target', "Date", "min_date", "max_date", "num_records", "days", "record_per_day"]

In [None]:
all_cols = [c for c in train.columns if c not in special_columns]
cat_features = train.select_dtypes("category").columns.tolist()
num_features = [col for col in all_cols if col not in cat_features]

In [None]:
len(all_cols), len(cat_features), len(num_features)

In [None]:
cluster_columns = get_cols(train, "_cluster")
nunique_columns = get_cols(train, "_nunique")
non_D66_count_columns = [col for col in get_cols(train, "_count") if "D_66" not in col]

In [None]:
selected_features = list(set(all_cols) - set(cluster_columns) - set(nunique_columns) - set(non_D66_count_columns))

In [None]:
len(all_cols), len(selected_features)

In [None]:
X_train = train.loc[:, selected_features]
X_val = val.loc[:, selected_features]

In [None]:
y_train = train["target"]
y_val = val["target"]

In [None]:
def _amex_metric(dy_true, dy_pred):
    """An eval metric that always returns the same value"""
    metric_name = 'Amex Metric'
    value = amex_metric_np(dy_pred, dy_true)
    is_higher_better = True
    return metric_name, value, is_higher_better

In [None]:
fit_params = {"early_stopping_rounds" : 200, 
              # "eval_metric" : 'auc', 
              "eval_set" : [(X_val, y_val)],
              'eval_names': ['valid'],
              'verbose': 1,
              'categorical_feature': 'auto'}

In [None]:
# Search space for the randomized hyperparameter search:
# lists are sampled uniformly, scipy distributions are sampled via their rvs().
param_test = {'learning_rate' : [0.01, 0.03, 0.05, 0.07, 0.08, 0.1, 0.12],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000],
              'num_leaves': sp_randint(10, 150), 
              'min_child_samples': sp_randint(25, 300), 
              'min_child_weight': [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1],
              'min_split_gain': [0.001, 0.003, 0.01, 0.025, 0.05, 0.1],
              'subsample': sp_uniform(loc=0.4, scale=0.6), 
              # 'subsample_freq': [5, 10, 15],
              'max_depth': [-1, 3, 5, 7, 9, 11, 13, 15],
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 3e-1, 1, 3, 6, 10, 20, 50],
              'reg_lambda': [0, 1e-1, 3e-1, 1, 3, 6, 10, 20, 50],
              # 0 removed: LightGBM requires scale_pos_weight > 0
              'scale_pos_weight': [1, 2, 3, 4],
              'xgboost_dart_mode': [True, False]
              }

# Number of combinations
n_iter = 50 

In [None]:
# Initializing the LightGBM classifier that serves as the base estimator for the search
lgbm_clf = LGBMClassifier(random_state=1020, 
                          objective="binary", 
                          boosting_type='dart', 
                          n_jobs=-1) # silent=True, 

In [None]:
# Randomized (not exhaustive) search over param_test with 5-fold CV; refit=True retrains on the best draw.
# NOTE(review): _amex_metric takes (dy_true, dy_pred), whereas scikit-learn documents a callable
# `scoring` as scorer(estimator, X, y) -- confirm this works, or wrap the metric with make_scorer.
grid_search = RandomizedSearchCV(
    estimator=lgbm_clf, 
    param_distributions=param_test, 
    n_iter=n_iter,
    scoring=_amex_metric,
    cv=5,
    refit=True,
    random_state=923,
    verbose=True)

In [None]:
grid_search.fit(X_train, y_train, **fit_params)
opt_parameters = grid_search.best_params_

In [None]:
best_params = {
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.8,
    'importance_type': 'split',
    'learning_rate': 0.07,
    'max_depth': 12,
    'min_child_samples': 30,
    'min_child_weight': 0.003,
    'min_split_gain': 0.02,
    'n_estimators': 110,
    'n_jobs': -1,
    'num_leaves': 80,
    'objective': "binary",
    'random_state': 923,
    'reg_alpha': 1,
    'reg_lambda': 4,
    'silent': True,
    'subsample': 0.85,
    'subsample_for_bin': 100000,
    'subsample_freq': 5,
    'scale_pos_weight': 4
}

In [None]:
lgbm_clf = LGBMClassifier(**best_params)

In [None]:
lgbm_clf.fit(X_train, y_train)

In [None]:
y_train_pred = lgbm_clf.predict_proba(X_train)[:, 1]
y_val_pred = lgbm_clf.predict_proba(X_val)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred, y_val_pred], 
                labels=["Train", "Validation"], 
                title="Train Validation ROC AUC")

In [None]:
imp_df = plot_feature_importance(lgbm_clf.feature_name_, 
                                 lgbm_clf.feature_importances_, 
                                 title="Feature Importance",
                                 limit=50)

In [None]:
# One describe() column of feature importances per aggregation keyword.
# Build all columns first and concatenate once instead of growing a frame inside the loop.
keyword_summaries = [
    imp_df.loc[imp_df["feature"].str.contains(keyword)]
    .describe()
    .rename(columns={"feature_importance": keyword})
    for keyword in ["_mean", "std", "min", "max", "first", "last", "count"]
]
result = pd.concat(keyword_summaries, axis=1)

In [None]:
result

In [None]:
cluster_feature_imp_df = imp_df.loc[imp_df["feature"].str.contains("count")]

In [None]:
# One describe() column of feature importances per variable-group prefix.
# Build all columns first and concatenate once instead of growing a frame inside the loop.
prefix_summaries = [
    cluster_feature_imp_df.loc[cluster_feature_imp_df["feature"].str.contains(keyword)]
    .describe()
    .rename(columns={"feature_importance": keyword})
    for keyword in ["B_", "S_", "R_", "P_", "D_"]
]
cluster_result = pd.concat(prefix_summaries, axis=1)

In [None]:
imp_df.shape[0], result.loc["count"].sum()

### Test Metric

In [None]:
y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
y_val_pred_df = pd.DataFrame(y_val_pred).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_val_df, y_val_pred_df)

In [None]:
amex_metric(y_val_df, y_val_pred_df)

In [None]:
amex_metric(y_val_df, y_val_pred_df)

In [None]:
# feature_imp_thr = imp_df.loc[imp_df["feature"] == "dummy"]["feature_importance"].values[0]
# feature_imp_thr

In [None]:
selected_features = imp_df.loc[imp_df["feature_importance"] > 0]["feature"].tolist()
len(selected_features)

### Train once

In [None]:
best_params = {'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.7,
 'importance_type': 'split',
 'learning_rate': 0.07,
 'max_depth': 7,
 'min_child_samples': 50,
 'min_child_weight': 0.05,
 'min_split_gain': 0.04,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 80,
 'objective': None,
 'random_state': 1020,
 'reg_alpha': 5,
 'reg_lambda': 15,
 'silent': True,
 'subsample': 0.7,
 'subsample_for_bin': 20000,
 'subsample_freq': 5,
 'scale_pos_weight': 1}

In [None]:
final_lgbm_clf = LGBMClassifier(**best_params)

In [None]:
final_lgbm_clf.fit(X_train, y_train)

In [None]:
y_train_pred = final_lgbm_clf.predict_proba(X_train)[:, 1]
y_val_pred = final_lgbm_clf.predict_proba(X_val)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred, y_val_pred], 
                labels=["Train", "Validation"], 
                title="Train Validation ROC AUC")

In [None]:
imp_df = plot_feature_importance(final_lgbm_clf.feature_name_, 
                                 final_lgbm_clf.feature_importances_, 
                                 title="Feature Importance",
                                 limit=50)

In [None]:
# feature_imp_thr = imp_df.loc[imp_df["feature"] == "dummy"]["feature_importance"].values[0]
# feature_imp_thr

In [None]:
selected_features = imp_df.loc[imp_df["feature_importance"] > 0]["feature"].tolist()
len(selected_features)

#### Final x2 LGBM

In [None]:
selected_features = [col for col in selected_features if "first" not in col]
len(selected_features)

In [None]:
final_lgbm_clf = LGBMClassifier(**best_params)

In [None]:
X_train_ = X_train.loc[:, selected_features]
X_val_ = X_val.loc[:, selected_features]

In [None]:
final_lgbm_clf.fit(X_train_, y_train)

In [None]:
y_train_pred_ = final_lgbm_clf.predict_proba(X_train_)[:, 1]
y_val_pred_ = final_lgbm_clf.predict_proba(X_val_)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred_, y_val_pred_], 
                labels=["Train", "Validation"], 
                title="Train Validation ROC AUC")

In [None]:
# NOTE(review): y_res is only assigned further down (SMOTE section) and y_res_pred_ is not assigned
# anywhere in the visible notebook, so this cell depends on leftover kernel state -- confirm or remove.
y_res_df = pd.DataFrame(y_res).reset_index(drop=True)
y_res_pred_df = pd.DataFrame(y_res_pred_).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_res_df, y_res_pred_df)

In [None]:
y_val.values[:40]

In [None]:
temp_ = [1 if c > 0.95 else c for c in y_val_pred_]
# temp_[:40]

In [None]:
y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
y_val_pred_df = pd.DataFrame(y_val_pred_).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_val_df, y_val_pred_df)

### END

### LGBM Model

In [None]:
opt = BayesSearchCV(
    LGBMClassifier(random_state=1020),
    {
        'learning_rate': (0.04, 0.1),
        'num_leaves': (31, 127),
        'max_depth': (5, 20),
        'min_child_samples': (15, 63),
        'n_estimators': (80, 160),
        'subsample': (0.7, 0.9),
        'subsample_freq': (2, 5),
        'colsample_bytree': (0.6, 0.95),
        'reg_alpha': (0, 15),
        'reg_lambda': (0, 15),
        'min_split_gain': (0, 0.05),
        'scale_pos_weight': (0.1, 10)
    },
    n_iter=5,
    cv=5,
    scoring=make_scorer(fbeta_score, beta=2)
)

In [None]:
start = time.time()
opt.fit(X_train, y_train, verbose=1)
end = time.time()
print(f"Done in {end - start:.2f} seconds")

In [None]:
best_params = opt.best_estimator_.get_params()

In [None]:
best_params

In [None]:
imp_df = plot_feature_importance(final_lgbm_clf.feature_name_, 
                                 final_lgbm_clf.feature_importances_, 
                                 title="Feature Importance",
                                 limit=50)

In [None]:
# Sweep the importance threshold upwards, refit on the surviving features each time,
# and record train / validation scores against the number of features kept.
# NOTE(review): this loop rebinds notebook-level names (selected_features, lgbm_clf, y_*_pred, y_*_df)
# that other cells also read -- re-run the cells that define them before reusing those names.
num_features_list, train_score_list, val_score_list = [], [], []
for imp_value in tqdm(sorted(imp_df["feature_importance"].unique())):
    kept_features = imp_df.loc[imp_df["feature_importance"] > imp_value]["feature"].tolist()
    if not kept_features:
        # Thresholds ascend, so the largest importance leaves nothing to train on; stop here
        # instead of fitting a model on zero columns.
        break
    selected_features = kept_features
    print(f"# of features: {len(selected_features)}")
    
    X_train_new = train.loc[:, selected_features]
    X_val_new = val.loc[:, selected_features]
    
    # `params` is not defined anywhere in the visible notebook; best_params is the
    # hyperparameter dict the surrounding cells build and use.
    lgbm_clf = LGBMClassifier(**best_params)
    lgbm_clf.fit(X_train_new, y_train)
    
    y_train_pred = lgbm_clf.predict_proba(X_train_new)[:, 1]
    y_val_pred = lgbm_clf.predict_proba(X_val_new)[:, 1]
    
    y_train_df = pd.DataFrame(y_train).reset_index(drop=True)
    y_train_pred_df = pd.DataFrame(y_train_pred).rename(columns={0: "prediction"})
    y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
    y_val_pred_df = pd.DataFrame(y_val_pred).rename(columns={0: "prediction"})
    
    train_score = amex_metric(y_train_df, y_train_pred_df)
    val_score = amex_metric(y_val_df, y_val_pred_df)
    num_features_list.append(len(selected_features))
    train_score_list.append(train_score)
    val_score_list.append(val_score)

In [None]:
train_scores = [i[0] for i in train_score_list]
val_scores = [i[0] for i in val_score_list]

In [None]:
eval_df = pd.DataFrame(dict(num_feature=num_features_list, 
                            train_score=train_scores, 
                            val_score=val_scores)).iloc[:-10]

In [None]:
plt.figure(figsize=(17, 6))
plt.plot(eval_df["num_feature"], eval_df["train_score"], label="Train")
plt.plot(eval_df["num_feature"], eval_df["val_score"], label="Validation")
plt.legend()
plt.show()

### LGBM Error Analysis

In [None]:
X_validation = X_val_.reset_index(drop=True)

In [None]:
top4_pct_df, gini_df = get_final_metric_df(X_validation, y_val_df, y_val_pred_df)

In [None]:
failed_top4_pct_indices = top4_pct_df.loc[(top4_pct_df["is_cutoff"] == 1) & (top4_pct_df["target"] == 0)].index

In [None]:
# failed_top4_pct_indices

In [None]:
failed_top4_pct_indices

In [None]:
success_top4 = top4_pct_df.loc[~top4_pct_df.index.isin(failed_top4_pct_indices)]
failed_top4 = top4_pct_df.loc[failed_top4_pct_indices]

In [None]:
success_top4.shape, failed_top4.shape

In [None]:
column = "P_2_last"

In [None]:
top4_pct_df.groupby("target")[column].mean()

In [None]:
success_top4[column].describe()

In [None]:
failed_top4[column].describe()

In [None]:
# save model
# joblib.dump(final_lgbm_clf, '../models/lgbm_version1.pkl')

In [None]:
# load model (path built from MODELS_PATH, like the per-fold model files)
# NOTE(review): joblib.load unpickles arbitrary objects -- only load model files from a trusted source.
loaded_lgbm_model = joblib.load(f"{MODELS_PATH}/lgbm_version1.pkl")

### SMOTE

In [None]:
train.loc[:, num_features] = train.loc[:, num_features].fillna(0)

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=923)
X_res, y_res = sm.fit_resample(train.loc[:, num_features], 
                               train["target"])

In [None]:
X_res.shape, y_res.shape

### Inference

In [None]:
X_test = test_data.loc[:, selected_features]

In [None]:
y_test_pred = loaded_lgbm_model.predict_proba(X_test)[:, 1]

In [None]:
test_data["prediction"] = y_test_pred

In [None]:
test_data.head()

#### Submission

In [None]:
submission = pd.read_csv(f"{DATA_PATH}/sample_submission.csv")

In [None]:
submission = submission.drop(columns="prediction")

In [None]:
result = submission.merge(test_data[["customer_ID", "prediction"]], on="customer_ID")

In [None]:
# result.to_csv(f"{SUBMISSION_DATA_PATH}/submission4.csv", index=False)