In [3]:
import catboost as cb
import gc
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import warnings
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [13]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [14]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
# Plot theme: blue axes background, gold (#ffd700) as the first colour of the
# line cycle (the rest of the existing cycle is kept), and white text.
plt.rcParams.update({
    'axes.facecolor': '#0057b8',  # blue
    'axes.prop_cycle': cycler(
        color=['#ffd700', *plt.rcParams['axes.prop_cycle'].by_key()['color'][1:]]
    ),
    'text.color': 'w',
})

In [15]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read Data

In [9]:
%%time
# Load the v6 `train_agg` parquet from the processed-data directory
# (the recorded output shows `read_file` echoing the loaded shape).
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v6/train_agg.parquet")

Shape of data: (458913, 5064)
CPU times: user 17.8 s, sys: 29.4 s, total: 47.2 s
Wall time: 1min 28s


In [11]:
# Load the training labels CSV from the raw-data directory.
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


### Pre-train Feature Reduction

In [16]:
# Feature-importance table written by an earlier experiment
# (12.lgbm_dart_manual_split_42); used below to choose which columns to keep.
feature_imp = read_file(f"../{EXP_PATH}/12.lgbm_dart_manual_split_42/feature_importance.csv")

Shape of data: (183, 30)


In [29]:
def select_good_features(feature_imp_df, threshold=30):
    """Build the list of feature names whose importance reaches `threshold`.

    Every column of `feature_imp_df` after the first is treated as an
    importance column. For each such column, the rows whose value is at least
    `threshold` contribute the name ``"<base_feature>_<column name>"``.

    Args:
        feature_imp_df: DataFrame with a "base_feature" column of strings
            followed by one importance column per suffix.
        threshold: minimum importance (inclusive) for a feature to be kept.

    Returns:
        List of selected feature names, grouped by importance column in
        column order. Also prints the number selected next to the total
        number of (row, importance column) cells.
    """
    good_features = [
        feature_name
        for suffix in feature_imp_df.columns[1:]
        for feature_name in (
            feature_imp_df.loc[feature_imp_df[suffix] >= threshold, "base_feature"]
            + "_"
            + suffix
        ).tolist()
    ]
    n_rows, n_cols = feature_imp_df.shape
    print(len(good_features), n_rows * (n_cols - 1))
    return good_features

In [30]:
# Names of the columns to keep: every "<base_feature>_<importance column>"
# pair whose importance meets the default threshold of 30.
selected_features = select_good_features(feature_imp)

3046 5307


In [31]:
# Keep only the selected feature columns, in the order they were selected.
train_agg = train_agg[selected_features]

In [32]:
# Label array used for training.
# NOTE(review): labels are matched to `train_agg` rows purely by position;
# nothing here checks that both are in the same row order — confirm.
target = labels["target"].values

In [33]:
%%time
# Drop non-feature columns and the label if any are still present;
# errors="ignore" skips names that are not in the frame.
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
# Force a garbage-collection pass; the cell displays the number returned by gc.collect().
gc.collect()

CPU times: user 399 ms, sys: 2.73 s, total: 3.12 s
Wall time: 8.16 s


452

In [36]:
# Categorical columns of `train_agg` — presumably `get_cols` matches column
# names against CATEGORY_COLUMNS; confirm in utils.eda_helpers.
cat_features = get_cols(train_agg, CATEGORY_COLUMNS)

In [37]:
# Mark missing categorical values with -1; columns without any missing value
# are left untouched.
for cat_feature in tqdm(cat_features):
    if train_agg[cat_feature].isna().any():
        train_agg[cat_feature] = train_agg[cat_feature].fillna(-1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 34.58it/s]


In [38]:
# Shift every categorical code up by one (so the -1 missing marker becomes 0)
# and cast to int.
# NOTE: not idempotent — re-running this cell shifts the codes again.
train_agg[cat_features] = (train_agg[cat_features] + 1).astype(int)

In [39]:
# Sanity check: the feature matrix and the label array should have the same number of rows.
train_agg.shape, target.shape

((458913, 3046), (458913,))

### Tune CatBoost using Optuna (KFold)

In [41]:
# Load the Optuna study pickled under ../catboost_new_exp.
prev_study = joblib.load("../catboost_new_exp/optuna_study.pkl")

In [44]:
# Tabulate the previous study's trials as a DataFrame.
tdf = prev_study.trials_dataframe()

In [49]:
# Keep only the trials that recorded an objective value, renumbering rows from 0.
tdf = tdf[tdf["value"].notnull()].reset_index(drop=True)

In [52]:
# List the trial columns; the `params_*` ones hold the sampled hyper-parameters.
tdf.columns

Index(['number', 'value', 'datetime_start', 'datetime_complete', 'duration',
       'params_boosting_type', 'params_bootstrap_type',
       'params_colsample_bylevel', 'params_depth', 'params_l2_leaf_reg',
       'params_learning_rate', 'params_max_bin', 'params_min_data_in_leaf',
       'params_n_estimators', 'params_objective', 'params_subsample', 'state'],
      dtype='object')

In [67]:
def objective(trial):
    """Optuna objective: mean validation `amex_metric` score of CatBoost over 5 stratified folds.

    Draws a CatBoost hyper-parameter set from `trial`, fits one classifier per
    fold with early stopping on that fold's validation split, and scores the
    raw-margin (`RawFormulaVal`) predictions with `amex_metric`.

    Side effects: whenever a fold's validation score beats the stored best for
    that fold, `best_scores_json` is updated and rewritten to
    `{CURRENT_EXP_PATH}/best_scores.json`, and the fitted model is dumped to
    `{CURRENT_EXP_PATH}/models/model<fold>.pkl`.

    Reads the notebook-level names `train_agg`, `target`, `cat_features`,
    `best_scores_json` and `CURRENT_EXP_PATH`.
    NOTE(review): `best_scores_json` and `CURRENT_EXP_PATH` are not defined in
    the visible cells — confirm they exist before running (they may come from
    the `utils.constants` star-import or from a removed cell).

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        Mean validation score over the folds evaluated. This covers all five
        folds unless the overfitting guard below ends the trial early, in which
        case it covers the folds completed so far.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1020)

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        # "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.03, 0.1, log=True),
        "depth": trial.suggest_int("depth", 5, 8),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["MVS"]
        ),
        "n_estimators": trial.suggest_categorical("n_estimators", [1500, 1750, 2000]),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 2, 5, log=True),
        "min_data_in_leaf": trial.suggest_categorical("min_data_in_leaf", [1024, 2048]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.03, 0.06, 0.1]),
        "max_bin": trial.suggest_categorical("max_bin", [63, 127]),
        "used_ram_limit": "12gb",
    }

    # `subsample` is only sampled for Bernoulli bootstrap; with the current
    # search space ("MVS" only) this branch is never taken.
    if param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.6, 0.85)

    print(param)
    train_score_list, val_score_list = [], []
    # Folds are numbered from 1; the count comes from `kf` itself rather than
    # from a second hard-coded range.
    for fold_no, (idx_tr, idx_va) in enumerate(kf.split(train_agg, target), start=1):
        fold = str(fold_no)  # best_scores_json is indexed by the fold number as a string
        X_train, y_train = train_agg.iloc[idx_tr], target[idx_tr]
        X_val, y_val = train_agg.iloc[idx_va], target[idx_va]
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=UserWarning)
            model = cb.CatBoostClassifier(**param)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                verbose_eval=250,
                cat_features=cat_features,
                early_stopping_rounds=100
            )

        # amex_metric returns the overall score plus two component values
        # (unpacked here as `g` and `t4`), which are only used for logging.
        y_train_pred = model.predict(X_train, prediction_type='RawFormulaVal')
        train_score, train_g, train_t4 = amex_metric(y_train, y_train_pred)
        del X_train, y_train  # drop the fold copies before predicting on validation
        y_val_pred = model.predict(X_val, prediction_type='RawFormulaVal')
        val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)
        del X_val, y_val

        train_score_list.append(train_score)
        val_score_list.append(val_score)
        if val_score > best_scores_json["validation"][fold]:
            # New best for this fold: persist both the scores and the model.
            best_scores_json["train"][fold] = train_score
            best_scores_json["validation"][fold] = val_score
            with open(f'{CURRENT_EXP_PATH}/best_scores.json', "w") as outfile:
                json.dump(best_scores_json, outfile)
            joblib.dump(model, f'{CURRENT_EXP_PATH}/models/model{fold}.pkl')
        elif np.mean(train_score_list) >= np.mean(list(best_scores_json["train"].values())) + 0.02:
            # Overfitting guard: validation did not improve while the running
            # train mean is at least 0.02 above the stored best train mean, so
            # give up on the remaining folds of this trial.
            print("Train score too high (overfitting), start a new trial")
            return np.mean(val_score_list)
        print(f"{Fore.BLUE}{Style.BRIGHT}Fold {fold} | Train Score = {train_score:.5f} ({train_g:.4f}, {train_t4:.4f})")
        print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | Val Score = {val_score:.5f} ({val_g:.4f}, {val_t4:.4f}){Style.RESET_ALL}")
        print(f"Clear cache {gc.collect()}")

    return np.mean(val_score_list)

In [68]:
# study = joblib.load(f"{CURRENT_EXP_PATH}/optuna_study.pkl")

In [71]:
# Fresh in-memory study that maximises the objective's return value.
# NOTE(review): no sampler or seed is passed, so the sequence of sampled
# trials is not reproducible across runs — consider passing a seeded sampler.
study = optuna.create_study(direction="maximize")

[32m[I 2022-08-19 05:37:20,301][0m A new study created in memory with name: no-name-5c5a1760-3a40-4d1e-b311-08666100b4d3[0m


In [72]:
# Run up to 50 trials; each trial fits up to five CatBoost models, so this is
# a long-running cell (the recorded run was stopped with a KeyboardInterrupt).
study.optimize(objective, n_trials=50)

{'objective': 'Logloss', 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'n_estimators': 2000, 'l2_leaf_reg': 4.069578904464481, 'min_data_in_leaf': 1024, 'learning_rate': 0.1, 'max_bin': 63, 'used_ram_limit': '12gb'}
0:	learn: 0.5745848	test: 0.5751589	best: 0.5751589 (0)	total: 5.49s	remaining: 3h 2m 53s


KeyboardInterrupt: 

In [None]:
# joblib.dump(study, f"{CURRENT_EXP_PATH}/optuna_study.pkl")

In [62]:
# Tabulate the trials of `study` as a DataFrame.
# NOTE(review): the execution counts (In [62] here vs In [71] for
# create_study) show this cell was last run against an earlier `study`
# object; re-run it after the optimisation cell.
study_df = study.trials_dataframe()

In [63]:
# Keep only the trials whose state is COMPLETE.
study_df = study_df[study_df["state"] == "COMPLETE"]

In [None]:
# for col in get_cols(study_df, "params"):
#     sns.scatterplot(data=study_df, x=study_df[col], y=study_df["value"])
#     plt.show()