In [82]:
import catboost as cb
import gc
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import warnings
warnings.simplefilter("ignore")
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [83]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [84]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
# Plot theme: blue axes background, '#ffd700' as the first colour of the
# otherwise-default colour cycle, and white text.
_default_cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
plt.rcParams.update({
    'axes.facecolor': '#0057b8',  # blue
    'axes.prop_cycle': cycler(color=['#ffd700'] + _default_cycle_colors[1:]),
    'text.color': 'w',
})

In [85]:
# Load IPython's autoreload extension (presumably so edits to the local `utils` modules are picked up).
%load_ext autoreload
# NOTE(review): bare `%autoreload` carries no mode argument — confirm whether `%autoreload 2` was intended.
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read Data

In [39]:
%%time
# Load the five pre-split validation folds, keyed by fold index.
# The dict is rebound before loading so frames from a previous run are released first.
df_dict = {}
for fold_idx in range(5):
    fold_path = f"../{PROCESSED_DATA_PATH}/v9/validation_fold{fold_idx}.parquet"
    df_dict[fold_idx] = read_file(fold_path)

Shape of data: (82603, 4370)
Shape of data: (82603, 4370)
Shape of data: (82603, 4370)
Shape of data: (82602, 4370)
Shape of data: (82602, 4370)
CPU times: user 45.7 s, sys: 1min 53s, total: 2min 38s
Wall time: 9.57 s


In [40]:
# Read the raw training labels and keep the `target` column's values.
# NOTE(review): `target` is not referenced again in the visible cells (per-fold labels are
# taken from the fold frames' own "target" column) — confirm this cell is still needed.
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")
target = labels["target"].values

Shape of data: (458913, 2)


In [42]:
# Snapshot each fold's label column before it is dropped from the feature frames.
y_dict = {fold_idx: df_dict[fold_idx]["target"].copy() for fold_idx in range(5)}

In [51]:
%%time
# Remove the non-feature columns and the label from every fold frame.
# Frames are replaced one at a time; columns that are absent are ignored.
columns_to_drop = NON_FEATURE_COLUMNS + ["target"]
for fold_idx in range(5):
    df_dict[fold_idx] = df_dict[fold_idx].drop(columns=columns_to_drop, errors="ignore")
gc.collect()

CPU times: user 1.14 s, sys: 1.95 s, total: 3.09 s
Wall time: 3.09 s


2858

In [57]:
# Select the categorical feature columns of the first fold via `get_cols`;
# the result is passed to CatBoost as `cat_features` when fitting.
# The recorded output of this cell is `[]`, i.e. no columns were selected in that run.
cat_features = get_cols(df_dict[0], CATEGORY_COLUMNS)
print(cat_features)

[]


### Tune CatBoost using Optuna (KFold)

In [31]:
# prev_study = joblib.load(f"../catboost_new_exp/optuna_study.pkl")
# tdf = prev_study.trials_dataframe()
# tdf = tdf.loc[~tdf["value"].isnull()].reset_index(drop=True)
# tdf.columns

In [86]:
def objective(trial):
    """Optuna objective: cross-validated CatBoost scored with `amex_metric`.

    A CatBoost configuration is sampled from `trial`. For every fold in the
    notebook-level `df_dict`, a classifier is trained on the remaining folds
    with the held-out fold as `eval_set`, and both the training data and the
    held-out fold are scored on raw (pre-sigmoid) predictions.

    Side effects:
        When a fold's validation score beats the value stored in the
        notebook-level `best_scores_json["validation"][fold]`, the train and
        validation records are updated, the whole structure is written to
        `{CURRENT_EXP_PATH}/best_scores.json`, and the fitted model is dumped
        to `{CURRENT_EXP_PATH}/models/model{fold}.pkl`.
        NOTE(review): `best_scores_json` and `CURRENT_EXP_PATH` are not defined
        in the visible cells; they must exist before this function is called.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        The mean validation score over the folds that were evaluated. The loop
        stops early (so the mean may cover fewer folds) when a fold does not
        improve on the stored best and the running mean train score exceeds
        the stored mean train score by at least 0.02.
    """
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.03, 0.1, log=True),
        "depth": trial.suggest_int("depth", 5, 8),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["MVS"]
        ),
        "n_estimators": trial.suggest_categorical("n_estimators", [1500, 1750, 2000]),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 2, 5, log=True),
        "min_data_in_leaf": trial.suggest_categorical("min_data_in_leaf", [1024, 2048]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.03, 0.06, 0.1]),
        "max_bin": trial.suggest_categorical("max_bin", [63, 127]),
        # NOTE(review): the recorded run reports CatBoost using 61.5GiB against this limit.
        "used_ram_limit": "20gb",
        "thread_count": 16,
#         "task_type": "GPU",
        # NOTE(review): presumably only relevant together with the GPU task type above — confirm.
        "devices": '0:1'
    }

    # Only reachable if "Bernoulli" is added to the bootstrap_type choices above.
    if param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.6, 0.85)

    print(param)
    train_score_list, val_score_list = [], []
    # Take the fold ids from the data itself instead of assuming exactly five folds.
    fold_ids = sorted(df_dict)
    for fold in fold_ids:
        # Build features and labels from the SAME ordered id list so rows stay
        # aligned regardless of the insertion order of `df_dict`.
        train_ids = [idx for idx in fold_ids if idx != fold]
        X_train = pd.concat([df_dict[idx] for idx in train_ids], ignore_index=True)
        X_val = df_dict[fold]
        print("X shape: ", X_train.shape, X_val.shape)
        y_train = pd.concat([y_dict[idx] for idx in train_ids], ignore_index=True)
        y_val = y_dict[fold]
        print("Y shape: ", y_train.shape, y_val.shape)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=UserWarning)
            model = cb.CatBoostClassifier(**param)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                use_best_model=True,
                metric_period=50,
                verbose_eval=200,
                cat_features=cat_features,
                early_stopping_rounds=200
            )
        y_train_pred = model.predict(X_train, prediction_type='RawFormulaVal')
        train_score, train_g, train_t4 = amex_metric(y_train, y_train_pred)
        # Release the concatenated training copy before scoring the held-out fold.
        del X_train, y_train
        y_val_pred = model.predict(X_val, prediction_type='RawFormulaVal')
        val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)
        del X_val, y_val
        train_score_list.append(train_score)
        val_score_list.append(val_score)
        if val_score > best_scores_json["validation"][fold]:
            # New best for this fold: persist both the scores and the model.
            best_scores_json["train"][fold] = train_score
            best_scores_json["validation"][fold] = val_score
            with open(f'{CURRENT_EXP_PATH}/best_scores.json', "w") as outfile:
                json.dump(best_scores_json, outfile)
            joblib.dump(model, f'{CURRENT_EXP_PATH}/models/model{fold}.pkl')
        elif np.mean(train_score_list) >= np.mean(list(best_scores_json["train"].values())) + 0.02:
            # No validation gain while the train score runs well ahead of the
            # stored best: abandon the remaining folds of this trial.
            print("Train score too high (overfitting), start a new trial")
            return np.mean(val_score_list)
        print(f"{Fore.BLUE}{Style.BRIGHT}Fold {fold} | Train Score = {train_score:.5f} ({train_g:.4f}, {train_t4:.4f})")
        print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | Val Score = {val_score:.5f} ({val_g:.4f}, {val_t4:.4f}){Style.RESET_ALL}")
        print(f"Clear cache {gc.collect()}")

    return np.mean(val_score_list)

In [87]:
# study = joblib.load(f"{CURRENT_EXP_PATH}/optuna_study.pkl")

In [88]:
# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across notebook re-runs; without an explicit sampler the study is unseeded.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

[32m[I 2022-08-20 19:01:54,771][0m A new study created in memory with name: no-name-166705e7-a283-47b4-b8ed-840f7d9b0138[0m


In [None]:
# Run 50 trials of `objective`; each trial fits one CatBoost model per fold.
# NOTE(review): the recorded output shows CatBoost warning that it uses 61.5GiB of CPU RAM
# against the 20gb `used_ram_limit` set in `objective` — revisit the limit or the data size.
study.optimize(objective, n_trials=50)

{'objective': 'Logloss', 'colsample_bylevel': 0.09575321337561558, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'n_estimators': 2000, 'l2_leaf_reg': 3.59780135098431, 'min_data_in_leaf': 2048, 'learning_rate': 0.03, 'max_bin': 127, 'used_ram_limit': '20gb', 'thread_count': 16, 'devices': '0:1'}
X shape:  (330410, 4368) (82603, 4368)
Y shape:  (330410,) (82603,)


CatBoost is using more CPU RAM (61.5GiB) than the limit (20GiB)
CatBoost is using more CPU RAM (61.5GiB) than the limit (20GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(1148704) > ResourceQuota(0)
Resource CPU

In [None]:
# joblib.dump(study, f"{CURRENT_EXP_PATH}/optuna_study.pkl")

In [62]:
# Export the study's trials as a DataFrame (filtered to completed trials in the next cell).
study_df = study.trials_dataframe()

In [63]:
# Keep only the trials whose state is "COMPLETE".
is_complete = study_df["state"] == "COMPLETE"
study_df = study_df.loc[is_complete]

In [None]:
# for col in get_cols(study_df, "params"):
#     sns.scatterplot(data=study_df, x=study_df[col], y=study_df["value"])
#     plt.show()