In [9]:
import catboost as cb
import gc
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import sys
sys.path.append("../")
import warnings
warnings.simplefilter("ignore")
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)
from utils.feature_engineering_helpers import feature_gen_pipeline

In [3]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
plt.rcParams['text.color'] = 'w'

In [4]:
%load_ext autoreload
%autoreload

### Read Data

In [5]:
%%time
risk_df = read_file(f"{INTERIM_DATA_PATH}/v6/train_parquet/train_risk.parquet")
spend_payment_df = read_file(f"{INTERIM_DATA_PATH}/v6/train_parquet/train_spend_payment.parquet")

Shape of data: (5531451, 31)
Shape of data: (5531451, 24)
CPU times: user 5.47 s, sys: 3.23 s, total: 8.7 s
Wall time: 3.95 s


In [84]:
train_agg = read_file(f"{PROCESSED_DATA_PATH}/v6/train_agg.parquet")


Shape of data: (458913, 5064)


In [85]:
labels = read_file(f"{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [86]:
# %%time
# train = pd.concat([
#     spend_payment_df, 
#     risk_df.loc[:, get_cols(risk_df, "R_")]
# ], axis=1)

In [87]:
# train.loc[:, get_cols(train, "R_")] = train.loc[:, get_cols(train, "R_")].fillna(-127)

#### Generate Feature

In [88]:
# %%time
# train_agg, keep_column = feature_gen_pipeline(train)

In [89]:
# train_agg.head(3)

In [71]:
# len(features_to_drop)
# train_agg = train_agg.drop(columns=list(features_to_drop), errors="ignore")

In [91]:
target = labels["target"].values

In [92]:
%%time
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS + ["target"], errors="ignore")
gc.collect()

CPU times: user 4.96 s, sys: 19.6 s, total: 24.5 s
Wall time: 35.3 s


642

In [93]:
cat_features = train_agg.select_dtypes("category").columns.tolist()

In [94]:
train_agg.loc[:, cat_features] = train_agg.loc[:, cat_features].astype(float).fillna(-1).astype(int).astype("category")

In [158]:
feature_imp = dict(zip(model.feature_names_, model.feature_importances_))

In [164]:
noob_features = [k for k, v in feature_imp.items() if v < 0.005]

In [165]:
len(noob_features)

600

In [166]:
train_agg = train_agg.drop(noob_features, errors="ignore")

### Tune Catboost

In [167]:
def objective(trial):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1020)

    param = {
        "objective": "Logloss",
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.03, 0.1, log=True),
        "depth": trial.suggest_int("depth", 8, 15),
        "random_strength": trial.suggest_float("random_strength", 1.1, 1.5),
        "class_weights": trial.suggest_categorical("class_weights", [[1, 1.25], [1, 1.5], [1, 2]]),
        "n_estimators": trial.suggest_categorical("n_estimators", [500, 750, 1000]),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 2, 8, log=True),
        "min_data_in_leaf": trial.suggest_categorical("min_data_in_leaf", [1024, 2048, 3072]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.03, 0.05, 0.08]),
        "max_bin": trial.suggest_categorical("max_bin", [63, 127, 255]),
        "used_ram_limit": "16gb",
        "bootstrap_type": "MVS",
        # "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "grow_policy": "Depthwise"
    }

    if param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.6, 0.85)
    
    print(param)
    train_score_list, val_score_list = [], []
    for fold, (idx_tr, idx_va) in zip(range(1, 5+1), kf.split(train_agg, target)):
        fold = str(fold)
        X_train, y_train = train_agg.iloc[idx_tr], target[idx_tr]
        X_val, y_val = train_agg.iloc[idx_va], target[idx_va]
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=UserWarning)
            model = cb.CatBoostClassifier(**param)
            model.fit(
                X_train, 
                y_train, 
                eval_set=[(X_val, y_val)], 
                verbose=0, 
                cat_features=cat_features,
                early_stopping_rounds=100
            )
        y_train_pred = model.predict(X_train, prediction_type='RawFormulaVal')
        train_score, train_g, train_t4 = amex_metric(y_train, y_train_pred)
        train_data, X_train, y_train = None, None, None
        y_val_pred = model.predict(X_val, prediction_type='RawFormulaVal')
        val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)
        valid_data, X_val, y_val = None, None, None
        train_score_list.append(train_score)
        val_score_list.append(val_score)
        # if val_score > best_scores_json["validation"][fold]:
        #     best_scores_json["train"][fold] = train_score
        #     best_scores_json["validation"][fold] = val_score
        #     with open(f'{CURRENT_EXP_PATH}/best_scores.json', "w") as outfile:
        #         json.dump(best_scores_json, outfile)
        #     joblib.dump(model, f'{CURRENT_EXP_PATH}/models/model{fold}.pkl')
        # elif np.mean(train_score_list) >= np.mean(list(best_scores_json["train"].values())) + 0.02:
        #     print(f"Train score too high (overfitting), start a new trial")
        #     return np.mean(val_score_list)
        print(f"{Fore.BLUE}{Style.BRIGHT}Fold {fold} | Train Score = {train_score:.5f} ({train_g:.4f}, {train_t4:.4f})")
        print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | Val Score = {val_score:.5f} ({val_g:.4f}, {val_t4:.4f}){Style.RESET_ALL}")
        print(f"Clear cache {gc.collect()}")
        
    return np.mean(val_score_list)

In [110]:
# study = joblib.load(f"{CURRENT_EXP_PATH}/optuna_study.pkl")

In [168]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-08-20 07:47:35,410][0m A new study created in memory with name: no-name-babff6bd-89f7-4787-8910-ed8fc676e224[0m


In [169]:
study.optimize(objective, n_trials=20)

{'objective': 'Logloss', 'colsample_bylevel': 0.04102624715853323, 'depth': 11, 'random_strength': 1.2633436553615531, 'class_weights': [1, 2], 'n_estimators': 1000, 'l2_leaf_reg': 7.532364531307307, 'min_data_in_leaf': 3072, 'learning_rate': 0.03, 'max_bin': 255, 'used_ram_limit': '16gb', 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise'}
[34m[1mFold 1 | Train Score = 0.85778 (0.9509, 0.7646)
[32m[1mFold 1 | Val Score = 0.79111 (0.9217, 0.6606)[0m
Clear cache 85219
[34m[1mFold 2 | Train Score = 0.85752 (0.9509, 0.7641)
[32m[1mFold 2 | Val Score = 0.79243 (0.9229, 0.6620)[0m
Clear cache 0
[34m[1mFold 3 | Train Score = 0.85710 (0.9506, 0.7636)
[32m[1mFold 3 | Val Score = 0.79440 (0.9244, 0.6644)[0m
Clear cache 0
[34m[1mFold 4 | Train Score = 0.85682 (0.9505, 0.7632)
[32m[1mFold 4 | Val Score = 0.79502 (0.9245, 0.6656)[0m
Clear cache 0


[32m[I 2022-08-20 08:36:29,594][0m Trial 0 finished with value: 0.7938155963625977 and parameters: {'colsample_bylevel': 0.04102624715853323, 'depth': 11, 'random_strength': 1.2633436553615531, 'class_weights': [1, 2], 'n_estimators': 1000, 'l2_leaf_reg': 7.532364531307307, 'min_data_in_leaf': 3072, 'learning_rate': 0.03, 'max_bin': 255}. Best is trial 0 with value: 0.7938155963625977.[0m


[34m[1mFold 5 | Train Score = 0.85823 (0.9511, 0.7653)
[32m[1mFold 5 | Val Score = 0.79612 (0.9253, 0.6669)[0m
Clear cache 0
{'objective': 'Logloss', 'colsample_bylevel': 0.05402675785295788, 'depth': 11, 'random_strength': 1.219098359987579, 'class_weights': [1, 1.25], 'n_estimators': 750, 'l2_leaf_reg': 4.280904816220738, 'min_data_in_leaf': 3072, 'learning_rate': 0.03, 'max_bin': 63, 'used_ram_limit': '16gb', 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise'}
[34m[1mFold 1 | Train Score = 0.85091 (0.9461, 0.7557)
[32m[1mFold 1 | Val Score = 0.79088 (0.9211, 0.6606)[0m
Clear cache 0
[34m[1mFold 2 | Train Score = 0.85169 (0.9459, 0.7575)
[32m[1mFold 2 | Val Score = 0.79087 (0.9226, 0.6592)[0m
Clear cache 0
[34m[1mFold 3 | Train Score = 0.85210 (0.9462, 0.7580)
[32m[1mFold 3 | Val Score = 0.79501 (0.9240, 0.6660)[0m
Clear cache 0
[34m[1mFold 4 | Train Score = 0.85088 (0.9458, 0.7560)
[32m[1mFold 4 | Val Score = 0.79431 (0.9237, 0.6649)[0m
Clear cache 0


[32m[I 2022-08-20 09:22:46,751][0m Trial 1 finished with value: 0.793635049670822 and parameters: {'colsample_bylevel': 0.05402675785295788, 'depth': 11, 'random_strength': 1.219098359987579, 'class_weights': [1, 1.25], 'n_estimators': 750, 'l2_leaf_reg': 4.280904816220738, 'min_data_in_leaf': 3072, 'learning_rate': 0.03, 'max_bin': 63}. Best is trial 0 with value: 0.7938155963625977.[0m


[34m[1mFold 5 | Train Score = 0.85071 (0.9457, 0.7557)
[32m[1mFold 5 | Val Score = 0.79711 (0.9250, 0.6692)[0m
Clear cache 0
{'objective': 'Logloss', 'colsample_bylevel': 0.034570851765460854, 'depth': 12, 'random_strength': 1.3064973233129076, 'class_weights': [1, 1.25], 'n_estimators': 1000, 'l2_leaf_reg': 5.216846250136881, 'min_data_in_leaf': 1024, 'learning_rate': 0.03, 'max_bin': 127, 'used_ram_limit': '16gb', 'bootstrap_type': 'MVS', 'grow_policy': 'Depthwise'}
[34m[1mFold 1 | Train Score = 0.91794 (0.9688, 0.8670)
[32m[1mFold 1 | Val Score = 0.79359 (0.9220, 0.6652)[0m
Clear cache 0
[34m[1mFold 2 | Train Score = 0.91769 (0.9691, 0.8663)
[32m[1mFold 2 | Val Score = 0.79064 (0.9233, 0.6580)[0m
Clear cache 0



KeyboardInterrupt



In [121]:
df = study.trials_dataframe()

In [129]:
df["params_class_weights"] = df["params_class_weights"].astype(str)

In [130]:
df.columns

Index(['number', 'value', 'datetime_start', 'datetime_complete', 'duration',
       'params_bootstrap_type', 'params_class_weights',
       'params_colsample_bylevel', 'params_depth', 'params_l2_leaf_reg',
       'params_learning_rate', 'params_max_bin', 'params_min_data_in_leaf',
       'params_n_estimators', 'params_objective', 'params_random_strength',
       'params_subsample', 'state'],
      dtype='object')

In [145]:
df.groupby("params_subsample")["value"].mean()

params_subsample
0.600639    0.790795
0.769839    0.787349
0.795663    0.791467
Name: value, dtype: float64

In [113]:
study.best_trial

FrozenTrial(number=1, values=[0.7933261197848103], datetime_start=datetime.datetime(2022, 8, 19, 23, 9, 1, 229942), datetime_complete=datetime.datetime(2022, 8, 19, 23, 43, 0, 470587), params={'objective': 'Logloss', 'colsample_bylevel': 0.048626854325478676, 'depth': 11, 'bootstrap_type': 'MVS', 'random_strength': 1.2897439703972426, 'class_weights': [1, 1.5], 'n_estimators': 500, 'l2_leaf_reg': 4.176816737398486, 'min_data_in_leaf': 2048, 'learning_rate': 0.05, 'max_bin': 63}, distributions={'objective': CategoricalDistribution(choices=('Logloss',)), 'colsample_bylevel': LogUniformDistribution(high=0.1, low=0.03), 'depth': IntUniformDistribution(high=12, low=4, step=1), 'bootstrap_type': CategoricalDistribution(choices=('Bernoulli', 'MVS')), 'random_strength': UniformDistribution(high=2.0, low=1.0), 'class_weights': CategoricalDistribution(choices=([1, 1.25], [1, 1.5], [1, 2])), 'n_estimators': CategoricalDistribution(choices=(500, 750, 1000)), 'l2_leaf_reg': LogUniformDistribution(h

In [117]:
# joblib.dump(study, f"../catboost_optuna_study.pkl")

In [62]:
study_df = study.trials_dataframe()

In [11]:
# for col in get_cols(study_df, "params"):
#     sns.scatterplot(data=study_df, x=study_df[col], y=study_df["value"])
#     plt.show()

### Train Catboost

In [77]:
param = {
    "objective": "Logloss",
    "colsample_bylevel": 0.05,
    "depth": 6,
    "boosting_type": "Ordered",
    "bootstrap_type": "MVS",
    "n_estimators": 1000,
    "l2_leaf_reg": 3,
    "min_data_in_leaf": 1024,
    "learning_rate": 0.05,
    "max_bin": 127,
    "used_ram_limit": "16gb",
}

In [78]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1020)

In [79]:
train_score_list, val_score_list = [], []
for fold, (idx_tr, idx_va) in zip(range(1, 5+1), kf.split(train_agg, target)):
    fold = str(fold)
    X_train, y_train = train_agg.iloc[idx_tr], target[idx_tr]
    X_val, y_val = train_agg.iloc[idx_va], target[idx_va]
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model = cb.CatBoostClassifier(**param)
        model.fit(
            X_train, 
            y_train, 
            eval_set=[(X_val, y_val)], 
            verbose=100, 
            cat_features=cat_features,
            early_stopping_rounds=100
        )
    y_train_pred = model.predict(X_train, prediction_type='RawFormulaVal')
    train_score, train_g, train_t4 = amex_metric(y_train, y_train_pred)
    train_data, X_train, y_train = None, None, None
    y_val_pred = model.predict(X_val, prediction_type='RawFormulaVal')
    val_score, val_g, val_t4 = amex_metric(y_val, y_val_pred)
    valid_data, X_val, y_val = None, None, None
    train_score_list.append(train_score)
    val_score_list.append(val_score)
    print(f"{Fore.BLUE}{Style.BRIGHT}Fold {fold} | Train Score = {train_score:.5f} ({train_g:.4f}, {train_t4:.4f})")
    print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | Val Score = {val_score:.5f} ({val_g:.4f}, {val_t4:.4f}){Style.RESET_ALL}")
    print(f"Clear cache {gc.collect()}")

0:	learn: 0.6434646	test: 0.6435511	best: 0.6435511 (0)	total: 231ms	remaining: 3m 50s
100:	learn: 0.2661354	test: 0.2709502	best: 0.2709502 (100)	total: 15s	remaining: 2m 13s
200:	learn: 0.2598474	test: 0.2656707	best: 0.2656707 (200)	total: 29.7s	remaining: 1m 58s
300:	learn: 0.2562077	test: 0.2629458	best: 0.2629458 (300)	total: 44.2s	remaining: 1m 42s
400:	learn: 0.2535508	test: 0.2614296	best: 0.2614296 (400)	total: 59s	remaining: 1m 28s
500:	learn: 0.2514180	test: 0.2604717	best: 0.2604717 (500)	total: 1m 13s	remaining: 1m 13s
600:	learn: 0.2495896	test: 0.2598555	best: 0.2598555 (600)	total: 1m 28s	remaining: 58.7s
700:	learn: 0.2479837	test: 0.2593157	best: 0.2593157 (700)	total: 1m 43s	remaining: 44s
800:	learn: 0.2464774	test: 0.2588889	best: 0.2588889 (800)	total: 1m 58s	remaining: 29.3s
900:	learn: 0.2450839	test: 0.2586632	best: 0.2586565 (897)	total: 2m 13s	remaining: 14.6s
999:	learn: 0.2438038	test: 0.2584054	best: 0.2584054 (999)	total: 2m 27s	remaining: 0us

bestTest 

In [64]:
# experiment_dict = {}

In [80]:
# experiment_dict["fill_negative_127"] = {"train_scores": train_score_list, "val_scores": val_score_list}

In [49]:
np.mean(experiment_dict["no_impute"]["train_scores"]), np.mean(experiment_dict["no_impute"]["val_scores"])

(0.7618067142207205, 0.7390578755925745)

In [50]:
np.mean(experiment_dict["fill_zero"]["train_scores"]), np.mean(experiment_dict["fill_zero"]["val_scores"])

(0.7618417595776765, 0.7386932469726997)

In [66]:
np.mean(experiment_dict["fill_negative_one"]["train_scores"]), np.mean(experiment_dict["fill_negative_one"]["val_scores"])

(0.7618861119180279, 0.7385532351690534)

In [81]:
np.mean(experiment_dict["fill_negative_127"]["train_scores"]), np.mean(experiment_dict["fill_negative_127"]["val_scores"])

(0.7618463586373094, 0.7391923808237963)