In [297]:
import xgboost as xgb
import catboost
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from bayes_opt import BayesianOptimization
import numpy as np

In [298]:
# Location of the preprocessed competition data; `route` is the shared
# filename prefix of the train/test CSV files.
path = "/Users/taehoon/Documents/dacon_movies/data/data"
route = path + "/preprocessed_data/preprocessed_"

# Each CSV carries a leftover "Unnamed: 0" column, which is dropped on load.
train = pd.read_csv(route + "train.csv").drop(columns="Unnamed: 0")
test = pd.read_csv(route + "test.csv").drop(columns="Unnamed: 0")


In [299]:
def label_encoding(train, test):
    """Integer-encode every object-dtype column of ``train`` (and ``test``).

    For each column of ``train`` whose dtype is ``object``, a vocabulary is
    built from the values of that column in ``train`` followed by ``test``
    (first-seen order), and every value is replaced by its position in that
    vocabulary. Columns of any other dtype are left untouched.

    Both frames are modified in place and are also returned.

    A column that is object-typed in ``train`` but absent from ``test``
    (for example a categorical target) is encoded in ``train`` only instead
    of raising ``KeyError``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose dtypes decide which columns get encoded.
    test : pandas.DataFrame
        Second frame encoded with the same vocabularies.

    Returns
    -------
    tuple
        ``(train, test, label_encoder)`` where ``label_encoder`` maps each
        encoded column name to its ``{original value: integer code}`` dict.
    """
    label_encoder = {}
    # `train.dtypes` is a snapshot, so reassigning columns inside the loop
    # does not affect which columns are visited.
    for column, dtype in train.dtypes.items():
        if not pd.api.types.is_object_dtype(dtype):
            continue
        # Only frames that actually have the column take part; this keeps the
        # vocabulary free of filler NaNs and avoids a KeyError on `test`.
        frames = [frame for frame in (train, test) if column in frame.columns]
        values = pd.concat([frame[column] for frame in frames], ignore_index=True)
        val2idx = {value: idx for idx, value in enumerate(values.unique())}
        label_encoder[column] = val2idx
        for frame in frames:
            frame[column] = frame[column].map(val2idx)
    return train, test, label_encoder
# Encode the object-dtype columns of both frames; the returned lookup tables
# are bound to the throwaway name `_` because they are not used afterwards.
train, test, _ = label_encoding(train, test)

In [300]:
# Hold out part of the labelled data for scoring candidate hyper-parameters.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    train.drop("box_off_num", axis=1),
    train["box_off_num"],
    random_state=42,
)

def cbt_reg(n_estimators, depth, learning_rate, subsample, l2_leaf_reg):
    """Objective function: fit a CatBoost regressor and score it on the hold-out split.

    The model is trained on the module-level ``train_x`` / ``train_y`` and
    evaluated on ``test_x`` / ``test_y``.

    Parameters
    ----------
    n_estimators, depth : float
        Truncated with ``int()`` before being passed to CatBoost, since the
        optimizer proposes real-valued points.
    learning_rate, subsample, l2_leaf_reg : float
        Passed to ``CatBoostRegressor`` unchanged.

    Returns
    -------
    float
        ``1 - RMSE`` on the hold-out split, so that a larger value is better.
        The RMSE itself can be recovered as ``1 - cbt_reg(...)``.
    """
    params = {
        "n_estimators": int(n_estimators),
        "depth": int(depth),
        "learning_rate": learning_rate,
        "subsample": subsample,
        "l2_leaf_reg": l2_leaf_reg,
    }
    cbtr_model = catboost.CatBoostRegressor(
        **params,
        bootstrap_type='Bernoulli',
        eval_metric='RMSE',
        od_type='Iter',
        allow_writing_files=False,
    )
    cbtr_model.fit(train_x, train_y, silent=True)
    y_pred = cbtr_model.predict(test_x)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the square root of the MSE gives the same
    # RMSE on every version.
    rmse = np.sqrt(mean_squared_error(test_y, y_pred))
    return 1 - rmse

In [301]:
# Search space handed to the optimizer: (lower, upper) bound per parameter.
pbounds = {
    "n_estimators": (150, 400),
    "depth": (2, 7),
    "learning_rate": (0.01, 0.2),
    "subsample": (0.6, 1.0),
    "l2_leaf_reg": (0, 10),
}

bo = BayesianOptimization(
    f=cbt_reg,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=2, n_iter=50, acq='ei', xi=0.01)

# Best point found so far; cbt_reg returns 1 - RMSE, so subtracting from 1
# again displays the RMSE of a model refit with those parameters.
high_score = bo.max
1 - cbt_reg(**high_score["params"])

|   iter    |  target   |   depth   | l2_lea... | learni... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-1.273e+0[0m | [0m3.873    [0m | [0m9.507    [0m | [0m0.1491   [0m | [0m299.7    [0m | [0m0.6624   [0m |
| [0m2        [0m | [0m-1.293e+0[0m | [0m2.78     [0m | [0m0.5808   [0m | [0m0.1746   [0m | [0m300.3    [0m | [0m0.8832   [0m |
| [95m3        [0m | [95m-1.15e+06[0m | [95m3.936    [0m | [95m6.911    [0m | [95m0.05903  [0m | [95m358.0    [0m | [95m0.8439   [0m |
| [95m4        [0m | [95m-1.026e+0[0m | [95m4.532    [0m | [95m7.151    [0m | [95m0.02087  [0m | [95m358.0    [0m | [95m0.6805   [0m |
| [95m5        [0m | [95m-9.961e+0[0m | [95m4.796    [0m | [95m7.258    [0m | [95m0.01     [0m | [95m358.1    [0m | [95m0.6079   [0m |
| [95m6        [0m | [95m-9.934e+0[0m | [95m5.447    [0m | [95m7.086    [0m | [95m0.01  