In [19]:
import datetime
from sklearn import metrics, model_selection
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [None]:
try:
    from catboost import Pool, CatBoostRegressor, cv
except:
    !pip install catboost 
    from catboost import Pool, CatBoostRegressor, cv

In [None]:
# Import the project-local CustomPipeline helpers. When the module is not
# importable from the current search path, retry after prepending the
# hard-coded '/content/sample_data' directory to sys.path.
try:
    from CustomPipeline import *
except ImportError:
    # Catch only import failures so that interrupts and unrelated errors
    # are not silently turned into a retry.
    import sys
    sys.path.insert(0, '/content/sample_data')
    from CustomPipeline import *
    print("ok")

In [20]:
# Load the training table from the working directory, falling back to the
# ./sample_data copy when the file is not found there.
#
# `sep` and `delimiter` are aliases in pandas.read_csv; passing both (as the
# previous version did with delimiter="," and sep='.') raises ValueError in
# current pandas, so only the comma separator is passed.
# NOTE(review): the old '.' was presumably meant as the decimal mark, which is
# already the read_csv default -- confirm.
try:
    train = pd.read_csv("./train_anomaly.csv", sep=",")
except FileNotFoundError:
    # Only a missing file should trigger the fallback; parsing errors must surface.
    train = pd.read_csv("./sample_data/train_anomaly.csv", sep=",")

In [None]:
# Split the loaded frame: `y` is the "target" column, `X` is every other
# column except "id".
y = train["target"]
X = train.drop(columns=["target", "id"])

In [23]:
# Partition the feature columns by dtype: int/float columns vs. object columns.
num_train = X.select_dtypes(include=[int, float])
cat_train = X.select_dtypes(include=object)

# Column labels of each group.
num = num_train.columns.tolist()
cat = cat_train.columns.tolist()

# Integer positions of the object-typed columns within X.
idx = list(map(X.columns.get_loc, cat))

In [28]:
# CatBoost dataset built from the features and target; `idx` holds the
# positions of the categorical (object-typed) columns.
train_pool = Pool(data=X, label=y, cat_features=idx)

def objective(trial):
    """Optuna objective: cross-validated RMSE of CatBoost for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The last row of the ``test-RMSE-mean`` column produced by a 5-fold
        CatBoost ``cv`` run on ``train_pool`` with the sampled parameters.
    """
    # An earlier, disabled search space (colsample_bylevel, boosting_type,
    # bootstrap_type with conditional bagging_temperature/subsample) was
    # removed from this function; see version control if it is needed again.
    param = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 4, 10),
        "random_strength": trial.suggest_int("random_strength", 0, 100),
        # suggest_float(..., log=True) is the replacement for the deprecated
        # suggest_loguniform and samples from the same log-uniform range.
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.01, 100.00, log=True
        ),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "loss_function": "RMSE",
    }

    rmse = cv(train_pool, param, fold_count=5, plot=False, logging_level="Silent")

    # One row per boosting iteration; the final row is the score that is minimized.
    return rmse["test-RMSE-mean"].iloc[-1]

In [None]:
# Create an in-memory study that minimizes the objective, then run 20 trials
# with n_jobs=-1.
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(),
    direction="minimize",
)
study.optimize(
    objective,
    n_jobs=-1,
    n_trials=20,
)

[32m[I 2021-08-20 19:11:00,419][0m A new study created in memory with name: no-name-11035aae-3115-43a8-aa50-6b9fa5496964[0m


In [None]:
# Hyperparameters of the best trial found by the study; shown as the cell output.
best_params = study.best_params
best_params

# save configs

In [None]:
import os

import yaml

# Folder that receives the tuned-parameter file. exist_ok=True makes the
# creation idempotent and avoids the separate exists-then-create check.
directory = './Configs'
os.makedirs(directory, exist_ok=True)

# Build the output path from `directory` so the two cannot drift apart.
with open(os.path.join(directory, 'params_model_catboost.yaml'), 'w') as outfile:
    yaml.dump(best_params, outfile, default_flow_style=False)