In [16]:
import datetime
from sklearn import metrics, model_selection
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import lightgbm as lgb

In [2]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [37]:
try:
    train = pd.read_csv("./train_anomaly.csv", delimiter=",", sep='.')
except:
    train = pd.read_csv("./sample_data/train_for_crossval.csv", delimiter=",", sep='.')

In [38]:
train, val = train_test_split(train, test_size=0.99)

In [40]:
X = train.drop(["target", "id"], axis=1)
y = train["target"]

In [41]:
num_train = train.select_dtypes([int, float])
cat_train = train.select_dtypes(object)

num = list(num_train.drop(['target'],axis=1))
cat = list(cat_train)

idx = [X.columns.get_loc(i) for i in cat]

In [46]:
rmse = make_scorer(mean_squared_error, squared=False)

train_data = lgb.Dataset(X, label=y, categorical_feature=cat)
lgb_eval = lgb.Dataset(val.drop(["target", "id"], axis=1), val["target"], reference=train_data, categorical_feature=idx)

def objective(trial):
    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    
    gbm = lgb.train(lgb_params, train_data)
    y_pred = gbm.predict(lgb_eval, num_iteration=gbm.best_iteration)
    
    result = mean_squared_error(val["target"], y_pred, squared=False)

    return result

In [47]:
study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=10, timeout=600)

[32m[I 2021-08-25 22:08:43,209][0m A new study created in memory with name: no-name-33db4913-c63d-4440-81ec-f96e8e1b8dcf[0m
[33m[W 2021-08-25 22:08:43,217][0m Trial 0 failed because of the following error: ValueError('DataFrame.dtypes for data must be int, float or bool.\nDid not expect the data types in the following fields: cat0, cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, anomaly_col')
Traceback (most recent call last):
  File "C:\Users\User\AppData\Roaming\Python\Python37\site-packages\optuna\_optimize.py", line 211, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-46-2240baac99a4>", line 19, in objective
    gbm = lgb.train(lgb_params, train_data)
  File "C:\ProgramData\Anaconda3\lib\site-packages\lightgbm\engine.py", line 228, in train
    booster = Booster(params=params, train_set=train_set)
  File "C:\ProgramData\Anaconda3\lib\site-packages\lightgbm\basic.py", line 2229, in __init__
    train_set.construct()
  File "C:\ProgramData\Anaconda3\

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: cat0, cat1, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9, anomaly_col

In [16]:
best_params = study.best_trial.params
best_params

{'alpha': 0.09244405634965566, 'n_bins': 63}

# save configs

In [None]:
import yaml
import os


directory = './Configs'
if not os.path.exists(directory):
    os.makedirs(directory)
    
    
with open('./Configs/params_model_lgbm.yaml', 'w') as outfile:
    yaml.dump(best_params, outfile, default_flow_style=False)