In [1]:
import os

import joblib
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier

  from .autonotebook import tqdm as notebook_tqdm


#### **Getting training and validation data**

In [2]:
# Read the training and validation splits from their CSV files.
df_train, df_valid = (
    pd.read_csv(csv_path)
    for csv_path in ('../databases/training.csv', '../databases/validation.csv')
)

In [3]:
# Split each frame into features (everything except 'IND_BOM_1_1')
# and the target column 'IND_BOM_1_1'.
X_train = df_train.drop(columns=['IND_BOM_1_1'])
y_train = df_train['IND_BOM_1_1']

X_val = df_valid.drop(columns=['IND_BOM_1_1'])
y_val = df_valid['IND_BOM_1_1']

In [4]:
# Convert both feature tables to plain NumPy arrays.
X_train, X_val = (np.array(features) for features in (X_train, X_val))

#### **Hyperparameter selection**

In [5]:
def objective(trial, seed=42):
    """Optuna objective: validation log loss of a decision tree.

    Samples a set of ``DecisionTreeClassifier`` hyperparameters from
    ``trial``, fits the tree on the notebook-level ``X_train`` / ``y_train``
    and scores it on ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to suggest ``max_depth``, ``min_samples_split``,
        ``min_samples_leaf`` and ``splitter``.
    seed : int, default=42
        ``random_state`` of the tree. Fixing it makes a given set of
        hyperparameters always produce the same score, so differences between
        trials come from the hyperparameters and not from tree randomness.

    Returns
    -------
    float
        Log loss on the validation set (lower is better).
    """
    # Suggest in a fixed order so the sampler sees the same parameter sequence
    # on every trial.
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
    }

    model = DecisionTreeClassifier(criterion='gini', random_state=seed, **params)
    model.fit(X_train, y_train)

    y_probas = model.predict_proba(X_val)

    # predict_proba columns follow model.classes_; pass them explicitly so the
    # loss does not depend on which labels happen to occur in y_val.
    return log_loss(y_val, y_probas, labels=model.classes_)

In [6]:
n_trials = 100

# Seed the sampler explicitly so the sequence of suggested hyperparameters is
# reproducible across kernel restarts (TPE is Optuna's default sampler, so only
# the seeding changes).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=n_trials)

[32m[I 2023-04-29 16:15:13,389][0m A new study created in memory with name: no-name-eadea24b-a2d5-4690-ae19-1b371325c2b7[0m
[32m[I 2023-04-29 16:15:16,603][0m Trial 0 finished with value: 4.750687718149069 and parameters: {'max_depth': 48, 'min_samples_split': 16, 'min_samples_leaf': 6, 'splitter': 'random'}. Best is trial 0 with value: 4.750687718149069.[0m
[32m[I 2023-04-29 16:15:19,816][0m Trial 1 finished with value: 4.767634331371628 and parameters: {'max_depth': 44, 'min_samples_split': 4, 'min_samples_leaf': 7, 'splitter': 'random'}. Best is trial 0 with value: 4.750687718149069.[0m
[32m[I 2023-04-29 16:15:32,715][0m Trial 2 finished with value: 5.93686613711829 and parameters: {'max_depth': 45, 'min_samples_split': 3, 'min_samples_leaf': 8, 'splitter': 'best'}. Best is trial 0 with value: 4.750687718149069.[0m
[32m[I 2023-04-29 16:15:40,905][0m Trial 3 finished with value: 1.2785817960737238 and parameters: {'max_depth': 12, 'min_samples_split': 17, 'min_samples_l

#### **Saving study**

In [7]:
save_path = './optuna_studies/decision_tree_study.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)

joblib.dump(study, save_path)

['./optuna_studies/decision_tree_study.pkl']