In [1]:
import os

import joblib
import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

  from .autonotebook import tqdm as notebook_tqdm


#### **Loading training and validation data**

In [2]:
# Read the pre-split training and validation tables from disk.
df_train, df_valid = (
    pd.read_csv('../databases/training.csv'),
    pd.read_csv('../databases/validation.csv'),
)

In [3]:
# Separate the target column ('IND_BOM_1_1') from the remaining feature
# columns, once for the training split and once for the validation split.
y_train = df_train['IND_BOM_1_1']
X_train = df_train.drop(columns=['IND_BOM_1_1'])

y_val = df_valid['IND_BOM_1_1']
X_val = df_valid.drop(columns=['IND_BOM_1_1'])

In [4]:
# Turn both feature tables into plain NumPy arrays; the targets are left as-is.
X_train, X_val = np.array(X_train), np.array(X_val)

#### **Hyperparameter selection**

In [5]:
# Search space explored by the Optuna objective:
#   * tuples are inclusive (low, high) bounds for integer hyperparameters,
#   * lists are the allowed choices for categorical hyperparameters.
params = dict(
    max_depth=(2, 50),
    min_samples_split=(2, 20),
    min_samples_leaf=(1, 10),
    n_estimators=(50, 200),
    criterion=['gini', 'entropy'],
    max_features=['sqrt', 'log2'],
)

def objective(trial, seed=42):
    """Score one Optuna trial: fit a random forest and return its validation log loss.

    Hyperparameters are sampled through ``trial`` from the ranges and choices
    stored in the module-level ``params`` dict. The forest is fitted on the
    module-level ``X_train`` / ``y_train`` and evaluated on ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handle used to draw the hyperparameter values.
    seed : int, default=42
        Forwarded to the forest as ``random_state`` so that the same
        hyperparameters always produce the same model, making trial values
        comparable with each other and repeatable across runs.

    Returns
    -------
    float
        Log loss of the predicted class probabilities on the validation set.
    """
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', *params['n_estimators']),
        max_depth=trial.suggest_int('max_depth', *params['max_depth']),
        min_samples_split=trial.suggest_int('min_samples_split', *params['min_samples_split']),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', *params['min_samples_leaf']),
        criterion=trial.suggest_categorical('criterion', params['criterion']),
        max_features=trial.suggest_categorical('max_features', params['max_features']),
        verbose=1,
        n_jobs=-1,
        random_state=seed,
    )

    model.fit(X_train, y_train)

    y_probas = model.predict_proba(X_val)

    # The probability columns follow model.classes_. Passing them explicitly
    # keeps log_loss from inferring the label set from y_val alone, which
    # fails when the validation split lacks a class seen during training.
    return log_loss(y_val, y_probas, labels=model.classes_)

In [6]:
n_trials = 50

# TPESampler is Optuna's default sampler; building it explicitly with a fixed
# seed keeps its hyperparameter proposals from varying between runs that see
# the same trial history.
sampler = optuna.samplers.TPESampler(seed=42)

# Lower objective values are better, so the study minimizes.
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=n_trials)

[32m[I 2023-04-29 16:56:37,939][0m A new study created in memory with name: no-name-b646f474-95e2-48e2-a29b-0e7b502cd02d[0m
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:   13.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 112 out of 112 | elapsed:    1.0s finished
[32m[I 2023-04-29 16:56:53,138][0m Trial 0 finished with value: 0.6185293434785861 and parameters: {'n_estimators': 112, 'max_depth': 25, 'min_samples_split': 10, 'min_samples_leaf': 10, 'criterion': 'entropy', 'max_features': 'log2'}. Best is trial 0 with value: 0.6185293434785861.[0m
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.3s
[Parallel(n_job

#### **Saving study**

In [7]:
save_path = './optuna_studies/random_forest_study.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Persist the whole study object; the returned list of written filenames is
# shown as the cell output.
joblib.dump(study, save_path)

['./optuna_studies/random_forest.pkl']