# Optuna with SKLearn and Cross-Validation

- Notice that the test data is not used during hyperparameter optimization. This is crucial to avoid data leakage and ensure that the model's performance is evaluated on unseen data.

In [None]:
import pandas as pd
import optuna

In [39]:
from sklearn.datasets import load_diabetes

# Load the diabetes regression dataset that ships with scikit-learn.
dataset = load_diabetes()
# Feature matrix and regression target, kept as separate arrays.
_X, _y = dataset.data, dataset.target
# Sanity-check the dimensions before splitting.
print(_X.shape, _y.shape)

(442, 10) (442,)


In [40]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; the fixed seed keeps the
# split identical from run to run.
_X_train, _X_test, _y_train, _y_test = train_test_split(
    _X,
    _y,
    random_state=42,
    test_size=0.3,
)

In [41]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to the test split,
# so nothing about the test data influences the preprocessing.
scX = StandardScaler()
scX.fit(_X_train)
_X_train = scX.transform(_X_train)
_X_test = scX.transform(_X_test)

# Same treatment for the target. StandardScaler expects a 2-D array, so
# the 1-D target is reshaped to a single column and flattened back to
# 1-D afterwards.
scY = StandardScaler()
scY.fit(_y_train.reshape(-1, 1))
_y_train = scY.transform(_y_train.reshape(-1, 1)).ravel()
_y_test = scY.transform(_y_test.reshape(-1, 1)).ravel()

In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


def objective(trial):
    """Score one random-forest configuration proposed by Optuna.

    Samples ``n_estimators`` (10-200) and ``max_depth`` (2-32, sampled on
    a log scale) from ``trial``, then evaluates a ``RandomForestRegressor``
    with 3-fold cross-validation on the training split only
    (``_X_train`` / ``_y_train``).

    Args:
        trial: The Optuna trial used to draw the hyperparameters.

    Returns:
        The mean cross-validated MSE across the folds (lower is better).
    """
    # Suggestion order is kept (n_estimators, then max_depth) so the
    # sampler sees the parameters in the same sequence.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 200),
        "max_depth": trial.suggest_int("max_depth", 2, 32, log=True),
    }
    model = RandomForestRegressor(random_state=42, **params)
    neg_mse_per_fold = cross_val_score(
        model, _X_train, _y_train, scoring="neg_mean_squared_error", cv=3
    )
    # The scorer reports negated MSE, so flip the sign to get a quantity
    # to minimize.
    return -neg_mse_per_fold.mean()

In [45]:
# Search for the hyperparameters that minimize the cross-validated MSE
# returned by `objective`. The direction is spelled out (it is also the
# default) and the sampler is seeded so that rerunning the notebook
# proposes the same sequence of trials; the split and the forests are
# already pinned with random_state=42.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-12-24 11:52:25,041] A new study created in memory with name: no-name-d41e380a-f7ce-434f-8955-da93e6a44754
[I 2025-12-24 11:52:25,513] Trial 0 finished with value: 0.5726719347723043 and parameters: {'n_estimators': 185, 'max_depth': 5}. Best is trial 0 with value: 0.5726719347723043.
[I 2025-12-24 11:52:25,804] Trial 1 finished with value: 0.5812527284789675 and parameters: {'n_estimators': 161, 'max_depth': 2}. Best is trial 0 with value: 0.5726719347723043.
[I 2025-12-24 11:52:26,332] Trial 2 finished with value: 0.5841166695412497 and parameters: {'n_estimators': 176, 'max_depth': 8}. Best is trial 0 with value: 0.5726719347723043.
[I 2025-12-24 11:52:26,574] Trial 3 finished with value: 0.5820269339026526 and parameters: {'n_estimators': 136, 'max_depth': 2}. Best is trial 0 with value: 0.5726719347723043.
[I 2025-12-24 11:52:26,862] Trial 4 finished with value: 0.5801491337668284 and parameters: {'n_estimators': 88, 'max_depth': 24}. Best is trial 0 with value: 0.572671934

In [46]:
# Pull the winning configuration and its cross-validated score out of
# the finished study; `best_params` is reused when refitting the model.
best_params = study.best_params
best_mse = study.best_value
print("Best parameters:", best_params)
print("Best MSE:", best_mse)


Best parameters: {'n_estimators': 199, 'max_depth': 3}
Best MSE: 0.570794829009074


In [47]:
# Calculate Test Result
from sklearn.metrics import mean_squared_error

# Refit a forest with the tuned hyperparameters on the whole training
# split, then score it once on the held-out test split.
forrest = RandomForestRegressor(random_state=42, **best_params).fit(
    _X_train, _y_train
)
y_pred = forrest.predict(_X_test)
# Both arrays are in the standardized target scale produced by scY.
mse = mean_squared_error(y_true=_y_test, y_pred=y_pred)
print("Test MSE:", mse)

Test MSE: 0.4436299555652864
