In [None]:
# ===============================
# Deep Neural Network from Scratch using Optuna for Hyperparameter Optimization
# Dataset: Iris (Multiclass Classification)
# ===============================

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from functools import partial
import optuna


# =======================================
# 1. Load and Preprocess Data
# =======================================
# Load the Iris dataset; the integer class labels are reshaped into a single
# column (n_samples, 1) before they are one-hot encoded below.
iris = datasets.load_iris()
X = iris.data
y = iris.target.reshape(-1, 1)

# Normalize features for better convergence
# NOTE(review): the scaler is fit on every row here, before the
# train/val/test split is made further down, so validation and test rows
# contribute to the scaling statistics. Consider fitting on the training
# split only.
scaler = RobustScaler()
X = scaler.fit_transform(X)

# One-hot encode labels (dense array output, one column per class)
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y)

def train_val_test_split(X, y, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """Split X and y into stratified train, validation and test subsets.

    The fractions are converted to integer row counts up front. Passing
    ``1 - train_size`` as a float to ``train_test_split`` is fragile:
    ``1 - 0.7`` evaluates to ``0.30000000000000004``, which scikit-learn
    rounds *up* to one extra held-out row (104/23/23 instead of 105/22/23
    for 150 samples).

    Args:
        X: Feature matrix with one row per sample.
        y: Labels aligned with ``X``; also used as the stratification key.
        train_size: Fraction of rows for the training subset.
        val_size: Fraction of rows for the validation subset.
        test_size: Fraction of rows for the test subset.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If the fractions do not sum to 1, or if any subset would
            end up empty for the given number of rows.
    """
    if abs(train_size + val_size + test_size - 1.0) >= 1e-6:
        raise ValueError("Sizes must sum to 1")

    n_samples = len(X)
    n_train = int(round(train_size * n_samples))
    n_val = int(round(val_size * n_samples))
    # The test subset takes whatever is left so the three counts always add up.
    n_test = n_samples - n_train - n_val
    if min(n_train, n_val, n_test) < 1:
        raise ValueError(
            f"Split of {n_samples} samples yields an empty subset "
            f"(train={n_train}, val={n_val}, test={n_test})"
        )

    # First split: train vs temp (val+test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        train_size=n_train,
        test_size=n_val + n_test,
        random_state=random_state,
        stratify=y,
    )

    # Second split: carve the held-out rows into validation and test
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        train_size=n_val,
        test_size=n_test,
        random_state=random_state,
        stratify=y_temp,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Usage:
# Split once at module level; these six arrays are read as globals by the
# Optuna objective and the final retraining code further down.
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y)
# Sanity check: show how many rows ended up in each subset.
print(X_train.shape, X_val.shape, X_test.shape)


# =======================================
# 2. Activations
# =======================================
def sigmoid(x, is_derivative=False):
    """Logistic sigmoid, or its derivative with respect to ``x``.

    Uses ``exp(-|x|)`` so the exponential never overflows, and computes the
    derivative as ``s * (1 - s)``. The naive ``exp(-x) / (1 + exp(-x))**2``
    evaluates to ``inf / inf = nan`` for large negative inputs.

    Args:
        x: Scalar or array of pre-activations.
        is_derivative: When True, return d(sigmoid)/dx evaluated at ``x``.

    Returns:
        Array with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))  # always in (0, 1], so no overflow
    s = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    if is_derivative:
        return s * (1.0 - s)
    return s

def tanh(x, is_derivative=False):
    """Hyperbolic tangent, or its derivative ``1 - tanh(x)^2``.

    Args:
        x: Scalar or array of pre-activations.
        is_derivative: When True, return the derivative evaluated at ``x``.
    """
    activated = np.tanh(x)
    if not is_derivative:
        return activated
    return 1 - activated ** 2

def relu(x, is_derivative=False):
    """Rectified linear unit, or its derivative (1 where ``x > 0``, else 0).

    Args:
        x: Array of pre-activations.
        is_derivative: When True, return the derivative evaluated at ``x``.
    """
    if not is_derivative:
        return np.maximum(0, x)
    is_positive = x > 0
    return is_positive.astype(float)

def leaky_relu(x, is_derivative=False, alpha=0.01):
    """Leaky ReLU, or its derivative.

    The forward pass is written piecewise (``x`` for positive inputs,
    ``alpha * x`` otherwise) rather than as ``max(alpha * x, x)``; the
    ``max`` form is only correct while ``alpha <= 1``.

    Args:
        x: Array of pre-activations.
        is_derivative: When True, return the derivative evaluated at ``x``
            (``alpha`` where ``x < 0``, otherwise 1).
        alpha: Slope applied to the negative part.

    Returns:
        Array with the same shape as ``x``.
    """
    if is_derivative:
        return np.where(x < 0, alpha, 1)
    return np.where(x > 0, x, alpha * x)

def softmax(x):
    """Column-wise softmax: every column of ``x`` is normalized to sum to 1.

    The per-column maximum is subtracted before exponentiating so large
    logits do not overflow.
    """
    column_peak = np.max(x, axis=0, keepdims=True)
    exps = np.exp(x - column_peak)
    column_total = np.sum(exps, axis=0, keepdims=True)
    return exps / column_total


# =======================================
# 3. Loss Functions
# =======================================
def mse_loss(Y, Y_pred):
    """Halved mean squared error.

    Args:
        Y: Targets with one row per sample.
        Y_pred: Predictions in the transposed layout (one column per sample).

    Returns:
        Sum of squared residuals divided by ``2 * number_of_rows_in_Y``.
    """
    n_rows = Y.shape[0]
    residual = Y - Y_pred.T
    return np.sum(residual ** 2) / (2 * n_rows)

def cross_entropy_loss(Y, Y_pred, eps=1e-9):
    """Mean categorical cross-entropy.

    Args:
        Y: One-hot targets with one row per sample.
        Y_pred: Predicted probabilities in the transposed layout
            (one column per sample).
        eps: Small constant added inside the log to avoid ``log(0)``.

    Returns:
        Negative log-likelihood summed over all entries and divided by the
        number of rows in ``Y``.
    """
    n_rows = Y.shape[0]
    log_probs = np.log(Y_pred.T + eps)
    return -np.sum(Y * log_probs) / n_rows


# =======================================
# 4. Initialization
# =======================================
def initialize_parameters(layer_dims):
    """Create weight matrices and zero biases for a fully connected network.

    Reseeds NumPy's global RNG with 42 on every call, so the returned
    weights are identical for identical ``layer_dims``.

    Args:
        layer_dims: Layer widths, input first and output last.

    Returns:
        Dict with ``W{i}`` of shape ``(layer_dims[i], layer_dims[i-1])``
        (standard normal scaled by ``sqrt(1 / fan_in)``) and ``b{i}`` of
        shape ``(layer_dims[i], 1)`` filled with zeros.
    """
    np.random.seed(42)
    params = {}
    width_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(width_pairs, start=1):
        scale = np.sqrt(1. / fan_in)
        params[f"W{layer}"] = np.random.randn(fan_out, fan_in) * scale
        params[f"b{layer}"] = np.zeros((fan_out, 1))
    return params


# =======================================
# 5. Forward
# =======================================
def forward_propagation(X, params, activations, dropout_rates=None, training=True):
    """Run a forward pass through the network.

    Args:
        X: Input batch with one row per sample (transposed internally so
            that columns are samples).
        params: Dict holding ``W{i}`` and ``b{i}`` for every layer.
        activations: Activation name per layer; its length is the layer count.
        dropout_rates: Optional drop probability per hidden layer, indexed
            from the first hidden layer. Layers beyond the end of this
            sequence get no dropout.
        training: Dropout is applied only when this is True.

    Returns:
        Tuple ``(A_out, cache, dropout_masks)``: the last layer's output, a
        dict with ``A0`` plus ``Z{i}``/``A{i}`` for each layer, and a dict
        of the masks used, keyed ``M{i}``.

    Raises:
        ValueError: For an unsupported activation name, or a dropout rate
            outside ``[0, 1)``.
    """
    cache = {"A0": X.T}
    L = len(activations)
    dropout_masks = {}

    for i in range(1, L + 1):
        W, b = params[f"W{i}"], params[f"b{i}"]
        A_prev = cache[f"A{i-1}"]
        Z = np.dot(W, A_prev) + b

        name = activations[i - 1]
        if name == "sigmoid":
            A = sigmoid(Z)
        elif name == "relu":
            A = relu(Z)
        elif name == "tanh":
            A = tanh(Z)
        elif name == "leaky_relu":
            A = leaky_relu(Z)
        elif name == "softmax":
            A = softmax(Z)
        else:
            raise ValueError(f"Unsupported activation: {name}")

        # Inverted dropout on hidden layers only. The output layer (i == L)
        # is never dropped: masking the predicted probabilities would break
        # both the loss and the `A_L - Y` gradient used in backpropagation.
        if training and dropout_rates and i < L and i <= len(dropout_rates):
            rate = dropout_rates[i - 1]
            if not 0.0 <= rate < 1.0:
                raise ValueError(f"Dropout rate must be in [0, 1), got {rate}")
            mask = (np.random.rand(*A.shape) > rate).astype(float)
            # Rescale the kept units so the expected activation is unchanged.
            A = A * mask / (1 - rate)
            dropout_masks[f"M{i}"] = mask

        cache[f"Z{i}"], cache[f"A{i}"] = Z, A

    return cache[f"A{L}"], cache, dropout_masks


# =======================================
# 6. Backpropagation
# =======================================
def backward_propagation(Y, params, cache, activations, dropout_masks=None, dropout_rates=None, weight_decay=0.0):
    """Compute clipped gradients for every layer.

    The output-layer error is taken as ``A_L - Y``, i.e. the gradient of
    cross-entropy through a softmax output.

    Args:
        Y: Targets with one row per sample.
        params: Dict holding ``W{i}`` for every layer.
        cache: Forward-pass cache with ``A0`` and ``Z{i}``/``A{i}`` per layer.
        activations: Activation name per layer.
        dropout_masks: Optional masks from the forward pass, keyed ``M{i}``.
        dropout_rates: Drop probabilities matching ``dropout_masks``.
        weight_decay: L2 coefficient added to the weight gradients.

    Returns:
        Dict with ``dW{i}`` and ``db{i}`` per layer, each clipped to [-1, 1].

    Raises:
        ValueError: If a hidden layer uses an activation with no derivative
            implemented here (previously the stale ``dZ`` of the layer above
            was silently reused).
    """
    grads = {}
    m = Y.shape[0]
    L = len(activations)
    dZ = cache[f"A{L}"] - Y.T

    for i in range(L, 0, -1):
        A_prev = cache[f"A{i-1}"]
        W = params[f"W{i}"]

        # Clip each gradient once as it is produced.
        dW = (1 / m) * np.dot(dZ, A_prev.T) + weight_decay * W
        db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
        grads[f"dW{i}"] = np.clip(dW, -1, 1)
        grads[f"db{i}"] = np.clip(db, -1, 1)

        if i == 1:
            continue

        Z_prev = cache[f"Z{i-1}"]
        dA_prev = np.dot(W.T, dZ)

        # Mirror the forward pass's inverted dropout on the previous layer.
        mask_key = f"M{i-1}"
        if dropout_masks and mask_key in dropout_masks:
            dA_prev *= dropout_masks[mask_key]
            dA_prev /= (1 - dropout_rates[i - 2])

        hidden_act = activations[i - 2]
        if hidden_act == "sigmoid":
            dZ = dA_prev * sigmoid(Z_prev, is_derivative=True)
        elif hidden_act == "relu":
            dZ = dA_prev * relu(Z_prev, is_derivative=True)
        elif hidden_act == "tanh":
            dZ = dA_prev * tanh(Z_prev, is_derivative=True)
        elif hidden_act == "leaky_relu":
            dZ = dA_prev * leaky_relu(Z_prev, is_derivative=True)
        else:
            raise ValueError(f"Unsupported hidden activation: {hidden_act}")

    return grads


# =======================================
# 7. Optimizers
# =======================================
'''
    Stochastic Gradient Descent (SGD):
    θ^(t+1) <- θ^t - η∇L(y, ŷ)

    Momentum:
    v^(t+1) <- βv^t + (1-β)∇L(y, ŷ)^t
    θ^(t+1) <- θ^t - ηv^(t+1)
'''
def update_parameters(params, grads, lr, optimizer_type, velocity=None, beta=0.9):
    """Apply one optimizer step to ``params`` in place.

    Args:
        params: Dict with ``W{i}``/``b{i}`` arrays; modified in place.
        grads: Dict with the matching ``dW{i}``/``db{i}`` gradients.
        lr: Learning rate.
        optimizer_type: ``"sgd"`` or ``"momentum"``.
        velocity: Momentum state from the previous step; created as zeros
            when None and the optimizer is ``"momentum"``.
        beta: Momentum coefficient (exponential moving average of gradients).

    Returns:
        Tuple ``(params, velocity)``. ``velocity`` is returned unchanged for
        plain SGD.

    Raises:
        ValueError: If ``optimizer_type`` is not recognized. Previously an
            unknown name skipped the update without any indication.
    """
    if optimizer_type not in ("sgd", "momentum"):
        raise ValueError(f"Unsupported optimizer: {optimizer_type}")

    n_layers = len(params) // 2  # params holds one W and one b per layer

    if optimizer_type == "sgd":
        for i in range(1, n_layers + 1):
            params[f"W{i}"] -= lr * grads[f"dW{i}"]
            params[f"b{i}"] -= lr * grads[f"db{i}"]
        return params, velocity

    if velocity is None:
        velocity = {k: np.zeros_like(v) for k, v in params.items()}

    for i in range(1, n_layers + 1):
        for name in (f"W{i}", f"b{i}"):
            velocity[name] = beta * velocity[name] + (1 - beta) * grads[f"d{name}"]
            params[name] -= lr * velocity[name]

    return params, velocity


# =======================================
# 8. Training Loop
# =======================================
def train_neural_network(
    X_train,
    Y_train,
    X_val,
    Y_val,
    hidden_layers=[10],
    lr=0.01,
    epochs=500,
    batch_size=16,
    activations=None,
    loss_function="cross_entropy",
    optimizer_type="momentum",
    dropout_rates=[0.1],
    weight_decay=1e-4,
    early_stopping=True,
    patience=100,
    print_every=100
):
    """Train a fully connected network with mini-batch gradient descent.

    Args:
        X_train, Y_train: Training features and one-hot targets (rows are
            samples).
        X_val, Y_val: Validation features and targets, used for the reported
            validation loss and for early stopping.
        hidden_layers: Width of each hidden layer (not mutated).
        lr: Learning rate.
        epochs: Maximum number of passes over the training data.
        batch_size: Mini-batch size; None trains on the full set each epoch.
        activations: Activation name per layer; defaults to ReLU hidden
            layers followed by softmax.
        loss_function: ``"cross_entropy"`` selects cross-entropy, any other
            value selects MSE, for the *reported* training and validation
            losses. The gradients always use the softmax/cross-entropy form
            implemented in ``backward_propagation``.
        optimizer_type: Forwarded to ``update_parameters``.
        dropout_rates: Drop probability per hidden layer, or None for no
            dropout (not mutated).
        weight_decay: L2 coefficient forwarded to ``backward_propagation``.
        early_stopping: Stop once the validation loss has not improved for
            ``patience`` consecutive epochs and restore the best weights.
        patience: Number of non-improving epochs tolerated.
        print_every: Print losses every this many epochs.

    Returns:
        The parameter dict: the best-validation snapshot if early stopping
        triggered, otherwise the weights after the final epoch.
    """
    input_dim, output_dim = X_train.shape[1], Y_train.shape[1]
    layer_dims = [input_dim] + list(hidden_layers) + [output_dim]
    if activations is None:
        activations = ["relu"] * len(hidden_layers) + ["softmax"]

    # Use the same loss for the training and validation figures.
    loss_fn = cross_entropy_loss if loss_function == "cross_entropy" else mse_loss

    params = initialize_parameters(layer_dims)
    velocity = None
    best_loss = np.inf
    # The optimizer updates the arrays in place, so a snapshot must copy the
    # arrays themselves; a shallow dict copy would keep tracking the live
    # weights. Seeding it here also guarantees it exists if the validation
    # loss never improves (e.g. it is NaN from the first epoch).
    best_params = {name: value.copy() for name, value in params.items()}
    patience_counter = 0

    for epoch in range(epochs):
        # Shuffle
        perm = np.random.permutation(X_train.shape[0])
        X_shuffled, Y_shuffled = X_train[perm], Y_train[perm]

        # Mini-batch iteration
        if batch_size is None:
            batches = [(X_shuffled, Y_shuffled)]
        else:
            batches = [
                (X_shuffled[i:i + batch_size], Y_shuffled[i:i + batch_size])
                for i in range(0, X_shuffled.shape[0], batch_size)
            ]

        # Training by batch
        epoch_loss = 0.0
        for X_batch, Y_batch in batches:
            A_out, cache, dropout_masks = forward_propagation(X_batch, params, activations, dropout_rates)
            epoch_loss += loss_fn(Y_batch, A_out)
            grads = backward_propagation(Y_batch, params, cache, activations, dropout_masks, dropout_rates, weight_decay)
            params, velocity = update_parameters(params, grads, lr, optimizer_type, velocity)

        # Validation loss (dropout disabled)
        A_val, _, _ = forward_propagation(X_val, params, activations, training=False)
        val_loss = loss_fn(Y_val, A_val)
        loss = epoch_loss / len(batches)

        if epoch % print_every == 0:
            print(f"Epoch {epoch+1:3d}/{epochs} | Train Loss: {loss:.4f} | Val Loss: {val_loss:.4f}")

        # Early stopping
        if early_stopping:
            if val_loss < best_loss:
                best_loss = val_loss
                patience_counter = 0
                best_params = {name: value.copy() for name, value in params.items()}
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    params = best_params
                    break

    return params


# =======================================
# 9. Prediction & Evaluation
# =======================================
def predict(X, params, activations):
    """Return the predicted class index for every row of ``X``.

    Runs an inference-mode forward pass (``training=False``) and takes the
    arg-max over the class axis of the output.
    """
    outputs = forward_propagation(X, params, activations, training=False)
    class_scores = outputs[0]
    return np.argmax(class_scores, axis=0)

def accuracy_score(y_true, y_pred):
    """Return the percentage (0-100) of positions where the labels agree."""
    hits = y_true == y_pred
    return np.mean(hits) * 100


# =======================================
# 10. Hyperparameter Optimization with Optuna
# =======================================
def objective(trial, loss_function="cross_entropy", epochs=1000):
    """Optuna objective: train one sampled configuration, return its val loss.

    Samples the architecture (depth, widths, per-layer activation and
    dropout) and the optimization settings from ``trial``, trains on the
    module-level ``X_train``/``y_train`` with ``X_val``/``y_val`` for early
    stopping, and scores the returned parameters on the validation split.

    Args:
        trial: Optuna trial used to draw hyperparameters.
        loss_function: ``"cross_entropy"`` or anything else for MSE.
        epochs: Maximum number of training epochs.

    Returns:
        Validation loss of the trained network (lower is better).
    """
    # Hyperparameters to search (suggestion order is kept stable)
    depth = trial.suggest_int("n_layers", 1, 3)

    hidden_layers = []
    for layer in range(depth):
        hidden_layers.append(trial.suggest_int(f"n_units_l{layer}", 5, 128))

    lr = trial.suggest_float("lr", 1e-4, 0.1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64])

    dropout_rates = []
    for layer in range(depth):
        dropout_rates.append(trial.suggest_float(f"dropout_l{layer}", 0.0, 0.5))

    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    optimizer_type = trial.suggest_categorical("optimizer", ["sgd", "momentum"])
    stopping_patience = trial.suggest_int("stopping_patience", 50, 500)

    # One activation per hidden layer; the output layer is always softmax
    possible_activations = ["relu", "sigmoid", "tanh", "leaky_relu"]
    activations = []
    for layer in range(depth):
        activations.append(trial.suggest_categorical(f"activation_l{layer}", possible_activations))
    activations.append("softmax")

    # Train network
    training_options = dict(
        hidden_layers=hidden_layers,
        lr=lr,
        epochs=epochs,
        batch_size=batch_size,
        activations=activations,
        loss_function=loss_function,
        optimizer_type=optimizer_type,
        dropout_rates=dropout_rates,
        weight_decay=weight_decay,
        early_stopping=True,
        patience=stopping_patience,
        print_every=100,
    )
    params = train_neural_network(X_train, y_train, X_val, y_val, **training_options)

    # Optuna minimizes the value returned here
    A_val = forward_propagation(X_val, params, activations, training=False)[0]
    if loss_function == "cross_entropy":
        return cross_entropy_loss(y_val, A_val)
    return mse_loss(y_val, A_val)

# Run Optuna study
# Fixed settings shared by every trial; partial() binds them so that Optuna
# can call the objective with the trial as its only argument.
loss_function = "cross_entropy"
epochs = 1000
obj = partial(objective, loss_function=loss_function, epochs=epochs)
# NOTE(review): a MedianPruner is configured, but the objective in this file
# never calls trial.report() / trial.should_prune(), so presumably no trial
# is ever pruned — confirm, or report the per-epoch validation loss.
study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
study.optimize(obj, n_trials=30)
# Summarize the trial with the lowest validation loss.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value:.4f}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


# =======================================
# 11. Retrain Best Configuration and Evaluate on Test
# =======================================
# Rebuild the network configuration from the best trial's sampled values.
best_params = trial.params
n_layers = best_params["n_layers"]
hidden_layers = [best_params[f"n_units_l{i}"] for i in range(n_layers)]
lr = best_params["lr"]
batch_size = best_params["batch_size"]
dropout_rates = [best_params[f"dropout_l{i}"] for i in range(n_layers)]
weight_decay = best_params["weight_decay"]
optimizer_type = best_params["optimizer"]
stopping_patience = best_params["stopping_patience"]
activations = [best_params[f"activation_l{i}"] for i in range(n_layers)]
activations.append("softmax")  # output layer always softmax

# Retrain on the training split only; the validation split is still passed
# separately and drives early stopping (the two sets are NOT combined here).
final_params = train_neural_network(
    X_train, 
    y_train,
    X_val, 
    y_val,
    hidden_layers=hidden_layers,
    lr=lr,
    epochs=epochs,
    batch_size=batch_size,
    activations=activations,
    loss_function=loss_function,
    optimizer_type=optimizer_type,
    dropout_rates=dropout_rates,
    weight_decay=weight_decay,
    early_stopping=True,
    patience=stopping_patience,
    print_every=100
)

# Evaluate on test set
# y_test is one-hot, so arg-max over the class axis recovers integer labels.
y_test_pred = predict(X_test, final_params, activations)
y_test_true = np.argmax(y_test, axis=1)
test_accuracy = accuracy_score(y_test_true, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.2f}%")

[I 2025-10-25 15:36:13,537] A new study created in memory with name: no-name-ae0d4c8c-5584-451c-80bf-16526d0dd98c


(104, 4) (23, 4) (23, 4)
Epoch   1/1000 | Train Loss: 1.0897 | Val Loss: 1.0276
Epoch 101/1000 | Train Loss: 0.6610 | Val Loss: 0.6826
Epoch 201/1000 | Train Loss: 0.5436 | Val Loss: 0.5623
Epoch 301/1000 | Train Loss: 0.4724 | Val Loss: 0.5040
Epoch 401/1000 | Train Loss: 0.4478 | Val Loss: 0.4691
Epoch 501/1000 | Train Loss: 0.4102 | Val Loss: 0.4453
Epoch 601/1000 | Train Loss: 0.4061 | Val Loss: 0.4272
Epoch 701/1000 | Train Loss: 0.3693 | Val Loss: 0.4126


[I 2025-10-25 15:36:14,120] Trial 0 finished with value: 0.37874910825272123 and parameters: {'n_layers': 1, 'n_units_l0': 45, 'lr': 0.0005207830247539784, 'batch_size': 16, 'dropout_l0': 0.2314208190583974, 'weight_decay': 6.0034171424595095e-05, 'optimizer': 'momentum', 'stopping_patience': 436, 'activation_l0': 'tanh'}. Best is trial 0 with value: 0.37874910825272123.


Epoch 801/1000 | Train Loss: 0.3501 | Val Loss: 0.3998
Epoch 901/1000 | Train Loss: 0.3374 | Val Loss: 0.3887
Epoch   1/1000 | Train Loss: 1.0815 | Val Loss: 1.0435
Epoch 101/1000 | Train Loss: 0.5841 | Val Loss: 0.5791
Epoch 201/1000 | Train Loss: 0.4161 | Val Loss: 0.4617
Epoch 301/1000 | Train Loss: 0.3168 | Val Loss: 0.4042
Epoch 401/1000 | Train Loss: 0.2781 | Val Loss: 0.3651
Epoch 501/1000 | Train Loss: 0.2512 | Val Loss: 0.3340
Epoch 601/1000 | Train Loss: 0.2302 | Val Loss: 0.3075
Epoch 701/1000 | Train Loss: 0.2127 | Val Loss: 0.2845
Epoch 801/1000 | Train Loss: 0.2016 | Val Loss: 0.2642
Epoch 901/1000 | Train Loss: 0.1792 | Val Loss: 0.2476


[I 2025-10-25 15:36:14,804] Trial 1 finished with value: 0.2331473661518831 and parameters: {'n_layers': 2, 'n_units_l0': 21, 'n_units_l1': 124, 'lr': 0.0018926228140705523, 'batch_size': 32, 'dropout_l0': 0.008331749270429845, 'dropout_l1': 0.07679814715820144, 'weight_decay': 5.028146803675402e-06, 'optimizer': 'momentum', 'stopping_patience': 82, 'activation_l0': 'tanh', 'activation_l1': 'leaky_relu'}. Best is trial 1 with value: 0.2331473661518831.


Epoch   1/1000 | Train Loss: 1.2614 | Val Loss: 1.2206
Epoch 101/1000 | Train Loss: 1.1011 | Val Loss: 1.0817
Epoch 201/1000 | Train Loss: 1.0729 | Val Loss: 1.0585
Epoch 301/1000 | Train Loss: 1.0536 | Val Loss: 1.0458
Epoch 401/1000 | Train Loss: 1.0484 | Val Loss: 1.0319
Epoch 501/1000 | Train Loss: 1.0158 | Val Loss: 1.0141
Epoch 601/1000 | Train Loss: 1.0071 | Val Loss: 0.9919
Epoch 701/1000 | Train Loss: 0.9656 | Val Loss: 0.9638


[I 2025-10-25 15:36:15,379] Trial 2 finished with value: 0.8467704532641122 and parameters: {'n_layers': 3, 'n_units_l0': 65, 'n_units_l1': 39, 'n_units_l2': 7, 'lr': 0.004915785051576088, 'batch_size': 64, 'dropout_l0': 0.03950234985184875, 'dropout_l1': 0.46438685213491304, 'dropout_l2': 0.08055494328221519, 'weight_decay': 0.00024294026162951425, 'optimizer': 'sgd', 'stopping_patience': 84, 'activation_l0': 'sigmoid', 'activation_l1': 'relu', 'activation_l2': 'sigmoid'}. Best is trial 1 with value: 0.2331473661518831.


Epoch 801/1000 | Train Loss: 0.9538 | Val Loss: 0.9298
Epoch 901/1000 | Train Loss: 0.9140 | Val Loss: 0.8903
Epoch   1/1000 | Train Loss: 1.1012 | Val Loss: 1.0891
Epoch 101/1000 | Train Loss: 0.5907 | Val Loss: 0.5909
Epoch 201/1000 | Train Loss: 0.4390 | Val Loss: 0.4639
Epoch 301/1000 | Train Loss: 0.3751 | Val Loss: 0.3988
Epoch 401/1000 | Train Loss: 0.3514 | Val Loss: 0.3593
Epoch 501/1000 | Train Loss: 0.3130 | Val Loss: 0.3312
Epoch 601/1000 | Train Loss: 0.2690 | Val Loss: 0.3092
Epoch 701/1000 | Train Loss: 0.2714 | Val Loss: 0.2882
Epoch 801/1000 | Train Loss: 0.2201 | Val Loss: 0.2679


[I 2025-10-25 15:36:15,745] Trial 3 finished with value: 0.23210459163749428 and parameters: {'n_layers': 2, 'n_units_l0': 99, 'n_units_l1': 21, 'lr': 0.004052766545841572, 'batch_size': 64, 'dropout_l0': 0.06460630414893498, 'dropout_l1': 0.0047656502944994905, 'weight_decay': 9.435170827525996e-06, 'optimizer': 'sgd', 'stopping_patience': 445, 'activation_l0': 'relu', 'activation_l1': 'leaky_relu'}. Best is trial 3 with value: 0.23210459163749428.


Epoch 901/1000 | Train Loss: 0.2209 | Val Loss: 0.2492
Epoch   1/1000 | Train Loss: 1.1774 | Val Loss: 1.2078
Epoch 101/1000 | Train Loss: 0.7460 | Val Loss: 0.7004
Epoch 201/1000 | Train Loss: 0.5083 | Val Loss: 0.4879
Epoch 301/1000 | Train Loss: 0.3944 | Val Loss: 0.3851
Epoch 401/1000 | Train Loss: 0.3361 | Val Loss: 0.3264
Epoch 501/1000 | Train Loss: 0.3275 | Val Loss: 0.2806
Epoch 601/1000 | Train Loss: 0.2918 | Val Loss: 0.2439


[I 2025-10-25 15:36:16,012] Trial 4 finished with value: 0.14893865457223696 and parameters: {'n_layers': 2, 'n_units_l0': 31, 'n_units_l1': 20, 'lr': 0.008057178852216412, 'batch_size': 64, 'dropout_l0': 0.23324603704452251, 'dropout_l1': 0.21281893749470238, 'weight_decay': 4.132144012411828e-06, 'optimizer': 'sgd', 'stopping_patience': 291, 'activation_l0': 'relu', 'activation_l1': 'relu'}. Best is trial 4 with value: 0.14893865457223696.


Epoch 701/1000 | Train Loss: 0.2571 | Val Loss: 0.2136
Epoch 801/1000 | Train Loss: 0.2661 | Val Loss: 0.1874
Epoch 901/1000 | Train Loss: 0.1832 | Val Loss: 0.1657
Epoch   1/1000 | Train Loss: 1.2165 | Val Loss: 1.1526
Epoch 101/1000 | Train Loss: 1.0786 | Val Loss: 1.0792
Epoch 201/1000 | Train Loss: 1.0760 | Val Loss: 1.0365
Epoch 301/1000 | Train Loss: 1.0312 | Val Loss: 1.0072
Epoch 401/1000 | Train Loss: 1.0207 | Val Loss: 0.9834
Epoch 501/1000 | Train Loss: 0.9988 | Val Loss: 0.9624
Epoch 601/1000 | Train Loss: 1.0213 | Val Loss: 0.9425
Epoch 701/1000 | Train Loss: 0.9899 | Val Loss: 0.9244


[I 2025-10-25 15:36:16,478] Trial 5 finished with value: 0.8744105230292942 and parameters: {'n_layers': 1, 'n_units_l0': 61, 'lr': 0.00028085814484299235, 'batch_size': 32, 'dropout_l0': 0.3169257505754408, 'weight_decay': 0.0017455140739380862, 'optimizer': 'momentum', 'stopping_patience': 118, 'activation_l0': 'sigmoid'}. Best is trial 4 with value: 0.14893865457223696.


Epoch 801/1000 | Train Loss: 0.9118 | Val Loss: 0.9070
Epoch 901/1000 | Train Loss: 0.9354 | Val Loss: 0.8900
Epoch   1/1000 | Train Loss: 0.9743 | Val Loss: 1.0151
Epoch 101/1000 | Train Loss: 0.5200 | Val Loss: 0.5131
Epoch 201/1000 | Train Loss: 0.3827 | Val Loss: 0.4349
Epoch 301/1000 | Train Loss: 0.3244 | Val Loss: 0.3964
Epoch 401/1000 | Train Loss: 0.3472 | Val Loss: 0.3686
Epoch 501/1000 | Train Loss: 0.2361 | Val Loss: 0.3470
Epoch 601/1000 | Train Loss: 0.2587 | Val Loss: 0.3291
Epoch 701/1000 | Train Loss: 0.2328 | Val Loss: 0.3140
Epoch 801/1000 | Train Loss: 0.2347 | Val Loss: 0.3002


[I 2025-10-25 15:36:16,877] Trial 6 finished with value: 0.27989312219803814 and parameters: {'n_layers': 1, 'n_units_l0': 98, 'lr': 0.003004003376550795, 'batch_size': 32, 'dropout_l0': 0.13274719291331794, 'weight_decay': 0.00117508289065841, 'optimizer': 'momentum', 'stopping_patience': 492, 'activation_l0': 'leaky_relu'}. Best is trial 4 with value: 0.14893865457223696.


Epoch 901/1000 | Train Loss: 0.2147 | Val Loss: 0.2893
Epoch   1/1000 | Train Loss: 1.1217 | Val Loss: 1.0834
Epoch 101/1000 | Train Loss: 1.1199 | Val Loss: 1.0678
Epoch 201/1000 | Train Loss: 1.1112 | Val Loss: 1.0537
Epoch 301/1000 | Train Loss: 1.0983 | Val Loss: 1.0396
Epoch 401/1000 | Train Loss: 1.0405 | Val Loss: 1.0257
Epoch 501/1000 | Train Loss: 1.0255 | Val Loss: 1.0110
Epoch 601/1000 | Train Loss: 1.0179 | Val Loss: 0.9959
Epoch 701/1000 | Train Loss: 1.0498 | Val Loss: 0.9794
Epoch 801/1000 | Train Loss: 1.0322 | Val Loss: 0.9628
Epoch 901/1000 | Train Loss: 0.9787 | Val Loss: 0.9455


[I 2025-10-25 15:36:18,040] Trial 7 finished with value: 0.9279401314608274 and parameters: {'n_layers': 3, 'n_units_l0': 64, 'n_units_l1': 81, 'n_units_l2': 55, 'lr': 0.00027733851592973564, 'batch_size': 32, 'dropout_l0': 0.3685984562890927, 'dropout_l1': 0.34471301004476634, 'dropout_l2': 0.2399611660173076, 'weight_decay': 6.944715858749013e-06, 'optimizer': 'sgd', 'stopping_patience': 448, 'activation_l0': 'tanh', 'activation_l1': 'sigmoid', 'activation_l2': 'relu'}. Best is trial 4 with value: 0.14893865457223696.


Epoch   1/1000 | Train Loss: 1.0218 | Val Loss: 0.9810
Epoch 101/1000 | Train Loss: 0.1269 | Val Loss: 0.1901
Epoch 201/1000 | Train Loss: 0.0843 | Val Loss: 0.1297
Epoch 301/1000 | Train Loss: 0.0649 | Val Loss: 0.1031
Epoch 401/1000 | Train Loss: 0.0548 | Val Loss: 0.0768
Epoch 501/1000 | Train Loss: 0.0418 | Val Loss: 0.0456


[I 2025-10-25 15:36:18,586] Trial 8 finished with value: 0.05050242116216102 and parameters: {'n_layers': 3, 'n_units_l0': 37, 'n_units_l1': 122, 'n_units_l2': 40, 'lr': 0.025215586240535274, 'batch_size': 32, 'dropout_l0': 0.02939916410855542, 'dropout_l1': 0.38855223048652665, 'dropout_l2': 0.45935322923496974, 'weight_decay': 5.9682816984732e-06, 'optimizer': 'sgd', 'stopping_patience': 72, 'activation_l0': 'leaky_relu', 'activation_l1': 'leaky_relu', 'activation_l2': 'tanh'}. Best is trial 8 with value: 0.05050242116216102.


Early stopping triggered.
Epoch   1/1000 | Train Loss: 1.2902 | Val Loss: 1.1805
Epoch 101/1000 | Train Loss: 0.9492 | Val Loss: 0.9250
Epoch 201/1000 | Train Loss: 0.8368 | Val Loss: 0.6910
Epoch 301/1000 | Train Loss: 0.7157 | Val Loss: 0.5699
Epoch 401/1000 | Train Loss: 0.6115 | Val Loss: 0.5047
Epoch 501/1000 | Train Loss: 0.5576 | Val Loss: 0.4748
Epoch 601/1000 | Train Loss: 0.5480 | Val Loss: 0.4493
Epoch 701/1000 | Train Loss: 0.6040 | Val Loss: 0.4329
Epoch 801/1000 | Train Loss: 0.5530 | Val Loss: 0.4206


[I 2025-10-25 15:36:19,501] Trial 9 finished with value: 0.3961786033172614 and parameters: {'n_layers': 3, 'n_units_l0': 96, 'n_units_l1': 12, 'n_units_l2': 7, 'lr': 0.00859481386121662, 'batch_size': 32, 'dropout_l0': 0.04783503998965344, 'dropout_l1': 0.46646365616484264, 'dropout_l2': 0.41869594542248045, 'weight_decay': 5.7463618182687305e-06, 'optimizer': 'momentum', 'stopping_patience': 341, 'activation_l0': 'sigmoid', 'activation_l1': 'leaky_relu', 'activation_l2': 'relu'}. Best is trial 8 with value: 0.05050242116216102.


Epoch 901/1000 | Train Loss: 0.5530 | Val Loss: 0.4203
Epoch   1/1000 | Train Loss: 0.6496 | Val Loss: 0.4159
Epoch 101/1000 | Train Loss: 0.0746 | Val Loss: 0.0124
Epoch 201/1000 | Train Loss: 0.0598 | Val Loss: 0.0197
Epoch 301/1000 | Train Loss: 0.0113 | Val Loss: 0.0044
Epoch 401/1000 | Train Loss: 0.0380 | Val Loss: 0.0024
Epoch 501/1000 | Train Loss: 0.0196 | Val Loss: 0.0013
Epoch 601/1000 | Train Loss: 0.0070 | Val Loss: 0.0038
Epoch 701/1000 | Train Loss: 0.0079 | Val Loss: 0.0007
Epoch 801/1000 | Train Loss: 0.0032 | Val Loss: 0.0008
Epoch 901/1000 | Train Loss: 0.0043 | Val Loss: 0.0026


[I 2025-10-25 15:36:23,326] Trial 10 finished with value: 0.0008098299657081411 and parameters: {'n_layers': 3, 'n_units_l0': 128, 'n_units_l1': 125, 'n_units_l2': 128, 'lr': 0.08872966878731414, 'batch_size': 8, 'dropout_l0': 0.4719034296249297, 'dropout_l1': 0.30899072246268755, 'dropout_l2': 0.4933546876716215, 'weight_decay': 5.1478572239525056e-05, 'optimizer': 'sgd', 'stopping_patience': 211, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 0.8143 | Val Loss: 0.4127
Epoch 101/1000 | Train Loss: 0.1583 | Val Loss: 0.0319
Epoch 201/1000 | Train Loss: 0.0738 | Val Loss: 0.0222
Epoch 301/1000 | Train Loss: 0.0582 | Val Loss: 0.0198
Epoch 401/1000 | Train Loss: 0.0077 | Val Loss: 0.0051
Epoch 501/1000 | Train Loss: 0.0138 | Val Loss: 0.0038
Epoch 601/1000 | Train Loss: 0.0058 | Val Loss: 0.0020
Epoch 701/1000 | Train Loss: 0.0292 | Val Loss: 0.0043


[I 2025-10-25 15:36:26,326] Trial 11 finished with value: 0.002470345631258403 and parameters: {'n_layers': 3, 'n_units_l0': 125, 'n_units_l1': 121, 'n_units_l2': 127, 'lr': 0.06189492399438224, 'batch_size': 8, 'dropout_l0': 0.4760828039800009, 'dropout_l1': 0.31210321869749297, 'dropout_l2': 0.48060735702850393, 'weight_decay': 6.74794948918791e-05, 'optimizer': 'sgd', 'stopping_patience': 189, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Early stopping triggered.
Epoch   1/1000 | Train Loss: 0.7334 | Val Loss: 0.4292
Epoch 101/1000 | Train Loss: 0.0649 | Val Loss: 0.0250
Epoch 201/1000 | Train Loss: 0.0539 | Val Loss: 0.0274
Epoch 301/1000 | Train Loss: 0.0259 | Val Loss: 0.0053
Epoch 401/1000 | Train Loss: 0.0068 | Val Loss: 0.0076
Epoch 501/1000 | Train Loss: 0.0040 | Val Loss: 0.0033
Epoch 601/1000 | Train Loss: 0.0268 | Val Loss: 0.0092
Epoch 701/1000 | Train Loss: 0.0123 | Val Loss: 0.0014
Epoch 801/1000 | Train Loss: 0.0188 | Val Loss: 0.0013
Epoch 901/1000 | Train Loss: 0.0034 | Val Loss: 0.0013


[I 2025-10-25 15:36:29,755] Trial 12 finished with value: 0.0037352296020695995 and parameters: {'n_layers': 3, 'n_units_l0': 125, 'n_units_l1': 95, 'n_units_l2': 128, 'lr': 0.0637699925392565, 'batch_size': 8, 'dropout_l0': 0.4718696412948281, 'dropout_l1': 0.24413834719133418, 'dropout_l2': 0.49821229417496943, 'weight_decay': 4.581544437449348e-05, 'optimizer': 'sgd', 'stopping_patience': 188, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 0.8569 | Val Loss: 0.4838
Epoch 101/1000 | Train Loss: 0.0864 | Val Loss: 0.0789
Epoch 201/1000 | Train Loss: 0.0480 | Val Loss: 0.0380
Epoch 301/1000 | Train Loss: 0.0250 | Val Loss: 0.0030
Epoch 401/1000 | Train Loss: 0.0193 | Val Loss: 0.0122
Epoch 501/1000 | Train Loss: 0.0135 | Val Loss: 0.0092
Epoch 601/1000 | Train Loss: 0.0207 | Val Loss: 0.0017
Epoch 701/1000 | Train Loss: 0.0228 | Val Loss: 0.0020


[I 2025-10-25 15:36:32,598] Trial 13 finished with value: 0.0019782347225033095 and parameters: {'n_layers': 3, 'n_units_l0': 128, 'n_units_l1': 101, 'n_units_l2': 125, 'lr': 0.0927274240322479, 'batch_size': 8, 'dropout_l0': 0.49369103676843357, 'dropout_l1': 0.31410305495149665, 'dropout_l2': 0.3345539695248506, 'weight_decay': 0.0002841899248422817, 'optimizer': 'sgd', 'stopping_patience': 197, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch 801/1000 | Train Loss: 0.0092 | Val Loss: 0.0013
Early stopping triggered.
Epoch   1/1000 | Train Loss: 0.8711 | Val Loss: 0.6488
Epoch 101/1000 | Train Loss: 0.0911 | Val Loss: 0.1425
Epoch 201/1000 | Train Loss: 0.0641 | Val Loss: 0.1126
Epoch 301/1000 | Train Loss: 0.1095 | Val Loss: 0.0956
Epoch 401/1000 | Train Loss: 0.0865 | Val Loss: 0.0677
Epoch 501/1000 | Train Loss: 0.0637 | Val Loss: 0.0561
Epoch 601/1000 | Train Loss: 0.0284 | Val Loss: 0.0525
Epoch 701/1000 | Train Loss: 0.0395 | Val Loss: 0.0452
Epoch 801/1000 | Train Loss: 0.0259 | Val Loss: 0.0395


[I 2025-10-25 15:36:34,636] Trial 14 finished with value: 0.048260601711895224 and parameters: {'n_layers': 2, 'n_units_l0': 111, 'n_units_l1': 97, 'lr': 0.021666597515559036, 'batch_size': 8, 'dropout_l0': 0.40184626470315665, 'dropout_l1': 0.17764493522965882, 'weight_decay': 0.0003555273994791307, 'optimizer': 'sgd', 'stopping_patience': 199, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch 901/1000 | Train Loss: 0.0268 | Val Loss: 0.0407
Epoch   1/1000 | Train Loss: 0.9372 | Val Loss: 0.6062
Epoch 101/1000 | Train Loss: 0.0761 | Val Loss: 0.0291
Epoch 201/1000 | Train Loss: 0.0688 | Val Loss: 0.0407
Epoch 301/1000 | Train Loss: 0.0595 | Val Loss: 0.0229
Epoch 401/1000 | Train Loss: 0.1006 | Val Loss: 0.0363
Epoch 501/1000 | Train Loss: 0.0801 | Val Loss: 0.0437


[I 2025-10-25 15:36:36,250] Trial 15 finished with value: 0.04879264300678085 and parameters: {'n_layers': 3, 'n_units_l0': 83, 'n_units_l1': 59, 'n_units_l2': 95, 'lr': 0.09069674271591571, 'batch_size': 8, 'dropout_l0': 0.4972210352130826, 'dropout_l1': 0.30515110711633076, 'dropout_l2': 0.29397310325487286, 'weight_decay': 0.005056996130137528, 'optimizer': 'sgd', 'stopping_patience': 270, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Early stopping triggered.
Epoch   1/1000 | Train Loss: 0.8248 | Val Loss: 0.5713
Epoch 101/1000 | Train Loss: 0.0641 | Val Loss: 0.1128
Epoch 201/1000 | Train Loss: 0.0511 | Val Loss: 0.0774
Epoch 301/1000 | Train Loss: 0.0424 | Val Loss: 0.0698
Epoch 401/1000 | Train Loss: 0.0311 | Val Loss: 0.0644
Epoch 501/1000 | Train Loss: 0.0756 | Val Loss: 0.0317
Epoch 601/1000 | Train Loss: 0.0734 | Val Loss: 0.0313
Epoch 701/1000 | Train Loss: 0.0267 | Val Loss: 0.0294
Epoch 801/1000 | Train Loss: 0.0141 | Val Loss: 0.0262
Epoch 901/1000 | Train Loss: 0.0301 | Val Loss: 0.0335


[I 2025-10-25 15:36:38,335] Trial 16 finished with value: 0.030044328997724 and parameters: {'n_layers': 2, 'n_units_l0': 114, 'n_units_l1': 102, 'lr': 0.03149501144921614, 'batch_size': 8, 'dropout_l0': 0.4094008047745873, 'dropout_l1': 0.14637368883371288, 'weight_decay': 1.1542418955149678e-06, 'optimizer': 'sgd', 'stopping_patience': 256, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 1.1794 | Val Loss: 1.0859
Epoch 101/1000 | Train Loss: 1.0487 | Val Loss: 1.0321
Epoch 201/1000 | Train Loss: 0.9928 | Val Loss: 0.9898
Epoch 301/1000 | Train Loss: 0.9620 | Val Loss: 0.9400
Epoch 401/1000 | Train Loss: 0.9112 | Val Loss: 0.8790
Epoch 501/1000 | Train Loss: 0.9042 | Val Loss: 0.8064
Epoch 601/1000 | Train Loss: 0.8729 | Val Loss: 0.7319
Epoch 701/1000 | Train Loss: 0.7941 | Val Loss: 0.6586
Epoch 801/1000 | Train Loss: 0.7178 | Val Loss: 0.5923
Epoch 901/1000 | Train Loss: 0.7024 | Val Loss: 0.5400


[I 2025-10-25 15:36:40,075] Trial 17 finished with value: 0.4991736127306725 and parameters: {'n_layers': 3, 'n_units_l0': 82, 'n_units_l1': 65, 'n_units_l2': 102, 'lr': 0.000828702001462736, 'batch_size': 16, 'dropout_l0': 0.3017112406618301, 'dropout_l1': 0.3920589136903435, 'dropout_l2': 0.31702897179459183, 'weight_decay': 2.4392063138872768e-05, 'optimizer': 'sgd', 'stopping_patience': 350, 'activation_l0': 'leaky_relu', 'activation_l1': 'sigmoid', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 1.0115 | Val Loss: 0.7894
Epoch 101/1000 | Train Loss: 0.1034 | Val Loss: 0.1653
Epoch 201/1000 | Train Loss: 0.0677 | Val Loss: 0.1201
Epoch 301/1000 | Train Loss: 0.0548 | Val Loss: 0.0928
Epoch 401/1000 | Train Loss: 0.0690 | Val Loss: 0.0776
Epoch 501/1000 | Train Loss: 0.0573 | Val Loss: 0.0597
Epoch 601/1000 | Train Loss: 0.0337 | Val Loss: 0.0531


[I 2025-10-25 15:36:41,688] Trial 18 finished with value: 0.055708465877917134 and parameters: {'n_layers': 2, 'n_units_l0': 127, 'n_units_l1': 108, 'lr': 0.01605757243398944, 'batch_size': 8, 'dropout_l0': 0.43125082086952066, 'dropout_l1': 0.2656425938684959, 'weight_decay': 0.00041471397318899986, 'optimizer': 'sgd', 'stopping_patience': 135, 'activation_l0': 'relu', 'activation_l1': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch 701/1000 | Train Loss: 0.0498 | Val Loss: 0.0597
Early stopping triggered.
Epoch   1/1000 | Train Loss: 0.8676 | Val Loss: 0.6292
Epoch 101/1000 | Train Loss: 0.0861 | Val Loss: 0.1299
Epoch 201/1000 | Train Loss: 0.0536 | Val Loss: 0.0308
Epoch 301/1000 | Train Loss: 0.0572 | Val Loss: 0.0089
Epoch 401/1000 | Train Loss: 0.0167 | Val Loss: 0.0136
Epoch 501/1000 | Train Loss: 0.0086 | Val Loss: 0.0130
Epoch 601/1000 | Train Loss: 0.0980 | Val Loss: 0.0160
Epoch 701/1000 | Train Loss: 0.0102 | Val Loss: 0.0111
Epoch 801/1000 | Train Loss: 0.0059 | Val Loss: 0.0056
Epoch 901/1000 | Train Loss: 0.0065 | Val Loss: 0.0025


[I 2025-10-25 15:36:44,654] Trial 19 finished with value: 0.0021966952562533096 and parameters: {'n_layers': 3, 'n_units_l0': 109, 'n_units_l1': 81, 'n_units_l2': 95, 'lr': 0.04272489785678072, 'batch_size': 8, 'dropout_l0': 0.34227253698678295, 'dropout_l1': 0.3822688570803103, 'dropout_l2': 0.1681944955894049, 'weight_decay': 0.00016814791061088734, 'optimizer': 'sgd', 'stopping_patience': 228, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 0.8310 | Val Loss: 0.4805
Epoch 101/1000 | Train Loss: 0.2642 | Val Loss: 0.0945
Epoch 201/1000 | Train Loss: 0.1315 | Val Loss: 0.0432
Epoch 301/1000 | Train Loss: 0.1073 | Val Loss: 0.0322


[I 2025-10-25 15:36:45,143] Trial 20 finished with value: 0.0730311311567033 and parameters: {'n_layers': 2, 'n_units_l0': 10, 'n_units_l1': 82, 'lr': 0.09647622930666701, 'batch_size': 8, 'dropout_l0': 0.44878614621483737, 'dropout_l1': 0.13352891936846464, 'weight_decay': 0.0010084355906754964, 'optimizer': 'sgd', 'stopping_patience': 155, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Early stopping triggered.
Epoch   1/1000 | Train Loss: 1.0233 | Val Loss: 0.6409
Epoch 101/1000 | Train Loss: 0.0919 | Val Loss: 0.1174
Epoch 201/1000 | Train Loss: 0.0391 | Val Loss: 0.0674
Epoch 301/1000 | Train Loss: 0.0547 | Val Loss: 0.0326
Epoch 401/1000 | Train Loss: 0.0127 | Val Loss: 0.0283
Epoch 501/1000 | Train Loss: 0.0111 | Val Loss: 0.0176
Epoch 601/1000 | Train Loss: 0.0297 | Val Loss: 0.0176
Epoch 701/1000 | Train Loss: 0.0061 | Val Loss: 0.0069
Epoch 801/1000 | Train Loss: 0.0050 | Val Loss: 0.0017
Epoch 901/1000 | Train Loss: 0.0017 | Val Loss: 0.0031


[I 2025-10-25 15:36:48,200] Trial 21 finished with value: 0.005075973221527518 and parameters: {'n_layers': 3, 'n_units_l0': 110, 'n_units_l1': 82, 'n_units_l2': 102, 'lr': 0.04291024822075367, 'batch_size': 8, 'dropout_l0': 0.36151932117883856, 'dropout_l1': 0.39006999355882993, 'dropout_l2': 0.1706639227391679, 'weight_decay': 0.00015598597730449732, 'optimizer': 'sgd', 'stopping_patience': 237, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 1.0856 | Val Loss: 0.9862
Epoch 101/1000 | Train Loss: 0.9285 | Val Loss: 0.8205
Epoch 201/1000 | Train Loss: 0.7900 | Val Loss: 0.7086
Epoch 301/1000 | Train Loss: 0.6891 | Val Loss: 0.6324
Epoch 401/1000 | Train Loss: 0.6153 | Val Loss: 0.5778
Epoch 501/1000 | Train Loss: 0.5766 | Val Loss: 0.5380
Epoch 601/1000 | Train Loss: 0.5505 | Val Loss: 0.5079
Epoch 701/1000 | Train Loss: 0.5085 | Val Loss: 0.4846
Epoch 801/1000 | Train Loss: 0.4923 | Val Loss: 0.4663
Epoch 901/1000 | Train Loss: 0.4508 | Val Loss: 0.4512


[I 2025-10-25 15:36:51,592] Trial 22 finished with value: 0.43900400716715354 and parameters: {'n_layers': 3, 'n_units_l0': 116, 'n_units_l1': 112, 'n_units_l2': 112, 'lr': 0.00010906517400063499, 'batch_size': 8, 'dropout_l0': 0.29677888403716923, 'dropout_l1': 0.3462244140469623, 'dropout_l2': 0.14796423967800904, 'weight_decay': 2.5181051790678464e-05, 'optimizer': 'sgd', 'stopping_patience': 310, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 0.8452 | Val Loss: 0.6492
Epoch 101/1000 | Train Loss: 0.0920 | Val Loss: 0.0873
Epoch 201/1000 | Train Loss: 0.0429 | Val Loss: 0.0544
Epoch 301/1000 | Train Loss: 0.0290 | Val Loss: 0.0217
Epoch 401/1000 | Train Loss: 0.0204 | Val Loss: 0.0087
Epoch 501/1000 | Train Loss: 0.0170 | Val Loss: 0.0131
Epoch 601/1000 | Train Loss: 0.0128 | Val Loss: 0.0487
Epoch 701/1000 | Train Loss: 0.0249 | Val Loss: 0.0100
Epoch 801/1000 | Train Loss: 0.0080 | Val Loss: 0.0034
Epoch 901/1000 | Train Loss: 0.0053 | Val Loss: 0.0074


[I 2025-10-25 15:36:54,484] Trial 23 finished with value: 0.0029323463114581615 and parameters: {'n_layers': 3, 'n_units_l0': 85, 'n_units_l1': 91, 'n_units_l2': 81, 'lr': 0.045599546745334694, 'batch_size': 8, 'dropout_l0': 0.18084007574455735, 'dropout_l1': 0.42468004740346443, 'dropout_l2': 0.37794119193144793, 'weight_decay': 0.0001721520812857939, 'optimizer': 'sgd', 'stopping_patience': 232, 'activation_l0': 'leaky_relu', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 1.1348 | Val Loss: 1.0806
Epoch 101/1000 | Train Loss: 0.3338 | Val Loss: 0.2749
Epoch 201/1000 | Train Loss: 0.1737 | Val Loss: 0.1479
Epoch 301/1000 | Train Loss: 0.1896 | Val Loss: 0.0749
Epoch 401/1000 | Train Loss: 0.0726 | Val Loss: 0.0507
Epoch 501/1000 | Train Loss: 0.1113 | Val Loss: 0.0399
Epoch 601/1000 | Train Loss: 0.0762 | Val Loss: 0.0366
Epoch 701/1000 | Train Loss: 0.0572 | Val Loss: 0.0329
Epoch 801/1000 | Train Loss: 0.0479 | Val Loss: 0.0307
Epoch 901/1000 | Train Loss: 0.1134 | Val Loss: 0.0253


[I 2025-10-25 15:36:56,325] Trial 24 finished with value: 0.02390661194027548 and parameters: {'n_layers': 3, 'n_units_l0': 118, 'n_units_l1': 50, 'n_units_l2': 117, 'lr': 0.014267757545731318, 'batch_size': 16, 'dropout_l0': 0.3634887444916421, 'dropout_l1': 0.2754381515156892, 'dropout_l2': 0.1991684657711395, 'weight_decay': 0.000414576039734351, 'optimizer': 'sgd', 'stopping_patience': 214, 'activation_l0': 'leaky_relu', 'activation_l1': 'relu', 'activation_l2': 'sigmoid'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 1.2007 | Val Loss: 1.0469
Epoch 101/1000 | Train Loss: 0.0948 | Val Loss: 0.0189
Epoch 201/1000 | Train Loss: 0.0571 | Val Loss: 0.0164
Epoch 301/1000 | Train Loss: 0.0386 | Val Loss: 0.0171
Epoch 401/1000 | Train Loss: 0.0384 | Val Loss: 0.0103


[I 2025-10-25 15:36:57,939] Trial 25 finished with value: 0.032261015494640215 and parameters: {'n_layers': 3, 'n_units_l0': 104, 'n_units_l1': 111, 'n_units_l2': 81, 'lr': 0.09898982396237456, 'batch_size': 8, 'dropout_l0': 0.4964064623437656, 'dropout_l1': 0.33001855604110397, 'dropout_l2': 0.055715020275795196, 'weight_decay': 0.00011976749459109336, 'optimizer': 'sgd', 'stopping_patience': 163, 'activation_l0': 'leaky_relu', 'activation_l1': 'sigmoid', 'activation_l2': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Early stopping triggered.
Epoch   1/1000 | Train Loss: 0.9428 | Val Loss: 0.6657
Epoch 101/1000 | Train Loss: 0.0727 | Val Loss: 0.1641
Epoch 201/1000 | Train Loss: 0.0582 | Val Loss: 0.2180
Epoch 301/1000 | Train Loss: 0.0543 | Val Loss: 0.1104
Epoch 401/1000 | Train Loss: 0.0774 | Val Loss: 0.2120
Epoch 501/1000 | Train Loss: 0.0396 | Val Loss: 0.2548


[I 2025-10-25 15:37:00,015] Trial 26 finished with value: 0.244814990202367 and parameters: {'n_layers': 3, 'n_units_l0': 128, 'n_units_l1': 74, 'n_units_l2': 87, 'lr': 0.04148041290831846, 'batch_size': 8, 'dropout_l0': 0.4316909882227102, 'dropout_l1': 0.22084139078805784, 'dropout_l2': 0.3769361015524776, 'weight_decay': 2.948691555668425e-05, 'optimizer': 'momentum', 'stopping_patience': 318, 'activation_l0': 'tanh', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch 601/1000 | Train Loss: 0.0415 | Val Loss: 0.1843
Early stopping triggered.
Epoch   1/1000 | Train Loss: 1.0535 | Val Loss: 0.8241
Epoch 101/1000 | Train Loss: 0.1463 | Val Loss: 0.1733
Epoch 201/1000 | Train Loss: 0.1123 | Val Loss: 0.1270
Epoch 301/1000 | Train Loss: 0.1131 | Val Loss: 0.1048
Epoch 401/1000 | Train Loss: 0.1036 | Val Loss: 0.0997
Epoch 501/1000 | Train Loss: 0.0836 | Val Loss: 0.0944
Epoch 601/1000 | Train Loss: 0.0870 | Val Loss: 0.0790
Epoch 701/1000 | Train Loss: 0.0636 | Val Loss: 0.0756
Epoch 801/1000 | Train Loss: 0.0729 | Val Loss: 0.0703


[I 2025-10-25 15:37:02,039] Trial 27 finished with value: 0.06566249163056191 and parameters: {'n_layers': 2, 'n_units_l0': 92, 'n_units_l1': 114, 'lr': 0.010964500635359873, 'batch_size': 8, 'dropout_l0': 0.37966578855602645, 'dropout_l1': 0.4290554693505719, 'weight_decay': 0.006908539640298301, 'optimizer': 'sgd', 'stopping_patience': 120, 'activation_l0': 'relu', 'activation_l1': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch 901/1000 | Train Loss: 0.0903 | Val Loss: 0.0699
Epoch   1/1000 | Train Loss: 1.1086 | Val Loss: 0.9710
Epoch 101/1000 | Train Loss: 0.3844 | Val Loss: 0.1770
Epoch 201/1000 | Train Loss: 0.3321 | Val Loss: 0.1495
Epoch 301/1000 | Train Loss: 0.3022 | Val Loss: 0.1498
Epoch 401/1000 | Train Loss: 0.2096 | Val Loss: 0.0835
Epoch 501/1000 | Train Loss: 0.2059 | Val Loss: 0.0781
Epoch 601/1000 | Train Loss: 0.1733 | Val Loss: 0.0774
Epoch 701/1000 | Train Loss: 0.2235 | Val Loss: 0.0618
Epoch 801/1000 | Train Loss: 0.1819 | Val Loss: 0.0609
Epoch 901/1000 | Train Loss: 0.1483 | Val Loss: 0.0580


[I 2025-10-25 15:37:05,201] Trial 28 finished with value: 0.049998391487115434 and parameters: {'n_layers': 3, 'n_units_l0': 75, 'n_units_l1': 90, 'n_units_l2': 114, 'lr': 0.028215713227873487, 'batch_size': 8, 'dropout_l0': 0.32370263488861745, 'dropout_l1': 0.36154424711516175, 'dropout_l2': 0.3142262371342558, 'weight_decay': 0.0027060904393533614, 'optimizer': 'sgd', 'stopping_patience': 378, 'activation_l0': 'sigmoid', 'activation_l1': 'tanh', 'activation_l2': 'leaky_relu'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch   1/1000 | Train Loss: 1.0768 | Val Loss: 0.7698
Epoch 101/1000 | Train Loss: 0.0869 | Val Loss: 0.1478
Epoch 201/1000 | Train Loss: 0.0702 | Val Loss: 0.1308
Epoch 301/1000 | Train Loss: 0.0710 | Val Loss: 0.1283
Epoch 401/1000 | Train Loss: 0.0632 | Val Loss: 0.1171
Epoch 501/1000 | Train Loss: 0.0535 | Val Loss: 0.1146
Epoch 601/1000 | Train Loss: 0.0595 | Val Loss: 0.1079
Epoch 701/1000 | Train Loss: 0.0619 | Val Loss: 0.1068
Epoch 801/1000 | Train Loss: 0.0484 | Val Loss: 0.1033


[I 2025-10-25 15:37:05,896] Trial 29 finished with value: 0.10132341998757717 and parameters: {'n_layers': 1, 'n_units_l0': 106, 'lr': 0.05619399082307371, 'batch_size': 16, 'dropout_l0': 0.18854567921891188, 'weight_decay': 0.000767094921314456, 'optimizer': 'momentum', 'stopping_patience': 252, 'activation_l0': 'tanh'}. Best is trial 10 with value: 0.0008098299657081411.


Epoch 901/1000 | Train Loss: 0.0563 | Val Loss: 0.1007
Best trial:
  Value: 0.0008
  Params:
    n_layers: 3
    n_units_l0: 128
    n_units_l1: 125
    n_units_l2: 128
    lr: 0.08872966878731414
    batch_size: 8
    dropout_l0: 0.4719034296249297
    dropout_l1: 0.30899072246268755
    dropout_l2: 0.4933546876716215
    weight_decay: 5.1478572239525056e-05
    optimizer: sgd
    stopping_patience: 211
    activation_l0: leaky_relu
    activation_l1: tanh
    activation_l2: tanh
Epoch   1/1000 | Train Loss: 0.6496 | Val Loss: 0.4159
Epoch 101/1000 | Train Loss: 0.0746 | Val Loss: 0.0124
Epoch 201/1000 | Train Loss: 0.0598 | Val Loss: 0.0197
Epoch 301/1000 | Train Loss: 0.0113 | Val Loss: 0.0044
Epoch 401/1000 | Train Loss: 0.0380 | Val Loss: 0.0024
Epoch 501/1000 | Train Loss: 0.0196 | Val Loss: 0.0013
Epoch 601/1000 | Train Loss: 0.0070 | Val Loss: 0.0038
Epoch 701/1000 | Train Loss: 0.0079 | Val Loss: 0.0007
Epoch 801/1000 | Train Loss: 0.0032 | Val Loss: 0.0008
Epoch 901/1000 | Tr