In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Seed Python's and PyTorch's random number generators with the same fixed
# value so that repeated runs draw the same random numbers.
random.seed(42)
torch.manual_seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Data Preparation

def load_data(file_path, max_output_len=200, max_rows=10000, random_state=42):
    """Load input/output text pairs from an Excel sheet.

    Rows whose 'output' text is longer than ``max_output_len`` characters are
    dropped. If more than ``max_rows`` rows remain afterwards, a reproducible
    random sample of exactly ``max_rows`` rows is kept instead.

    Args:
        file_path: Path of the Excel file; it must provide the columns
            'input' and 'output'.
        max_output_len: Largest allowed length of an 'output' entry.
        max_rows: Upper bound on the number of returned pairs; ``None``
            disables the down-sampling.
        random_state: Seed handed to ``DataFrame.sample``.

    Returns:
        A tuple ``(inputs, outputs)`` of two lists with matching order.
    """
    df = pd.read_excel(file_path)

    # Keep only the rows whose 'output' has at most max_output_len characters
    df = df[df['output'].str.len() <= max_output_len]

    # Down-sample only when more rows survived the filter than requested
    if max_rows is not None and len(df) > max_rows:
        df = df.sample(n=max_rows, random_state=random_state)

    return df['input'].tolist(), df['output'].tolist()

# Tokenization and Vocabulary
class Vocabulary:
    """Character-level vocabulary: four special tokens plus ``string.printable``.

    Ids 0-3 belong to ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``; the
    printable characters follow in the order given by ``string.printable``.
    """

    def __init__(self):
        # Ids of the special tokens; they equal the positions of the
        # corresponding names in the list used by _build_vocab.
        self.pad_token, self.sos_token, self.eos_token, self.unk_token = 0, 1, 2, 3
        self.char2idx = {}
        self.idx2char = {}
        self._build_vocab()

    def _build_vocab(self):
        """Fill ``char2idx`` and its inverse mapping ``idx2char``."""
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + list(string.printable)

        forward = {}
        for position, symbol in enumerate(symbols):
            forward[symbol] = position

        self.char2idx = forward
        self.idx2char = {position: symbol for symbol, position in forward.items()}

    def __len__(self):
        """Return the number of entries (special tokens included)."""
        return len(self.char2idx)

    def encode(self, text):
        """Map every character of ``text`` to its id (unknown ones to ``unk_token``)."""
        lookup = self.char2idx
        fallback = self.unk_token
        return [lookup.get(symbol, fallback) for symbol in text]

    def decode(self, indices):
        """Turn ids back into text, dropping PAD/SOS/EOS ids.

        Ids without an entry in ``idx2char`` are rendered as ``'<UNK>'``.
        """
        skipped = {self.pad_token, self.sos_token, self.eos_token}
        pieces = []
        for idx in indices:
            if idx in skipped:
                continue
            pieces.append(self.idx2char.get(idx, '<UNK>'))
        return ''.join(pieces)

# Dataset Class
class CipherDataset(data.Dataset):
    """Dataset of (input, output) text pairs encoded as fixed-length id tensors.

    Every text is converted with ``str()``, encoded through ``vocab.encode``,
    wrapped in SOS/EOS ids and right-padded with the PAD id to ``max_length``.

    Args:
        inputs: Indexable collection of source texts.
        outputs: Indexable collection of target texts, aligned with ``inputs``.
        vocab: Object offering ``encode(text)`` plus the attributes
            ``sos_token``, ``eos_token`` and ``pad_token``.
        max_length: Length of every returned tensor.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.inputs)

    def _encode_fixed(self, text):
        """Encode one text as ``SOS + ids + EOS`` padded to ``max_length``."""
        # Truncate the text itself (leaving room for SOS and EOS) so an
        # over-long sample still ends with EOS rather than being cut mid-text.
        room = max(self.max_length - 2, 0)
        body = self.vocab.encode(str(text))[:room]

        tokens = [self.vocab.sos_token] + body + [self.vocab.eos_token]
        tokens += [self.vocab.pad_token] * (self.max_length - len(tokens))

        # The final slice only matters when max_length is smaller than 2
        return torch.tensor(tokens[:self.max_length], dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(input_ids, output_ids)`` for the pair at position ``idx``."""
        return self._encode_fixed(self.inputs[idx]), self._encode_fixed(self.outputs[idx])

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        # Feature width handled by each individual head
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V``.

        Positions where ``mask == 0`` get a score of -1e9 before the softmax,
        which makes their attention weight effectively zero.
        """
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = torch.softmax(scores, dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        n_batch, n_steps, _ = x.size()
        per_head = x.view(n_batch, n_steps, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to (batch, seq, d_model)."""
        n_batch, _, n_steps, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(n_batch, n_steps, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head and project the merged result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

class PositionWiseFeedForward(nn.Module):
    """Two linear layers with a ReLU in between: d_model -> d_ff -> d_model."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU, then project back to ``d_model``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class PositionalEncoding(nn.Module):
    """Adds fixed sine/cosine position signals to a sequence of embeddings.

    Even feature columns hold ``sin(position * div_term)`` and odd columns
    hold ``cos(position * div_term)``. The table is stored as the
    non-trainable buffer ``pe`` with shape (1, max_seq_length, d_model).

    Args:
        d_model: Feature width of the embeddings (even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_seq_length, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one odd column fewer than even columns, so the
        # cosine part only uses the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """Encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalisation."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1, self.norm2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is passed on to the self-attention."""
        # Sub-layer 1: self-attention with residual + norm
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward with residual + norm
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

class DecoderLayer(nn.Module):
    """Decoder block: self-attention, cross-attention over the encoder output
    and feed-forward, each followed by dropout, a residual connection and
    layer normalisation."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1, self.norm2, self.norm3 = (
            nn.LayerNorm(d_model),
            nn.LayerNorm(d_model),
            nn.LayerNorm(d_model),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run ``x`` through the block.

        ``tgt_mask`` is used by the self-attention, ``src_mask`` by the
        cross-attention whose keys and values are ``enc_output``.
        """
        # Sub-layer 1: self-attention over the decoder input
        self_context = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.norm1(x + self_context)

        # Sub-layer 2: attention from the decoder states to the encoder output
        cross_context = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + cross_context)

        # Sub-layer 3: position-wise feed-forward
        transformed = self.dropout(self.feed_forward(x))
        return self.norm3(x + transformed)

class Transformer(nn.Module):
    """Encoder-decoder Transformer producing logits over the target vocabulary.

    Args:
        src_vocab_size: Number of rows of the encoder embedding table.
        tgt_vocab_size: Number of rows of the decoder embedding table and
            width of the output logits.
        d_model: Feature width used throughout the model.
        num_heads: Attention heads per attention module.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_length: Number of positions the positional encoding covers.
        dropout: Dropout probability used in the embeddings and all layers.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the attention masks for a batch of id tensors.

        Token id 0 is treated as padding. Returns ``(src_mask, tgt_mask)``:
        ``src_mask`` has shape (batch, 1, 1, src_len) and marks non-padding
        source positions; ``tgt_mask`` has shape (batch, 1, tgt_len, tgt_len)
        and combines the non-padding target positions with a lower-triangular
        (no look-ahead) mask.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        # Create the causal mask on the device of the batch itself instead of
        # a module-level global, so the masks always live next to their data.
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size) for ``tgt`` given ``src``."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``train_loader``.

    The decoder is fed the target without its last token and is scored
    against the target without its first token. Gradients are clipped to a
    norm of 1.0 before every optimizer step.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for src, tgt in tqdm(train_loader, desc="Training"):
        src = src.to(device)
        tgt = tgt.to(device)
        decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)

        # Flatten to (tokens, vocab) vs. (tokens,) as the criterion expects
        flat_logits = logits.contiguous().view(-1, logits.size(-1))
        flat_expected = expected.contiguous().view(-1)
        loss = criterion(flat_logits, flat_expected)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Compute the mean per-batch loss of ``model`` on ``val_loader``.

    Runs in eval mode without gradient tracking, using the same shifted
    decoder-input / expected-output split as training.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc="Evaluating"):
            src = src.to(device)
            tgt = tgt.to(device)
            decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

            logits = model(src, decoder_input)
            flat_logits = logits.contiguous().view(-1, logits.size(-1))
            flat_expected = expected.contiguous().view(-1)
            running_loss += criterion(flat_logits, flat_expected).item()
    return running_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return the token-level accuracy of ``model`` over ``data_loader``.

    The model receives the target without its last token (teacher forcing);
    its arg-max predictions are compared with the target shifted by one.
    Positions whose expected id equals ``vocab.pad_token`` are not counted.
    Returns 0 when no position was counted.
    """
    model.eval()
    n_correct, n_counted = 0, 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            expected = tgt[:, 1:]
            predicted = model(src, tgt[:, :-1]).argmax(dim=-1)

            is_real = expected != vocab.pad_token
            n_correct += ((predicted == expected) & is_real).sum().item()
            n_counted += is_real.sum().item()
    if n_counted > 0:
        return n_correct / n_counted
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train ``model`` with early stopping and return its best validation loss.

    After every epoch the validation loss is passed to ``scheduler.step``.
    Training stops early once the validation loss has failed to improve for
    ``patience`` consecutive epochs. Before returning, the weights of the
    epoch with the lowest validation loss are loaded back into ``model``, so
    the model the caller holds matches the returned loss.

    Args:
        model: Module to train (modified in place).
        train_loader: Batches used for training.
        val_loader: Batches used for validation.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss function.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.

    Returns:
        The lowest validation loss seen (``inf`` if no epoch ran).
    """
    best_val_loss = float('inf')
    best_state = None  # snapshot of the weights at the best epoch
    epochs_no_improve = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Deep copy: state_dict() returns references to the live tensors,
            # which later epochs would otherwise overwrite.
            best_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print("Early stopping triggered!")
                break

    # Roll back to the best epoch; later epochs may have made the model worse
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss  # Return the best validation loss from this training run

# Global variables to track best model across all trials
# Nothing has been trained yet: no weights, no config, and an infinite loss so
# that the first finished trial always counts as an improvement.
best_overall_model = best_config = None
best_overall_loss = float("inf")

# Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one sampled configuration, return its validation loss.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``vocab``,
    ``max_length`` and ``device``. When the trial beats the best loss seen so
    far, the module-level best-trackers are updated and the model's
    ``state_dict`` (as it is when ``train_model`` returns) is saved to disk.
    """
    global best_overall_model, best_overall_loss, best_config

    # Keep the suggest_* calls in this order: it is the order in which the
    # trial samples the hyperparameters.
    config = dict(
        d_model=trial.suggest_categorical("d_model", [128, 256, 512]),
        num_heads=trial.suggest_categorical("num_heads", [2, 4, 8, 16]),
        num_layers=trial.suggest_categorical("num_layers", [6, 8, 10, 12]),
        d_ff=trial.suggest_categorical("d_ff", [256, 512, 1024]),
        dropout=trial.suggest_float("dropout", 0.1, 0.4),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        batch_size=trial.suggest_categorical("batch_size", [32]),
    )

    # Data loaders for the sampled batch size (only training data is shuffled)
    batch_size = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

    # Source and target share one vocabulary
    vocab_size = len(vocab)
    model = Transformer(
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        d_ff=config["d_ff"],
        max_seq_length=max_length,
        dropout=config["dropout"],
    )
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    current_val_loss = train_model(
        model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10
    )

    # Record and checkpoint this trial if it is the best one so far
    if current_val_loss < best_overall_loss:
        best_overall_loss = current_val_loss
        best_overall_model = copy.deepcopy(model.state_dict())
        best_config = config
        torch.save(best_overall_model, '/content/drive/MyDrive/best_vig_key_100.pth')
        print(f"New best model found! Val Loss: {current_val_loss:.4f}")
        print(f"Config: {config}")

    return current_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for ``text`` and return it as a string.

    ``text`` is encoded as SOS + ids + EOS, padded/truncated to ``max_length``
    and fed to the model together with the tokens generated so far. At each
    step the most likely next token is appended; decoding stops at EOS or
    after ``max_length - 1`` steps.
    """
    model.eval()
    with torch.no_grad():
        # Source sequence: fixed length, batch dimension of 1
        token_ids = [vocab.sos_token] + vocab.encode(str(text)) + [vocab.eos_token]
        shortfall = max_length - len(token_ids)
        token_ids = token_ids + [vocab.pad_token] * shortfall
        source = torch.tensor(token_ids[:max_length]).unsqueeze(0).to(device)

        # The generated sequence starts with SOS only
        generated = torch.tensor([[vocab.sos_token]]).to(device)

        for _step in range(max_length - 1):
            logits = model(source, generated)
            # Arg-max over the vocabulary at the last generated position
            next_token = logits.argmax(2)[:, -1].item()
            if next_token == vocab.eos_token:
                break
            appended = torch.tensor([[next_token]]).to(device)
            generated = torch.cat([generated, appended], dim=1)

        return vocab.decode(generated[0].cpu().numpy())

# Main Execution
# Script entry point: load the data, run the Optuna search, reload the best
# checkpoint, report loss/accuracy and print a few sample decryptions.
# NOTE: vocab, max_length, train_dataset and val_dataset are created here as
# module-level names because objective() reads them as globals.
if __name__ == "__main__":
    # Load and prepare data
    inputs, outputs = load_data('/content/Full_training_vigenere_100.xlsx')
    vocab = Vocabulary()
    max_length = 256  # Fixed length of every encoded sequence (incl. SOS/EOS/padding)

    # Split data (80% train / 20% validation, fixed seed)
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)  # 100 trials, no time limit set

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Load the best model found during the search
    # (best_config and the checkpoint file are both written by objective())
    final_model = Transformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    final_model.load_state_dict(torch.load('/content/drive/MyDrive/best_vig_key_100.pth'))

    # Evaluate on full datasets
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    # Padding positions are excluded from the loss
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption: (prompt, expected plaintext) pairs
    # NOTE(review): these prompts mention a Caesar cipher while the training
    # file name says "vigenere" - confirm the prompts match the training data.
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher:xibu", "what")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cuda


[I 2025-05-10 08:09:56,649] A new study created in memory with name: no-name-2754dbb6-d626-49a9-a2d4-18b5ff96e2e3
Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.57it/s]


Epoch 1/10:
Train Loss: 2.3669 | Val Loss: 2.1154


Training: 100%|██████████| 250/250 [01:17<00:00,  3.25it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.42it/s]


Epoch 2/10:
Train Loss: 2.0333 | Val Loss: 1.8936


Training: 100%|██████████| 250/250 [01:17<00:00,  3.22it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.46it/s]


Epoch 3/10:
Train Loss: 1.8845 | Val Loss: 1.7908


Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.47it/s]


Epoch 4/10:
Train Loss: 1.7757 | Val Loss: 1.7112


Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.46it/s]


Epoch 5/10:
Train Loss: 1.6924 | Val Loss: 1.6816


Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.46it/s]


Epoch 6/10:
Train Loss: 1.6148 | Val Loss: 1.6518


Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.43it/s]


Epoch 7/10:
Train Loss: 1.5435 | Val Loss: 1.5542


Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.46it/s]


Epoch 8/10:
Train Loss: 1.4875 | Val Loss: 1.4919


Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.44it/s]


Epoch 9/10:
Train Loss: 1.4373 | Val Loss: 1.5293


Training: 100%|██████████| 250/250 [01:17<00:00,  3.23it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.46it/s]


Epoch 10/10:
Train Loss: 1.3904 | Val Loss: 1.4845


[I 2025-05-10 08:24:12,658] Trial 0 finished with value: 1.484502767759656 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.2966364633060481, 'learning_rate': 0.00020176873820183198, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


New best model found! Val Loss: 1.4845
Config: {'d_model': 256, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.2966364633060481, 'learning_rate': 0.00020176873820183198, 'batch_size': 32}


Training: 100%|██████████| 250/250 [02:19<00:00,  1.80it/s]
Evaluating: 100%|██████████| 63/63 [00:12<00:00,  5.12it/s]


Epoch 1/10:
Train Loss: 3.0377 | Val Loss: 2.9815


Training: 100%|██████████| 250/250 [02:19<00:00,  1.80it/s]
Evaluating: 100%|██████████| 63/63 [00:12<00:00,  5.12it/s]


Epoch 2/10:
Train Loss: 2.9833 | Val Loss: 2.9731


Training: 100%|██████████| 250/250 [02:19<00:00,  1.80it/s]
Evaluating: 100%|██████████| 63/63 [00:12<00:00,  5.13it/s]


Epoch 3/10:
Train Loss: 2.9793 | Val Loss: 2.9774


Training: 100%|██████████| 250/250 [02:19<00:00,  1.80it/s]
Evaluating: 100%|██████████| 63/63 [00:12<00:00,  5.12it/s]


Epoch 4/10:
Train Loss: 2.9780 | Val Loss: 2.9751


Training: 100%|██████████| 250/250 [02:19<00:00,  1.80it/s]
Evaluating: 100%|██████████| 63/63 [00:12<00:00,  5.11it/s]
[I 2025-05-10 08:36:50,368] Trial 1 finished with value: 2.973124379203433 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.20462664442333417, 'learning_rate': 0.0025562692606405364, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 5/10:
Train Loss: 2.9780 | Val Loss: 2.9773
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.10it/s]


Epoch 1/10:
Train Loss: 3.0366 | Val Loss: 2.9844


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.13it/s]


Epoch 2/10:
Train Loss: 2.9821 | Val Loss: 2.9807


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.12it/s]


Epoch 3/10:
Train Loss: 2.9797 | Val Loss: 2.9817


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.12it/s]


Epoch 4/10:
Train Loss: 2.9788 | Val Loss: 2.9864


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.10it/s]
[I 2025-05-10 08:45:02,493] Trial 2 finished with value: 2.9807134658571273 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.29871733644054077, 'learning_rate': 0.002888508309136799, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 5/10:
Train Loss: 2.9776 | Val Loss: 2.9839
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.09it/s]


Epoch 1/10:
Train Loss: 3.0421 | Val Loss: 2.9805


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.09it/s]


Epoch 2/10:
Train Loss: 2.9814 | Val Loss: 2.9793


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.10it/s]


Epoch 3/10:
Train Loss: 2.9790 | Val Loss: 2.9759


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.10it/s]


Epoch 4/10:
Train Loss: 2.9782 | Val Loss: 2.9739


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.09it/s]


Epoch 5/10:
Train Loss: 2.9775 | Val Loss: 2.9823


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.10it/s]


Epoch 6/10:
Train Loss: 2.9775 | Val Loss: 2.9745


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.09it/s]


Epoch 7/10:
Train Loss: 2.9771 | Val Loss: 2.9719


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.10it/s]


Epoch 8/10:
Train Loss: 2.9774 | Val Loss: 2.9803


Training: 100%|██████████| 250/250 [01:11<00:00,  3.48it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.09it/s]


Epoch 9/10:
Train Loss: 2.9770 | Val Loss: 2.9778


Training: 100%|██████████| 250/250 [01:11<00:00,  3.49it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.08it/s]
[I 2025-05-10 08:58:03,063] Trial 3 finished with value: 2.9719270978655135 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.1852403499016168, 'learning_rate': 0.003485986779150211, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 10/10:
Train Loss: 2.9769 | Val Loss: 2.9765
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:26<00:00,  9.39it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.23it/s]


Epoch 1/10:
Train Loss: 2.3962 | Val Loss: 2.1132


Training: 100%|██████████| 250/250 [00:26<00:00,  9.39it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.12it/s]


Epoch 2/10:
Train Loss: 2.0750 | Val Loss: 1.9553


Training: 100%|██████████| 250/250 [00:26<00:00,  9.38it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.19it/s]


Epoch 3/10:
Train Loss: 1.9419 | Val Loss: 1.8316


Training: 100%|██████████| 250/250 [00:26<00:00,  9.39it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.18it/s]


Epoch 4/10:
Train Loss: 1.8495 | Val Loss: 1.7778


Training: 100%|██████████| 250/250 [00:26<00:00,  9.38it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 24.95it/s]


Epoch 5/10:
Train Loss: 1.7804 | Val Loss: 1.9126


Training: 100%|██████████| 250/250 [00:26<00:00,  9.38it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.21it/s]


Epoch 6/10:
Train Loss: 1.7225 | Val Loss: 2.6794


Training: 100%|██████████| 250/250 [00:26<00:00,  9.39it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.19it/s]
[I 2025-05-10 09:01:27,164] Trial 4 finished with value: 1.7778317417417253 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.32800557407251263, 'learning_rate': 0.0009817991948098657, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 7/10:
Train Loss: 1.6748 | Val Loss: 2.9178
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:14<00:00,  1.85it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.31it/s]


Epoch 1/10:
Train Loss: 3.0290 | Val Loss: 2.9836


Training: 100%|██████████| 250/250 [02:14<00:00,  1.85it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.31it/s]


Epoch 2/10:
Train Loss: 2.9768 | Val Loss: 3.3479


Training: 100%|██████████| 250/250 [02:14<00:00,  1.85it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.31it/s]


Epoch 3/10:
Train Loss: 2.9559 | Val Loss: 3.5862


Training: 100%|██████████| 250/250 [02:14<00:00,  1.85it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.31it/s]
[I 2025-05-10 09:11:14,644] Trial 5 finished with value: 2.9836372685810875 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.3058452252894319, 'learning_rate': 0.0009740196171500149, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 4/10:
Train Loss: 2.9474 | Val Loss: 3.4749
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.09it/s]


Epoch 1/10:
Train Loss: 3.0285 | Val Loss: 2.9806


Training: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.10it/s]


Epoch 2/10:
Train Loss: 2.9842 | Val Loss: 2.9761


Training: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.08it/s]


Epoch 3/10:
Train Loss: 2.9766 | Val Loss: 3.2609


Training: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.11it/s]


Epoch 4/10:
Train Loss: 2.9750 | Val Loss: 3.1172


Training: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.09it/s]
[I 2025-05-10 09:19:25,265] Trial 6 finished with value: 2.9760981438651917 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1272357614915907, 'learning_rate': 0.0016821865235081817, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 5/10:
Train Loss: 2.9742 | Val Loss: 3.0248
Early stopping triggered!


Training: 100%|██████████| 250/250 [03:42<00:00,  1.12it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.09it/s]


Epoch 1/10:
Train Loss: 3.0379 | Val Loss: 3.0105


Training: 100%|██████████| 250/250 [03:42<00:00,  1.13it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.09it/s]


Epoch 2/10:
Train Loss: 2.9827 | Val Loss: 2.9880


Training: 100%|██████████| 250/250 [03:42<00:00,  1.12it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s]


Epoch 3/10:
Train Loss: 2.9795 | Val Loss: 2.9791


Training: 100%|██████████| 250/250 [03:42<00:00,  1.12it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s]


Epoch 4/10:
Train Loss: 2.9783 | Val Loss: 2.9767


Training: 100%|██████████| 250/250 [03:42<00:00,  1.13it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s]


Epoch 5/10:
Train Loss: 2.9775 | Val Loss: 2.9734


Training: 100%|██████████| 250/250 [03:42<00:00,  1.12it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s]


Epoch 6/10:
Train Loss: 2.9773 | Val Loss: 2.9714


Training: 100%|██████████| 250/250 [03:42<00:00,  1.12it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s]


Epoch 7/10:
Train Loss: 2.9768 | Val Loss: 2.9717


Training: 100%|██████████| 250/250 [03:42<00:00,  1.12it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.09it/s]


Epoch 8/10:
Train Loss: 2.9766 | Val Loss: 2.9714


Training: 100%|██████████| 250/250 [03:42<00:00,  1.13it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s]


Epoch 9/10:
Train Loss: 2.9766 | Val Loss: 2.9726


Training: 100%|██████████| 250/250 [03:42<00:00,  1.13it/s]
Evaluating: 100%|██████████| 63/63 [00:20<00:00,  3.09it/s]
[I 2025-05-10 09:59:52,399] Trial 7 finished with value: 2.9702019010271346 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.3626828900386475, 'learning_rate': 0.009058950183690925, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 10/10:
Train Loss: 2.9749 | Val Loss: 2.9702


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.92it/s]


Epoch 1/10:
Train Loss: 3.0113 | Val Loss: 2.9843


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.91it/s]


Epoch 2/10:
Train Loss: 2.9799 | Val Loss: 3.3799


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.92it/s]


Epoch 3/10:
Train Loss: 2.9731 | Val Loss: 3.5098


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.91it/s]
[I 2025-05-10 10:12:07,317] Trial 8 finished with value: 2.984304507573446 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.19329779092639912, 'learning_rate': 0.005860428675820683, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 4/10:
Train Loss: 2.9684 | Val Loss: 3.7964
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:18<00:00,  3.20it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.34it/s]


Epoch 1/10:
Train Loss: 3.0006 | Val Loss: 3.2914


Training: 100%|██████████| 250/250 [01:18<00:00,  3.20it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.30it/s]


Epoch 2/10:
Train Loss: 2.9259 | Val Loss: 3.6831


Training: 100%|██████████| 250/250 [01:18<00:00,  3.20it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.33it/s]


Epoch 3/10:
Train Loss: 2.9117 | Val Loss: 3.6537


Training: 100%|██████████| 250/250 [01:18<00:00,  3.20it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.32it/s]
[I 2025-05-10 10:17:50,393] Trial 9 finished with value: 3.291425523303804 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.2674087143113475, 'learning_rate': 0.0025505874230878508, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 4/10:
Train Loss: 2.9031 | Val Loss: 3.6522
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.24it/s]


Epoch 1/10:
Train Loss: 2.4838 | Val Loss: 2.2541


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.20it/s]


Epoch 2/10:
Train Loss: 2.1470 | Val Loss: 2.0353


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 3/10:
Train Loss: 2.0233 | Val Loss: 1.9884


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.20it/s]


Epoch 4/10:
Train Loss: 1.9249 | Val Loss: 1.8751


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.20it/s]


Epoch 5/10:
Train Loss: 1.8420 | Val Loss: 1.7959


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]


Epoch 6/10:
Train Loss: 1.7801 | Val Loss: 1.7602


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.24it/s]


Epoch 7/10:
Train Loss: 1.7175 | Val Loss: 1.7282


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.19it/s]


Epoch 8/10:
Train Loss: 1.6681 | Val Loss: 1.6792


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.24it/s]


Epoch 9/10:
Train Loss: 1.6270 | Val Loss: 1.6297


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.24it/s]
[I 2025-05-10 10:30:46,023] Trial 10 finished with value: 1.5771003450666154 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.3841265742276746, 'learning_rate': 0.00015249578275600804, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 10/10:
Train Loss: 1.5842 | Val Loss: 1.5771


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.23it/s]


Epoch 1/10:
Train Loss: 2.5056 | Val Loss: 2.2305


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.18it/s]


Epoch 2/10:
Train Loss: 2.1620 | Val Loss: 2.0441


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 3/10:
Train Loss: 2.0379 | Val Loss: 1.9586


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]


Epoch 4/10:
Train Loss: 1.9416 | Val Loss: 1.8705


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.19it/s]


Epoch 5/10:
Train Loss: 1.8627 | Val Loss: 1.8564


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 6/10:
Train Loss: 1.7985 | Val Loss: 1.7726


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 7/10:
Train Loss: 1.7438 | Val Loss: 1.7921


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.20it/s]


Epoch 8/10:
Train Loss: 1.6935 | Val Loss: 1.6720


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 9/10:
Train Loss: 1.6438 | Val Loss: 1.6551


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.20it/s]
[I 2025-05-10 10:43:42,006] Trial 11 finished with value: 1.626204114111643 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.389654932237729, 'learning_rate': 0.0001389216907519464, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 10/10:
Train Loss: 1.6027 | Val Loss: 1.6262


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.20it/s]


Epoch 1/10:
Train Loss: 2.5511 | Val Loss: 2.2284


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.13it/s]


Epoch 2/10:
Train Loss: 2.1943 | Val Loss: 2.1728


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.16it/s]


Epoch 3/10:
Train Loss: 2.0891 | Val Loss: 1.9907


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.15it/s]


Epoch 4/10:
Train Loss: 1.9987 | Val Loss: 1.9947


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 5/10:
Train Loss: 1.9194 | Val Loss: 1.8940


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 6/10:
Train Loss: 1.8582 | Val Loss: 1.8352


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]


Epoch 7/10:
Train Loss: 1.8049 | Val Loss: 1.8130


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.17it/s]


Epoch 8/10:
Train Loss: 1.7575 | Val Loss: 1.7585


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 9/10:
Train Loss: 1.7171 | Val Loss: 1.7932


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.19it/s]
[I 2025-05-10 10:56:38,442] Trial 12 finished with value: 1.7199373093862382 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.3935184805520823, 'learning_rate': 0.000112750159580207, 'batch_size': 32}. Best is trial 0 with value: 1.484502767759656.


Epoch 10/10:
Train Loss: 1.6730 | Val Loss: 1.7199


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.18it/s]


Epoch 1/10:
Train Loss: 2.3249 | Val Loss: 2.0260


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.23it/s]


Epoch 2/10:
Train Loss: 1.9578 | Val Loss: 1.8353


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]


Epoch 3/10:
Train Loss: 1.7982 | Val Loss: 1.7028


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.17it/s]


Epoch 4/10:
Train Loss: 1.6950 | Val Loss: 1.6587


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]


Epoch 5/10:
Train Loss: 1.6065 | Val Loss: 1.6088


Training: 100%|██████████| 250/250 [01:10<00:00,  3.53it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]


Epoch 6/10:
Train Loss: 1.5253 | Val Loss: 1.4916


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.23it/s]


Epoch 7/10:
Train Loss: 1.4528 | Val Loss: 1.4356


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.21it/s]


Epoch 8/10:
Train Loss: 1.3935 | Val Loss: 1.4190


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]


Epoch 9/10:
Train Loss: 1.3371 | Val Loss: 1.3519


Training: 100%|██████████| 250/250 [01:10<00:00,  3.54it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.22it/s]
[I 2025-05-10 11:09:34,231] Trial 13 finished with value: 1.3519311045843458 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.25037348090760403, 'learning_rate': 0.00031019581158735974, 'batch_size': 32}. Best is trial 13 with value: 1.3519311045843458.


Epoch 10/10:
Train Loss: 1.2902 | Val Loss: 1.3751
New best model found! Val Loss: 1.3519
Config: {'d_model': 256, 'num_heads': 8, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.25037348090760403, 'learning_rate': 0.00031019581158735974, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:51<00:00,  4.89it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.43it/s]


Epoch 1/10:
Train Loss: 2.3595 | Val Loss: 2.0519


Training: 100%|██████████| 250/250 [00:51<00:00,  4.90it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.34it/s]


Epoch 2/10:
Train Loss: 1.9613 | Val Loss: 1.8280


Training: 100%|██████████| 250/250 [00:51<00:00,  4.90it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.43it/s]


Epoch 3/10:
Train Loss: 1.8123 | Val Loss: 1.7161


Training: 100%|██████████| 250/250 [00:51<00:00,  4.90it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.43it/s]


Epoch 4/10:
Train Loss: 1.7110 | Val Loss: 1.6377


Training: 100%|██████████| 250/250 [00:51<00:00,  4.90it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.43it/s]


Epoch 5/10:
Train Loss: 1.6259 | Val Loss: 1.5786


Training: 100%|██████████| 250/250 [00:50<00:00,  4.90it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.40it/s]


Epoch 6/10:
Train Loss: 1.5568 | Val Loss: 1.5257


Training: 100%|██████████| 250/250 [00:51<00:00,  4.89it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.39it/s]


Epoch 7/10:
Train Loss: 1.4966 | Val Loss: 1.4741


Training: 100%|██████████| 250/250 [00:50<00:00,  4.91it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.45it/s]


Epoch 8/10:
Train Loss: 1.4436 | Val Loss: 1.4389


Training: 100%|██████████| 250/250 [00:51<00:00,  4.90it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.41it/s]


Epoch 9/10:
Train Loss: 1.3968 | Val Loss: 1.4107


Training: 100%|██████████| 250/250 [00:50<00:00,  4.91it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.49it/s]
[I 2025-05-10 11:18:51,781] Trial 14 finished with value: 1.3811682008561634 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.24734416029990502, 'learning_rate': 0.0003627512066853219, 'batch_size': 32}. Best is trial 13 with value: 1.3519311045843458.


Epoch 10/10:
Train Loss: 1.3515 | Val Loss: 1.3812


Training: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.41it/s]


Epoch 1/10:
Train Loss: 2.2809 | Val Loss: 2.0420


Training: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.52it/s]


Epoch 2/10:
Train Loss: 1.9147 | Val Loss: 1.8156


Training: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.53it/s]


Epoch 3/10:
Train Loss: 1.7520 | Val Loss: 1.7097


Training: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.44it/s]


Epoch 4/10:
Train Loss: 1.6306 | Val Loss: 1.6046


Training: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.51it/s]


Epoch 5/10:
Train Loss: 1.5396 | Val Loss: 1.5033


Training: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.54it/s]


Epoch 6/10:
Train Loss: 1.4641 | Val Loss: 1.4146


Training: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.37it/s]


Epoch 7/10:
Train Loss: 1.3985 | Val Loss: 1.3957


Training: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.51it/s]


Epoch 8/10:
Train Loss: 1.3378 | Val Loss: 1.3225


Training: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.51it/s]


Epoch 9/10:
Train Loss: 1.2859 | Val Loss: 1.2609


Training: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.50it/s]
[I 2025-05-10 11:25:53,749] Trial 15 finished with value: 1.232889525474064 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.2417464424398553, 'learning_rate': 0.0003746413622623214, 'batch_size': 32}. Best is trial 15 with value: 1.232889525474064.


Epoch 10/10:
Train Loss: 1.2392 | Val Loss: 1.2329
New best model found! Val Loss: 1.2329
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.2417464424398553, 'learning_rate': 0.0003746413622623214, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:32<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.31it/s]


Epoch 1/10:
Train Loss: 2.2750 | Val Loss: 1.9689


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.31it/s]


Epoch 2/10:
Train Loss: 1.9067 | Val Loss: 1.7781


Training: 100%|██████████| 250/250 [00:32<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.36it/s]


Epoch 3/10:
Train Loss: 1.7394 | Val Loss: 1.6921


Training: 100%|██████████| 250/250 [00:32<00:00,  7.59it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.37it/s]


Epoch 4/10:
Train Loss: 1.6220 | Val Loss: 1.5596


Training: 100%|██████████| 250/250 [00:32<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.33it/s]


Epoch 5/10:
Train Loss: 1.5303 | Val Loss: 1.5250


Training: 100%|██████████| 250/250 [00:32<00:00,  7.59it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.36it/s]


Epoch 6/10:
Train Loss: 1.4558 | Val Loss: 1.3996


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.27it/s]


Epoch 7/10:
Train Loss: 1.3856 | Val Loss: 1.3733


Training: 100%|██████████| 250/250 [00:33<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.27it/s]


Epoch 8/10:
Train Loss: 1.3303 | Val Loss: 1.2804


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.30it/s]


Epoch 9/10:
Train Loss: 1.2724 | Val Loss: 1.2635


Training: 100%|██████████| 250/250 [00:32<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.33it/s]
[I 2025-05-10 11:31:54,910] Trial 16 finished with value: 1.1857713025713723 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.23664261006941711, 'learning_rate': 0.0004415813104175285, 'batch_size': 32}. Best is trial 16 with value: 1.1857713025713723.


Epoch 10/10:
Train Loss: 1.2229 | Val Loss: 1.1858
New best model found! Val Loss: 1.1858
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.23664261006941711, 'learning_rate': 0.0004415813104175285, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.57it/s]


Epoch 1/10:
Train Loss: 2.3060 | Val Loss: 2.0523


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.53it/s]


Epoch 2/10:
Train Loss: 1.9213 | Val Loss: 1.7956


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.61it/s]


Epoch 3/10:
Train Loss: 1.7338 | Val Loss: 1.6216


Training: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.64it/s]


Epoch 4/10:
Train Loss: 1.6107 | Val Loss: 1.5365


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.62it/s]


Epoch 5/10:
Train Loss: 1.5155 | Val Loss: 1.4545


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.57it/s]


Epoch 6/10:
Train Loss: 1.4336 | Val Loss: 1.4317


Training: 100%|██████████| 250/250 [00:23<00:00, 10.71it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.51it/s]


Epoch 7/10:
Train Loss: 1.3679 | Val Loss: 1.3261


Training: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.40it/s]


Epoch 8/10:
Train Loss: 1.3051 | Val Loss: 1.2823


Training: 100%|██████████| 250/250 [00:23<00:00, 10.71it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.64it/s]


Epoch 9/10:
Train Loss: 1.2526 | Val Loss: 1.2213


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.63it/s]
[I 2025-05-10 11:36:10,864] Trial 17 finished with value: 1.205317082859221 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10648203488475591, 'learning_rate': 0.000550520386756566, 'batch_size': 32}. Best is trial 16 with value: 1.1857713025713723.


Epoch 10/10:
Train Loss: 1.2049 | Val Loss: 1.2053


Training: 100%|██████████| 250/250 [00:23<00:00, 10.71it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.35it/s]


Epoch 1/10:
Train Loss: 2.3295 | Val Loss: 2.0296


Training: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.61it/s]


Epoch 2/10:
Train Loss: 1.9429 | Val Loss: 1.8053


Training: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.40it/s]


Epoch 3/10:
Train Loss: 1.7554 | Val Loss: 1.6744


Training: 100%|██████████| 250/250 [00:23<00:00, 10.66it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.42it/s]


Epoch 4/10:
Train Loss: 1.6373 | Val Loss: 1.5791


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.54it/s]


Epoch 5/10:
Train Loss: 1.5447 | Val Loss: 1.5188


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.47it/s]


Epoch 6/10:
Train Loss: 1.4670 | Val Loss: 1.4116


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.57it/s]


Epoch 7/10:
Train Loss: 1.4013 | Val Loss: 1.3536


Training: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.61it/s]


Epoch 8/10:
Train Loss: 1.3462 | Val Loss: 1.3207


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.55it/s]


Epoch 9/10:
Train Loss: 1.2968 | Val Loss: 1.2698


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.34it/s]
[I 2025-05-10 11:40:27,022] Trial 18 finished with value: 1.2462230901869515 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.11236849941807808, 'learning_rate': 0.00047211273962923314, 'batch_size': 32}. Best is trial 16 with value: 1.1857713025713723.


Epoch 10/10:
Train Loss: 1.2502 | Val Loss: 1.2462


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.56it/s]


Epoch 1/10:
Train Loss: 2.3333 | Val Loss: 2.0210


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.54it/s]


Epoch 2/10:
Train Loss: 1.9528 | Val Loss: 1.8115


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.63it/s]


Epoch 3/10:
Train Loss: 1.7718 | Val Loss: 1.6521


Training: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.61it/s]


Epoch 4/10:
Train Loss: 1.6545 | Val Loss: 1.5811


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.62it/s]


Epoch 5/10:
Train Loss: 1.5650 | Val Loss: 1.5071


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.49it/s]


Epoch 6/10:
Train Loss: 1.4878 | Val Loss: 1.4308


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.54it/s]


Epoch 7/10:
Train Loss: 1.4251 | Val Loss: 1.3698


Training: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.47it/s]


Epoch 8/10:
Train Loss: 1.3641 | Val Loss: 1.3196


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.52it/s]


Epoch 9/10:
Train Loss: 1.3169 | Val Loss: 1.2576


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.61it/s]
[I 2025-05-10 11:44:43,088] Trial 19 finished with value: 1.235966824349903 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1508363527528604, 'learning_rate': 0.0005958421572690186, 'batch_size': 32}. Best is trial 16 with value: 1.1857713025713723.


Epoch 10/10:
Train Loss: 1.2685 | Val Loss: 1.2360


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.60it/s]


Epoch 1/10:
Train Loss: 2.3033 | Val Loss: 1.9965


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.48it/s]


Epoch 2/10:
Train Loss: 1.9055 | Val Loss: 1.7887


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.58it/s]


Epoch 3/10:
Train Loss: 1.7465 | Val Loss: 1.6863


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.52it/s]


Epoch 4/10:
Train Loss: 1.6268 | Val Loss: 1.5678


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.61it/s]


Epoch 5/10:
Train Loss: 1.5364 | Val Loss: 1.4686


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.44it/s]


Epoch 6/10:
Train Loss: 1.4651 | Val Loss: 1.4502


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.54it/s]


Epoch 7/10:
Train Loss: 1.4016 | Val Loss: 1.3565


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.58it/s]


Epoch 8/10:
Train Loss: 1.3449 | Val Loss: 1.3170


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.68it/s]


Epoch 9/10:
Train Loss: 1.2994 | Val Loss: 1.2803


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.39it/s]
[I 2025-05-10 11:48:59,199] Trial 20 finished with value: 1.2506908859525407 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.13738123588159729, 'learning_rate': 0.0006862358666004544, 'batch_size': 32}. Best is trial 16 with value: 1.1857713025713723.


Epoch 10/10:
Train Loss: 1.2565 | Val Loss: 1.2507


Training: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.64it/s]


Epoch 1/10:
Train Loss: 2.3093 | Val Loss: 2.0766


Training: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.59it/s]


Epoch 2/10:
Train Loss: 1.9382 | Val Loss: 1.8300


Training: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.63it/s]


Epoch 3/10:
Train Loss: 1.7625 | Val Loss: 1.6833


Training: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.51it/s]


Epoch 4/10:
Train Loss: 1.6442 | Val Loss: 1.6055


Training: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.56it/s]


Epoch 5/10:
Train Loss: 1.5505 | Val Loss: 1.5083


Training: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.55it/s]


Epoch 6/10:
Train Loss: 1.4765 | Val Loss: 1.4004


Training: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.66it/s]


Epoch 7/10:
Train Loss: 1.4052 | Val Loss: 1.3630


Training: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.56it/s]


Epoch 8/10:
Train Loss: 1.3447 | Val Loss: 1.3117


Training: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.67it/s]


Epoch 9/10:
Train Loss: 1.2839 | Val Loss: 1.2599


Training: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.55it/s]
[I 2025-05-10 11:55:58,983] Trial 21 finished with value: 1.2146695946890211 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.21340817141939178, 'learning_rate': 0.00024742182656037375, 'batch_size': 32}. Best is trial 16 with value: 1.1857713025713723.


Epoch 10/10:
Train Loss: 1.2364 | Val Loss: 1.2147


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.53it/s]


Epoch 1/10:
Train Loss: 2.4925 | Val Loss: 2.1403


Training: 100%|██████████| 250/250 [00:23<00:00, 10.65it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.54it/s]


Epoch 2/10:
Train Loss: 2.1000 | Val Loss: 1.9938


Training: 100%|██████████| 250/250 [00:23<00:00, 10.66it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.51it/s]


Epoch 3/10:
Train Loss: 1.9585 | Val Loss: 1.8488


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.49it/s]


Epoch 4/10:
Train Loss: 1.8496 | Val Loss: 1.7634


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.54it/s]


Epoch 5/10:
Train Loss: 1.7654 | Val Loss: 1.6799


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.45it/s]


Epoch 6/10:
Train Loss: 1.6972 | Val Loss: 1.6099


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.55it/s]


Epoch 7/10:
Train Loss: 1.6369 | Val Loss: 1.5655


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.41it/s]


Epoch 8/10:
Train Loss: 1.5855 | Val Loss: 1.5059


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.46it/s]


Epoch 9/10:
Train Loss: 1.5438 | Val Loss: 1.4837


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.58it/s]
[I 2025-05-10 12:00:15,471] Trial 22 finished with value: 1.463464615836976 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.16784634237486684, 'learning_rate': 0.000239943786492506, 'batch_size': 32}. Best is trial 16 with value: 1.1857713025713723.


Epoch 10/10:
Train Loss: 1.5036 | Val Loss: 1.4635


Training: 100%|██████████| 250/250 [00:32<00:00,  7.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.50it/s]


Epoch 1/10:
Train Loss: 2.2318 | Val Loss: 1.9034


Training: 100%|██████████| 250/250 [00:32<00:00,  7.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.40it/s]


Epoch 2/10:
Train Loss: 1.7919 | Val Loss: 1.6897


Training: 100%|██████████| 250/250 [00:32<00:00,  7.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.44it/s]


Epoch 3/10:
Train Loss: 1.6291 | Val Loss: 1.5709


Training: 100%|██████████| 250/250 [00:32<00:00,  7.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.41it/s]


Epoch 4/10:
Train Loss: 1.5012 | Val Loss: 1.4409


Training: 100%|██████████| 250/250 [00:32<00:00,  7.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.41it/s]


Epoch 5/10:
Train Loss: 1.3945 | Val Loss: 1.3958


Training: 100%|██████████| 250/250 [00:32<00:00,  7.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.40it/s]


Epoch 6/10:
Train Loss: 1.3032 | Val Loss: 1.2938


Training: 100%|██████████| 250/250 [00:32<00:00,  7.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.39it/s]


Epoch 7/10:
Train Loss: 1.2307 | Val Loss: 1.2512


Training: 100%|██████████| 250/250 [00:32<00:00,  7.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.38it/s]


Epoch 8/10:
Train Loss: 1.1632 | Val Loss: 1.1885


Training: 100%|██████████| 250/250 [00:32<00:00,  7.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.43it/s]


Epoch 9/10:
Train Loss: 1.1039 | Val Loss: 1.1480


Training: 100%|██████████| 250/250 [00:32<00:00,  7.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.49it/s]


Epoch 10/10:
Train Loss: 1.0489 | Val Loss: 1.1274


[I 2025-05-10 12:06:15,424] Trial 23 finished with value: 1.1273557960040985 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10314698627725886, 'learning_rate': 0.0006479351538358051, 'batch_size': 32}. Best is trial 23 with value: 1.1273557960040985.


New best model found! Val Loss: 1.1274
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10314698627725886, 'learning_rate': 0.0006479351538358051, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:32<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.52it/s]


Epoch 1/10:
Train Loss: 3.0095 | Val Loss: 2.9789


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.55it/s]


Epoch 2/10:
Train Loss: 2.9816 | Val Loss: 2.9993


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.57it/s]


Epoch 3/10:
Train Loss: 2.9565 | Val Loss: 3.3240


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.55it/s]
[I 2025-05-10 12:08:39,863] Trial 24 finished with value: 2.9789348329816545 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10187675394269682, 'learning_rate': 0.0014576722654048302, 'batch_size': 32}. Best is trial 23 with value: 1.1273557960040985.


Epoch 4/10:
Train Loss: 2.9409 | Val Loss: 3.4632
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.46it/s]


Epoch 1/10:
Train Loss: 2.3008 | Val Loss: 2.0118


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.39it/s]


Epoch 2/10:
Train Loss: 1.9242 | Val Loss: 1.8048


Training: 100%|██████████| 250/250 [00:23<00:00, 10.66it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.59it/s]


Epoch 3/10:
Train Loss: 1.7637 | Val Loss: 1.6605


Training: 100%|██████████| 250/250 [00:23<00:00, 10.66it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.50it/s]


Epoch 4/10:
Train Loss: 1.6440 | Val Loss: 1.5783


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.45it/s]


Epoch 5/10:
Train Loss: 1.5540 | Val Loss: 1.5155


Training: 100%|██████████| 250/250 [00:23<00:00, 10.66it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.37it/s]


Epoch 6/10:
Train Loss: 1.4827 | Val Loss: 1.4638


Training: 100%|██████████| 250/250 [00:23<00:00, 10.64it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.33it/s]


Epoch 7/10:
Train Loss: 1.4145 | Val Loss: 1.3902


Training: 100%|██████████| 250/250 [00:23<00:00, 10.65it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.40it/s]


Epoch 8/10:
Train Loss: 1.3588 | Val Loss: 1.3212


Training: 100%|██████████| 250/250 [00:23<00:00, 10.65it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.52it/s]


Epoch 9/10:
Train Loss: 1.3061 | Val Loss: 1.2563


Training: 100%|██████████| 250/250 [00:23<00:00, 10.65it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.46it/s]
[I 2025-05-10 12:12:56,668] Trial 25 finished with value: 1.2360246673462882 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.16101987325442874, 'learning_rate': 0.0007698560213600815, 'batch_size': 32}. Best is trial 23 with value: 1.1273557960040985.


Epoch 10/10:
Train Loss: 1.2633 | Val Loss: 1.2360


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.43it/s]


Epoch 1/10:
Train Loss: 3.0102 | Val Loss: 2.9780


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.43it/s]


Epoch 2/10:
Train Loss: 2.9779 | Val Loss: 3.0583


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.42it/s]


Epoch 3/10:
Train Loss: 2.9267 | Val Loss: 3.4783


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.43it/s]
[I 2025-05-10 12:19:19,105] Trial 26 finished with value: 2.9780230824909513 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12900865354222896, 'learning_rate': 0.001459686880298595, 'batch_size': 32}. Best is trial 23 with value: 1.1273557960040985.


Epoch 4/10:
Train Loss: 2.9072 | Val Loss: 3.4758
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:23<00:00, 10.66it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.57it/s]


Epoch 1/10:
Train Loss: 2.3235 | Val Loss: 2.0135


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.44it/s]


Epoch 2/10:
Train Loss: 1.9333 | Val Loss: 1.8282


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.46it/s]


Epoch 3/10:
Train Loss: 1.7592 | Val Loss: 1.6406


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.61it/s]


Epoch 4/10:
Train Loss: 1.6350 | Val Loss: 1.5510


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.46it/s]


Epoch 5/10:
Train Loss: 1.5408 | Val Loss: 1.4724


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.53it/s]


Epoch 6/10:
Train Loss: 1.4588 | Val Loss: 1.3942


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.41it/s]


Epoch 7/10:
Train Loss: 1.3925 | Val Loss: 1.3517


Training: 100%|██████████| 250/250 [00:23<00:00, 10.66it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.56it/s]


Epoch 8/10:
Train Loss: 1.3321 | Val Loss: 1.3020


Training: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.59it/s]


Epoch 9/10:
Train Loss: 1.2759 | Val Loss: 1.2745


Training: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.56it/s]
[I 2025-05-10 12:23:35,531] Trial 27 finished with value: 1.2193537572073558 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10280282192593812, 'learning_rate': 0.0005142335938258826, 'batch_size': 32}. Best is trial 23 with value: 1.1273557960040985.


Epoch 10/10:
Train Loss: 1.2306 | Val Loss: 1.2194


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.49it/s]


Epoch 1/10:
Train Loss: 2.3424 | Val Loss: 1.9798


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.55it/s]


Epoch 2/10:
Train Loss: 1.9039 | Val Loss: 1.8050


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.55it/s]


Epoch 3/10:
Train Loss: 1.7633 | Val Loss: 1.7125


Training: 100%|██████████| 250/250 [00:33<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.52it/s]


Epoch 4/10:
Train Loss: 1.6690 | Val Loss: 1.6313


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.53it/s]


Epoch 5/10:
Train Loss: 1.5997 | Val Loss: 1.5569


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.54it/s]


Epoch 6/10:
Train Loss: 1.5353 | Val Loss: 1.5143


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.52it/s]


Epoch 7/10:
Train Loss: 1.4881 | Val Loss: 1.4772


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.57it/s]


Epoch 8/10:
Train Loss: 1.4411 | Val Loss: 1.4467


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.55it/s]


Epoch 9/10:
Train Loss: 1.4017 | Val Loss: 1.4169


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.54it/s]
[I 2025-05-10 12:29:36,534] Trial 28 finished with value: 1.4064532102100433 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.17537248491483415, 'learning_rate': 0.0007744584223395755, 'batch_size': 32}. Best is trial 23 with value: 1.1273557960040985.


Epoch 10/10:
Train Loss: 1.3645 | Val Loss: 1.4065


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.30it/s]


Epoch 1/10:
Train Loss: 2.2436 | Val Loss: 1.9449


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.37it/s]


Epoch 2/10:
Train Loss: 1.8428 | Val Loss: 1.7043


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.32it/s]


Epoch 3/10:
Train Loss: 1.6638 | Val Loss: 1.6109


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.25it/s]


Epoch 4/10:
Train Loss: 1.5392 | Val Loss: 1.4813


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.31it/s]


Epoch 5/10:
Train Loss: 1.4413 | Val Loss: 1.4004


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.36it/s]


Epoch 6/10:
Train Loss: 1.3516 | Val Loss: 1.3335


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.36it/s]


Epoch 7/10:
Train Loss: 1.2821 | Val Loss: 1.2949


Training: 100%|██████████| 250/250 [00:32<00:00,  7.58it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.31it/s]


Epoch 8/10:
Train Loss: 1.2133 | Val Loss: 1.2169


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.27it/s]


Epoch 9/10:
Train Loss: 1.1541 | Val Loss: 1.1713


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.24it/s]
[I 2025-05-10 12:35:38,263] Trial 29 finished with value: 1.1269347431167724 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.14668207536224145, 'learning_rate': 0.00045326060908640873, 'batch_size': 32}. Best is trial 29 with value: 1.1269347431167724.


Epoch 10/10:
Train Loss: 1.0960 | Val Loss: 1.1269
New best model found! Val Loss: 1.1269
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.14668207536224145, 'learning_rate': 0.00045326060908640873, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.19it/s]


Epoch 1/10:
Train Loss: 2.3222 | Val Loss: 2.0423


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.22it/s]


Epoch 2/10:
Train Loss: 1.9529 | Val Loss: 1.8105


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.25it/s]


Epoch 3/10:
Train Loss: 1.7754 | Val Loss: 1.7010


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.21it/s]


Epoch 4/10:
Train Loss: 1.6448 | Val Loss: 1.5741


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.19it/s]


Epoch 5/10:
Train Loss: 1.5446 | Val Loss: 1.4557


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.23it/s]


Epoch 6/10:
Train Loss: 1.4563 | Val Loss: 1.3986


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.25it/s]


Epoch 7/10:
Train Loss: 1.3785 | Val Loss: 1.3207


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.30it/s]


Epoch 8/10:
Train Loss: 1.3124 | Val Loss: 1.2825


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.28it/s]


Epoch 9/10:
Train Loss: 1.2569 | Val Loss: 1.2160


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.25it/s]
[I 2025-05-10 12:41:40,296] Trial 30 finished with value: 1.1852355079045371 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.145162843804256, 'learning_rate': 0.00018618478208983576, 'batch_size': 32}. Best is trial 29 with value: 1.1269347431167724.


Epoch 10/10:
Train Loss: 1.2056 | Val Loss: 1.1852


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.25it/s]


Epoch 1/10:
Train Loss: 2.3126 | Val Loss: 2.0665


Training: 100%|██████████| 250/250 [00:33<00:00,  7.55it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.28it/s]


Epoch 2/10:
Train Loss: 1.9467 | Val Loss: 1.8067


Training: 100%|██████████| 250/250 [00:33<00:00,  7.55it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.22it/s]


Epoch 3/10:
Train Loss: 1.7636 | Val Loss: 1.6631


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.27it/s]


Epoch 4/10:
Train Loss: 1.6369 | Val Loss: 1.5569


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.30it/s]


Epoch 5/10:
Train Loss: 1.5332 | Val Loss: 1.4852


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.29it/s]


Epoch 6/10:
Train Loss: 1.4499 | Val Loss: 1.3898


Training: 100%|██████████| 250/250 [00:33<00:00,  7.55it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.24it/s]


Epoch 7/10:
Train Loss: 1.3738 | Val Loss: 1.3224


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.32it/s]


Epoch 8/10:
Train Loss: 1.3107 | Val Loss: 1.2752


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.28it/s]


Epoch 9/10:
Train Loss: 1.2555 | Val Loss: 1.2314


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.18it/s]
[I 2025-05-10 12:47:42,371] Trial 31 finished with value: 1.184375592640468 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.14460185704638515, 'learning_rate': 0.00020455041261838159, 'batch_size': 32}. Best is trial 29 with value: 1.1269347431167724.


Epoch 10/10:
Train Loss: 1.2039 | Val Loss: 1.1844


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.28it/s]


Epoch 1/10:
Train Loss: 2.3084 | Val Loss: 2.0175


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.25it/s]


Epoch 2/10:
Train Loss: 1.9254 | Val Loss: 1.7922


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.21it/s]


Epoch 3/10:
Train Loss: 1.7421 | Val Loss: 1.7022


Training: 100%|██████████| 250/250 [00:33<00:00,  7.55it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.26it/s]


Epoch 4/10:
Train Loss: 1.6149 | Val Loss: 1.5497


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.29it/s]


Epoch 5/10:
Train Loss: 1.5071 | Val Loss: 1.4148


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.23it/s]


Epoch 6/10:
Train Loss: 1.4136 | Val Loss: 1.3420


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.20it/s]


Epoch 7/10:
Train Loss: 1.3389 | Val Loss: 1.2868


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.27it/s]


Epoch 8/10:
Train Loss: 1.2730 | Val Loss: 1.2396


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.22it/s]


Epoch 9/10:
Train Loss: 1.2148 | Val Loss: 1.1988


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.19it/s]
[I 2025-05-10 12:53:44,367] Trial 32 finished with value: 1.1468715431198242 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.14779579655038938, 'learning_rate': 0.00020591670454448078, 'batch_size': 32}. Best is trial 29 with value: 1.1269347431167724.


Epoch 10/10:
Train Loss: 1.1678 | Val Loss: 1.1469


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.27it/s]


Epoch 1/10:
Train Loss: 2.3126 | Val Loss: 2.0371


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.22it/s]


Epoch 2/10:
Train Loss: 1.9501 | Val Loss: 1.8188


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.33it/s]


Epoch 3/10:
Train Loss: 1.7694 | Val Loss: 1.6944


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.32it/s]


Epoch 4/10:
Train Loss: 1.6537 | Val Loss: 1.5945


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.28it/s]


Epoch 5/10:
Train Loss: 1.5584 | Val Loss: 1.5162


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.29it/s]


Epoch 6/10:
Train Loss: 1.4765 | Val Loss: 1.4136


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.26it/s]


Epoch 7/10:
Train Loss: 1.4087 | Val Loss: 1.3398


Training: 100%|██████████| 250/250 [00:33<00:00,  7.56it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.26it/s]


Epoch 8/10:
Train Loss: 1.3449 | Val Loss: 1.2928


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.33it/s]


Epoch 9/10:
Train Loss: 1.2917 | Val Loss: 1.2373


Training: 100%|██████████| 250/250 [00:33<00:00,  7.57it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.36it/s]
[I 2025-05-10 12:59:46,086] Trial 33 finished with value: 1.2083797360223436 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.20799273069451213, 'learning_rate': 0.0002561001970682596, 'batch_size': 32}. Best is trial 29 with value: 1.1269347431167724.


Epoch 10/10:
Train Loss: 1.2380 | Val Loss: 1.2084


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.59it/s]


Epoch 1/10:
Train Loss: 2.3313 | Val Loss: 2.0306


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.61it/s]


Epoch 2/10:
Train Loss: 1.9543 | Val Loss: 1.8392


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.63it/s]


Epoch 3/10:
Train Loss: 1.7710 | Val Loss: 1.6653


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.60it/s]


Epoch 4/10:
Train Loss: 1.6330 | Val Loss: 1.5532


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.60it/s]


Epoch 5/10:
Train Loss: 1.5206 | Val Loss: 1.4675


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.63it/s]


Epoch 6/10:
Train Loss: 1.4375 | Val Loss: 1.3975


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.60it/s]


Epoch 7/10:
Train Loss: 1.3561 | Val Loss: 1.3623


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.60it/s]


Epoch 8/10:
Train Loss: 1.2921 | Val Loss: 1.3139


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.61it/s]


Epoch 9/10:
Train Loss: 1.2254 | Val Loss: 1.2279


Training: 100%|██████████| 250/250 [00:54<00:00,  4.59it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.59it/s]
[I 2025-05-10 13:09:40,881] Trial 34 finished with value: 1.215926342540317 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.12597060296741497, 'learning_rate': 0.00017949422786023455, 'batch_size': 32}. Best is trial 29 with value: 1.1269347431167724.


Epoch 10/10:
Train Loss: 1.1671 | Val Loss: 1.2159


Training: 100%|██████████| 250/250 [01:26<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 1/10:
Train Loss: 2.2636 | Val Loss: 1.9936


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 2/10:
Train Loss: 1.8687 | Val Loss: 1.7228


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.40it/s]


Epoch 3/10:
Train Loss: 1.6676 | Val Loss: 1.5980


Training: 100%|██████████| 250/250 [01:26<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.42it/s]


Epoch 4/10:
Train Loss: 1.5289 | Val Loss: 1.4988


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.40it/s]


Epoch 5/10:
Train Loss: 1.4277 | Val Loss: 1.3844


Training: 100%|██████████| 250/250 [01:26<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.41it/s]


Epoch 6/10:
Train Loss: 1.3347 | Val Loss: 1.3234


Training: 100%|██████████| 250/250 [01:26<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 7/10:
Train Loss: 1.2576 | Val Loss: 1.2490


Training: 100%|██████████| 250/250 [01:26<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.40it/s]


Epoch 8/10:
Train Loss: 1.1899 | Val Loss: 1.1851


Training: 100%|██████████| 250/250 [01:26<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.40it/s]


Epoch 9/10:
Train Loss: 1.1259 | Val Loss: 1.1685


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]
[I 2025-05-10 13:25:36,286] Trial 35 finished with value: 1.1070986938855005 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1544896129243996, 'learning_rate': 0.0003238880172055884, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 10/10:
Train Loss: 1.0672 | Val Loss: 1.1071
New best model found! Val Loss: 1.1071
Config: {'d_model': 256, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1544896129243996, 'learning_rate': 0.0003238880172055884, 'batch_size': 32}


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 1/10:
Train Loss: 2.3832 | Val Loss: 2.1224


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 2/10:
Train Loss: 1.9590 | Val Loss: 1.8552


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 3/10:
Train Loss: 1.7764 | Val Loss: 1.7002


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.89it/s]


Epoch 4/10:
Train Loss: 1.6492 | Val Loss: 1.5939


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 5/10:
Train Loss: 1.5555 | Val Loss: 1.5347


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 6/10:
Train Loss: 1.4752 | Val Loss: 1.4894


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 7/10:
Train Loss: 1.4045 | Val Loss: 1.4174


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 8/10:
Train Loss: 1.3429 | Val Loss: 1.3846


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]


Epoch 9/10:
Train Loss: 1.2801 | Val Loss: 1.3268


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.89it/s]
[I 2025-05-10 13:56:13,466] Trial 36 finished with value: 1.3060408035914104 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.18858153756893542, 'learning_rate': 0.00028521974328126815, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 10/10:
Train Loss: 1.2304 | Val Loss: 1.3060


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.40it/s]


Epoch 1/10:
Train Loss: 2.2523 | Val Loss: 1.9930


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 2/10:
Train Loss: 1.8789 | Val Loss: 1.7438


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 3/10:
Train Loss: 1.6874 | Val Loss: 1.5956


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 4/10:
Train Loss: 1.5528 | Val Loss: 1.4862


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 5/10:
Train Loss: 1.4440 | Val Loss: 1.4010


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 6/10:
Train Loss: 1.3508 | Val Loss: 1.3543


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.38it/s]


Epoch 7/10:
Train Loss: 1.2749 | Val Loss: 1.2788


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.38it/s]


Epoch 8/10:
Train Loss: 1.2046 | Val Loss: 1.2350


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]


Epoch 9/10:
Train Loss: 1.1423 | Val Loss: 1.1980


Training: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.39it/s]
[I 2025-05-10 14:12:10,308] Trial 37 finished with value: 1.1658859366462344 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.16447530875693694, 'learning_rate': 0.0003826581195303293, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 10/10:
Train Loss: 1.0868 | Val Loss: 1.1659


Training: 100%|██████████| 250/250 [01:37<00:00,  2.56it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.50it/s]


Epoch 1/10:
Train Loss: 3.0203 | Val Loss: 2.9758


Training: 100%|██████████| 250/250 [01:37<00:00,  2.56it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.53it/s]


Epoch 2/10:
Train Loss: 2.9827 | Val Loss: 2.9762


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.52it/s]


Epoch 3/10:
Train Loss: 2.9712 | Val Loss: 3.0279


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.51it/s]
[I 2025-05-10 14:19:14,165] Trial 38 finished with value: 2.975821540469215 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.12032708780750531, 'learning_rate': 0.0010084655645360104, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 4/10:
Train Loss: 2.9664 | Val Loss: 3.0713
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.91it/s]


Epoch 1/10:
Train Loss: 3.0122 | Val Loss: 2.9757


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.91it/s]


Epoch 2/10:
Train Loss: 2.9673 | Val Loss: 3.2573


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.91it/s]


Epoch 3/10:
Train Loss: 2.9145 | Val Loss: 3.8236


Training: 100%|██████████| 250/250 [02:47<00:00,  1.49it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.91it/s]
[I 2025-05-10 14:31:28,930] Trial 39 finished with value: 2.9756542387462797 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.18004403580041609, 'learning_rate': 0.0011245684056705957, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 4/10:
Train Loss: 2.8977 | Val Loss: 3.8057
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.81it/s]


Epoch 1/10:
Train Loss: 2.2568 | Val Loss: 1.8900


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 2/10:
Train Loss: 1.8030 | Val Loss: 1.6798


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.82it/s]


Epoch 3/10:
Train Loss: 1.6287 | Val Loss: 1.5799


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.82it/s]


Epoch 4/10:
Train Loss: 1.5117 | Val Loss: 1.5456


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.82it/s]


Epoch 5/10:
Train Loss: 1.4051 | Val Loss: 1.4292


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 6/10:
Train Loss: 1.3194 | Val Loss: 1.3275


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 7/10:
Train Loss: 1.2434 | Val Loss: 1.2797


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.82it/s]


Epoch 8/10:
Train Loss: 1.1735 | Val Loss: 1.2333


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.82it/s]


Epoch 9/10:
Train Loss: 1.1125 | Val Loss: 1.2152


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]
[I 2025-05-10 14:52:40,673] Trial 40 finished with value: 1.1637130010695684 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.216184958498792, 'learning_rate': 0.0003259394326613092, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 10/10:
Train Loss: 1.0583 | Val Loss: 1.1637


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 1/10:
Train Loss: 2.3262 | Val Loss: 2.0539


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 2/10:
Train Loss: 1.9734 | Val Loss: 1.8814


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 3/10:
Train Loss: 1.7867 | Val Loss: 1.7186


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.82it/s]


Epoch 4/10:
Train Loss: 1.6547 | Val Loss: 1.5810


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 5/10:
Train Loss: 1.5504 | Val Loss: 1.4860


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 6/10:
Train Loss: 1.4582 | Val Loss: 1.4070


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 7/10:
Train Loss: 1.3769 | Val Loss: 1.3520


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 8/10:
Train Loss: 1.3020 | Val Loss: 1.2692


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 9/10:
Train Loss: 1.2380 | Val Loss: 1.2336


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.84it/s]
[I 2025-05-10 15:13:51,010] Trial 41 finished with value: 1.1880737675560846 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.22601704549867682, 'learning_rate': 0.00010136169293700379, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 10/10:
Train Loss: 1.1819 | Val Loss: 1.1881


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.84it/s]


Epoch 1/10:
Train Loss: 2.2965 | Val Loss: 1.9596


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 2/10:
Train Loss: 1.8462 | Val Loss: 1.7399


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 3/10:
Train Loss: 1.6800 | Val Loss: 1.6069


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.84it/s]


Epoch 4/10:
Train Loss: 1.5701 | Val Loss: 1.5308


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.85it/s]


Epoch 5/10:
Train Loss: 1.4768 | Val Loss: 1.4530


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.84it/s]


Epoch 6/10:
Train Loss: 1.3909 | Val Loss: 1.4175


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.84it/s]


Epoch 7/10:
Train Loss: 1.3172 | Val Loss: 1.3331


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.85it/s]


Epoch 8/10:
Train Loss: 1.2576 | Val Loss: 1.3457


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.83it/s]


Epoch 9/10:
Train Loss: 1.2040 | Val Loss: 1.2780


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.84it/s]
[I 2025-05-10 15:35:00,432] Trial 42 finished with value: 1.2281707127888997 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.272715257698762, 'learning_rate': 0.00032470396867513145, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 10/10:
Train Loss: 1.1521 | Val Loss: 1.2282


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.86it/s]


Epoch 1/10:
Train Loss: 2.5634 | Val Loss: 2.0618


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.87it/s]


Epoch 2/10:
Train Loss: 1.9140 | Val Loss: 1.7748


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.88it/s]


Epoch 3/10:
Train Loss: 1.7240 | Val Loss: 1.6698


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.87it/s]


Epoch 4/10:
Train Loss: 1.6090 | Val Loss: 1.5690


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.89it/s]


Epoch 5/10:
Train Loss: 1.5162 | Val Loss: 1.5267


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.88it/s]


Epoch 6/10:
Train Loss: 1.4462 | Val Loss: 1.4696


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.88it/s]


Epoch 7/10:
Train Loss: 1.3750 | Val Loss: 1.4312


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.88it/s]


Epoch 8/10:
Train Loss: 1.3139 | Val Loss: 1.4321


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.88it/s]


Epoch 9/10:
Train Loss: 1.2582 | Val Loss: 1.5015


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.87it/s]
[I 2025-05-10 15:56:08,894] Trial 43 finished with value: 1.4312197991779871 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.19612326795017532, 'learning_rate': 0.000433874312453318, 'batch_size': 32}. Best is trial 35 with value: 1.1070986938855005.


Epoch 10/10:
Train Loss: 1.2044 | Val Loss: 1.4851
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 1/10:
Train Loss: 2.1888 | Val Loss: 1.8537


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]


Epoch 2/10:
Train Loss: 1.6790 | Val Loss: 1.4846


Training: 100%|██████████| 250/250 [00:58<00:00,  4.26it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]


Epoch 3/10:
Train Loss: 1.4332 | Val Loss: 1.3388


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]


Epoch 4/10:
Train Loss: 1.2998 | Val Loss: 1.2542


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 5/10:
Train Loss: 1.2058 | Val Loss: 1.1969


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 6/10:
Train Loss: 1.1316 | Val Loss: 1.1427


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.32it/s]


Epoch 7/10:
Train Loss: 1.0680 | Val Loss: 1.0990


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 8/10:
Train Loss: 1.0091 | Val Loss: 1.0683


Training: 100%|██████████| 250/250 [00:58<00:00,  4.25it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]


Epoch 9/10:
Train Loss: 0.9569 | Val Loss: 1.0455


Training: 100%|██████████| 250/250 [00:58<00:00,  4.26it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 10/10:
Train Loss: 0.9062 | Val Loss: 1.0112


[I 2025-05-10 16:06:48,592] Trial 44 finished with value: 1.0111583121239194 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1553364410135958, 'learning_rate': 0.00021612168589772826, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


New best model found! Val Loss: 1.0112
Config: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1553364410135958, 'learning_rate': 0.00021612168589772826, 'batch_size': 32}


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.74it/s]


Epoch 1/10:
Train Loss: 2.3048 | Val Loss: 1.9741


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.75it/s]


Epoch 2/10:
Train Loss: 1.8676 | Val Loss: 1.7469


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.73it/s]


Epoch 3/10:
Train Loss: 1.6736 | Val Loss: 1.5840


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.75it/s]


Epoch 4/10:
Train Loss: 1.5326 | Val Loss: 1.4756


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.73it/s]


Epoch 5/10:
Train Loss: 1.4179 | Val Loss: 1.3829


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.73it/s]


Epoch 6/10:
Train Loss: 1.3262 | Val Loss: 1.3176


Training: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.71it/s]


Epoch 7/10:
Train Loss: 1.2390 | Val Loss: 1.2781


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.73it/s]


Epoch 8/10:
Train Loss: 1.1655 | Val Loss: 1.1602


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.72it/s]


Epoch 9/10:
Train Loss: 1.0969 | Val Loss: 1.1414


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.72it/s]
[I 2025-05-10 16:23:15,891] Trial 45 finished with value: 1.113016200444055 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.15341492489431763, 'learning_rate': 0.00013433347936281362, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0368 | Val Loss: 1.1130


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 1/10:
Train Loss: 2.2946 | Val Loss: 2.0020


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 2/10:
Train Loss: 1.8879 | Val Loss: 1.7996


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 3/10:
Train Loss: 1.7110 | Val Loss: 1.6149


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 4/10:
Train Loss: 1.5734 | Val Loss: 1.5827


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 5/10:
Train Loss: 1.4641 | Val Loss: 1.4508


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 6/10:
Train Loss: 1.3670 | Val Loss: 1.3570


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 7/10:
Train Loss: 1.2763 | Val Loss: 1.2559


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 8/10:
Train Loss: 1.1945 | Val Loss: 1.2075


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 9/10:
Train Loss: 1.1233 | Val Loss: 1.1628


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]
[I 2025-05-10 16:39:35,451] Trial 46 finished with value: 1.1125647331041002 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.15916712929840163, 'learning_rate': 0.00012458387469032502, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0589 | Val Loss: 1.1126


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 1/10:
Train Loss: 2.3927 | Val Loss: 2.0914


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 2/10:
Train Loss: 2.0047 | Val Loss: 1.9458


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 3/10:
Train Loss: 1.8592 | Val Loss: 1.9126


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 4/10:
Train Loss: 1.7556 | Val Loss: 1.7469


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 5/10:
Train Loss: 1.6701 | Val Loss: 1.7392


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 6/10:
Train Loss: 1.6045 | Val Loss: 1.6539


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 7/10:
Train Loss: 1.5421 | Val Loss: 1.5627


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 8/10:
Train Loss: 1.4923 | Val Loss: 1.4996


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 9/10:
Train Loss: 1.4434 | Val Loss: 1.4483


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]
[I 2025-05-10 16:55:55,083] Trial 47 finished with value: 1.4482672555106026 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.33109453950448475, 'learning_rate': 0.00013642172601853852, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.4022 | Val Loss: 1.5008


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 1/10:
Train Loss: 2.2986 | Val Loss: 1.9786


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 2/10:
Train Loss: 1.8628 | Val Loss: 1.7491


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 3/10:
Train Loss: 1.6684 | Val Loss: 1.5949


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 4/10:
Train Loss: 1.5267 | Val Loss: 1.4568


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 5/10:
Train Loss: 1.3986 | Val Loss: 1.3524


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 6/10:
Train Loss: 1.2978 | Val Loss: 1.2740


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 7/10:
Train Loss: 1.2096 | Val Loss: 1.2094


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 8/10:
Train Loss: 1.1342 | Val Loss: 1.1568


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.89it/s]


Epoch 9/10:
Train Loss: 1.0690 | Val Loss: 1.1103


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]
[I 2025-05-10 17:12:14,891] Trial 48 finished with value: 1.08685683068775 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.15854109099744904, 'learning_rate': 0.00012103888175822555, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0070 | Val Loss: 1.0869


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 1/10:
Train Loss: 2.3015 | Val Loss: 2.0761


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 2/10:
Train Loss: 1.8881 | Val Loss: 1.8025


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 3/10:
Train Loss: 1.6838 | Val Loss: 1.5914


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 4/10:
Train Loss: 1.5293 | Val Loss: 1.4408


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 5/10:
Train Loss: 1.4060 | Val Loss: 1.3685


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.89it/s]


Epoch 6/10:
Train Loss: 1.3124 | Val Loss: 1.2692


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 7/10:
Train Loss: 1.2364 | Val Loss: 1.2031


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.89it/s]


Epoch 8/10:
Train Loss: 1.1704 | Val Loss: 1.1637


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.89it/s]


Epoch 9/10:
Train Loss: 1.1081 | Val Loss: 1.1280


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]
[I 2025-05-10 17:28:34,846] Trial 49 finished with value: 1.0999813146061368 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.19760261213421473, 'learning_rate': 0.00012137132837354166, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0577 | Val Loss: 1.1000


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.76it/s]


Epoch 1/10:
Train Loss: 2.3072 | Val Loss: 1.9942


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.75it/s]


Epoch 2/10:
Train Loss: 1.8616 | Val Loss: 1.7311


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.75it/s]


Epoch 3/10:
Train Loss: 1.6872 | Val Loss: 1.6934


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.72it/s]


Epoch 4/10:
Train Loss: 1.5618 | Val Loss: 1.5931


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.74it/s]


Epoch 5/10:
Train Loss: 1.4548 | Val Loss: 1.4801


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.75it/s]


Epoch 6/10:
Train Loss: 1.3742 | Val Loss: 1.3942


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.73it/s]


Epoch 7/10:
Train Loss: 1.2979 | Val Loss: 1.3195


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.72it/s]


Epoch 8/10:
Train Loss: 1.2303 | Val Loss: 1.2796


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.73it/s]


Epoch 9/10:
Train Loss: 1.1736 | Val Loss: 1.2154


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.72it/s]
[I 2025-05-10 17:45:02,036] Trial 50 finished with value: 1.13792670151544 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.17764734653889178, 'learning_rate': 0.0001607184626416104, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.1140 | Val Loss: 1.1379


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 1/10:
Train Loss: 2.2939 | Val Loss: 2.0420


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.87it/s]


Epoch 2/10:
Train Loss: 1.8952 | Val Loss: 1.7588


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 3/10:
Train Loss: 1.6945 | Val Loss: 1.6469


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 4/10:
Train Loss: 1.5572 | Val Loss: 1.5205


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 5/10:
Train Loss: 1.4449 | Val Loss: 1.4207


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 6/10:
Train Loss: 1.3487 | Val Loss: 1.3482


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 7/10:
Train Loss: 1.2624 | Val Loss: 1.2495


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 8/10:
Train Loss: 1.1839 | Val Loss: 1.2032


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 9/10:
Train Loss: 1.1151 | Val Loss: 1.1548


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]
[I 2025-05-10 18:01:21,811] Trial 51 finished with value: 1.1257018532071794 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.15968262950472215, 'learning_rate': 0.00012313868608917897, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0540 | Val Loss: 1.1257


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.70it/s]


Epoch 1/10:
Train Loss: 2.2992 | Val Loss: 2.0314


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.70it/s]


Epoch 2/10:
Train Loss: 1.8997 | Val Loss: 1.8162


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.71it/s]


Epoch 3/10:
Train Loss: 1.7186 | Val Loss: 1.7185


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.68it/s]


Epoch 4/10:
Train Loss: 1.5733 | Val Loss: 1.5287


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.71it/s]


Epoch 5/10:
Train Loss: 1.4560 | Val Loss: 1.4025


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.71it/s]


Epoch 6/10:
Train Loss: 1.3634 | Val Loss: 1.3421


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.70it/s]


Epoch 7/10:
Train Loss: 1.2844 | Val Loss: 1.2831


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.70it/s]


Epoch 8/10:
Train Loss: 1.2151 | Val Loss: 1.2110


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.71it/s]


Epoch 9/10:
Train Loss: 1.1554 | Val Loss: 1.1685


Training: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.70it/s]
[I 2025-05-10 18:17:49,826] Trial 52 finished with value: 1.121112673055558 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1923368891483644, 'learning_rate': 0.00010076975330308724, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0993 | Val Loss: 1.1211


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 1/10:
Train Loss: 2.2860 | Val Loss: 1.9593


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 2/10:
Train Loss: 1.8306 | Val Loss: 1.6943


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 3/10:
Train Loss: 1.5935 | Val Loss: 1.5121


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 4/10:
Train Loss: 1.4117 | Val Loss: 1.3463


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 5/10:
Train Loss: 1.2801 | Val Loss: 1.2581


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 6/10:
Train Loss: 1.1770 | Val Loss: 1.1848


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 7/10:
Train Loss: 1.0919 | Val Loss: 1.1288


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 8/10:
Train Loss: 1.0203 | Val Loss: 1.0853


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 9/10:
Train Loss: 0.9553 | Val Loss: 1.0596


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.89it/s]
[I 2025-05-10 18:34:09,350] Trial 53 finished with value: 1.031387982860444 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1322012996911261, 'learning_rate': 0.0001296005142896522, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.8935 | Val Loss: 1.0314


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 1/10:
Train Loss: 2.3106 | Val Loss: 1.9622


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 2/10:
Train Loss: 1.8318 | Val Loss: 1.7132


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 3/10:
Train Loss: 1.6371 | Val Loss: 1.5642


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 4/10:
Train Loss: 1.4950 | Val Loss: 1.5227


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 5/10:
Train Loss: 1.3843 | Val Loss: 1.3737


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 6/10:
Train Loss: 1.2949 | Val Loss: 1.3261


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 7/10:
Train Loss: 1.2123 | Val Loss: 1.2516


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 8/10:
Train Loss: 1.1414 | Val Loss: 1.1920


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 9/10:
Train Loss: 1.0736 | Val Loss: 1.1485


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]
[I 2025-05-10 18:50:28,247] Trial 54 finished with value: 1.1145224769910176 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1368567290854689, 'learning_rate': 0.00015929487968017949, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0105 | Val Loss: 1.1145


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 1/10:
Train Loss: 2.2722 | Val Loss: 1.9926


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 2/10:
Train Loss: 1.8276 | Val Loss: 1.7246


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 3/10:
Train Loss: 1.5834 | Val Loss: 1.4634


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 4/10:
Train Loss: 1.3939 | Val Loss: 1.3250


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 5/10:
Train Loss: 1.2580 | Val Loss: 1.2344


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 6/10:
Train Loss: 1.1552 | Val Loss: 1.1484


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 7/10:
Train Loss: 1.0717 | Val Loss: 1.1024


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 8/10:
Train Loss: 0.9970 | Val Loss: 1.0645


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 9/10:
Train Loss: 0.9301 | Val Loss: 1.0444


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.88it/s]
[I 2025-05-10 19:06:47,848] Trial 55 finished with value: 1.01893036706107 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.117426494295955, 'learning_rate': 0.00011802376359141069, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.8669 | Val Loss: 1.0189


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 1/10:
Train Loss: 2.2726 | Val Loss: 1.9224


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 2/10:
Train Loss: 1.8137 | Val Loss: 1.7098


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 3/10:
Train Loss: 1.6439 | Val Loss: 1.5712


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 4/10:
Train Loss: 1.5041 | Val Loss: 1.4822


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 5/10:
Train Loss: 1.3872 | Val Loss: 1.4121


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 6/10:
Train Loss: 1.2950 | Val Loss: 1.3101


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 7/10:
Train Loss: 1.2099 | Val Loss: 1.2755


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 8/10:
Train Loss: 1.1309 | Val Loss: 1.1737


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 9/10:
Train Loss: 1.0596 | Val Loss: 1.1605


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]
[I 2025-05-10 19:23:06,574] Trial 56 finished with value: 1.123630009946369 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.11930194560272783, 'learning_rate': 0.0001648503282831905, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9974 | Val Loss: 1.1236


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.98it/s]


Epoch 1/10:
Train Loss: 2.3983 | Val Loss: 1.9647


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.95it/s]


Epoch 2/10:
Train Loss: 1.8327 | Val Loss: 1.7099


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 3/10:
Train Loss: 1.6517 | Val Loss: 1.5725


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.94it/s]


Epoch 4/10:
Train Loss: 1.5266 | Val Loss: 1.5003


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 5/10:
Train Loss: 1.4378 | Val Loss: 1.4227


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 6/10:
Train Loss: 1.3598 | Val Loss: 1.3750


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 7/10:
Train Loss: 1.2763 | Val Loss: 1.3167


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.93it/s]


Epoch 8/10:
Train Loss: 1.2005 | Val Loss: 1.2637


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]


Epoch 9/10:
Train Loss: 1.1390 | Val Loss: 1.2100


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.91it/s]
[I 2025-05-10 19:39:25,286] Trial 57 finished with value: 1.2099701554056197 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1353542203602963, 'learning_rate': 0.000220508544942066, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0795 | Val Loss: 1.2100


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 1/10:
Train Loss: 2.2859 | Val Loss: 2.0556


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 2/10:
Train Loss: 1.8683 | Val Loss: 1.7769


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.89it/s]


Epoch 3/10:
Train Loss: 1.6759 | Val Loss: 1.6306


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 4/10:
Train Loss: 1.5319 | Val Loss: 1.4874


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.88it/s]


Epoch 5/10:
Train Loss: 1.4181 | Val Loss: 1.4009


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 6/10:
Train Loss: 1.3189 | Val Loss: 1.3243


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


Epoch 7/10:
Train Loss: 1.2368 | Val Loss: 1.2303


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 8/10:
Train Loss: 1.1664 | Val Loss: 1.1960


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.90it/s]


Epoch 9/10:
Train Loss: 1.1056 | Val Loss: 1.1368


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.87it/s]
[I 2025-05-10 19:55:45,256] Trial 58 finished with value: 1.107751420566014 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.17140636104321944, 'learning_rate': 0.00011019502058803529, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0485 | Val Loss: 1.1078


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.29it/s]


Epoch 1/10:
Train Loss: 3.0352 | Val Loss: 2.9857


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.27it/s]


Epoch 2/10:
Train Loss: 2.9811 | Val Loss: 2.9781


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.27it/s]


Epoch 3/10:
Train Loss: 2.9791 | Val Loss: 2.9764


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.27it/s]


Epoch 4/10:
Train Loss: 2.9784 | Val Loss: 2.9751


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.27it/s]


Epoch 5/10:
Train Loss: 2.9783 | Val Loss: 2.9746


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.28it/s]


Epoch 6/10:
Train Loss: 2.9779 | Val Loss: 2.9782


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.29it/s]


Epoch 7/10:
Train Loss: 2.9783 | Val Loss: 2.9766


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.28it/s]
[I 2025-05-10 20:08:43,817] Trial 59 finished with value: 2.974572874250866 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.20154100491493787, 'learning_rate': 0.00462665122447288, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 8/10:
Train Loss: 2.9770 | Val Loss: 2.9802
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.84it/s]


Epoch 1/10:
Train Loss: 2.2361 | Val Loss: 1.9129


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 2/10:
Train Loss: 1.7919 | Val Loss: 1.6603


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 3/10:
Train Loss: 1.5953 | Val Loss: 1.5114


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 4/10:
Train Loss: 1.4510 | Val Loss: 1.4128


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 5/10:
Train Loss: 1.3416 | Val Loss: 1.3195


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 6/10:
Train Loss: 1.2393 | Val Loss: 1.2333


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 7/10:
Train Loss: 1.1472 | Val Loss: 1.1655


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 8/10:
Train Loss: 1.0667 | Val Loss: 1.1044


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 9/10:
Train Loss: 0.9926 | Val Loss: 1.0761


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]
[I 2025-05-10 20:25:05,616] Trial 60 finished with value: 1.017797542942895 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11816056312636863, 'learning_rate': 0.0001446832569709606, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9273 | Val Loss: 1.0178


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 1/10:
Train Loss: 2.2222 | Val Loss: 1.9087


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 2/10:
Train Loss: 1.7737 | Val Loss: 1.6475


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 3/10:
Train Loss: 1.5805 | Val Loss: 1.5765


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 4/10:
Train Loss: 1.4442 | Val Loss: 1.4534


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 5/10:
Train Loss: 1.3292 | Val Loss: 1.3592


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 6/10:
Train Loss: 1.2292 | Val Loss: 1.2889


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 7/10:
Train Loss: 1.1429 | Val Loss: 1.1995


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 8/10:
Train Loss: 1.0646 | Val Loss: 1.1149


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 9/10:
Train Loss: 0.9872 | Val Loss: 1.0726


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]
[I 2025-05-10 20:41:28,038] Trial 61 finished with value: 1.053096936808692 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11571052093479009, 'learning_rate': 0.0001458873792045578, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9199 | Val Loss: 1.0531


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 1/10:
Train Loss: 2.2145 | Val Loss: 1.8848


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 2/10:
Train Loss: 1.7774 | Val Loss: 1.6476


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 3/10:
Train Loss: 1.5819 | Val Loss: 1.5310


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 4/10:
Train Loss: 1.4358 | Val Loss: 1.4027


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 5/10:
Train Loss: 1.3186 | Val Loss: 1.3077


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 6/10:
Train Loss: 1.2198 | Val Loss: 1.2511


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 7/10:
Train Loss: 1.1357 | Val Loss: 1.2341


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 8/10:
Train Loss: 1.0598 | Val Loss: 1.1179


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 9/10:
Train Loss: 0.9894 | Val Loss: 1.0706


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]
[I 2025-05-10 20:57:50,843] Trial 62 finished with value: 1.0427105237567236 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11448643197200474, 'learning_rate': 0.00014268978513781074, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9196 | Val Loss: 1.0427


Training: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 1/10:
Train Loss: 2.2147 | Val Loss: 1.9237


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 2/10:
Train Loss: 1.7738 | Val Loss: 1.6692


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 3/10:
Train Loss: 1.5756 | Val Loss: 1.5112


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 4/10:
Train Loss: 1.4313 | Val Loss: 1.4004


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 5/10:
Train Loss: 1.3140 | Val Loss: 1.3086


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 6/10:
Train Loss: 1.2135 | Val Loss: 1.2382


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 7/10:
Train Loss: 1.1269 | Val Loss: 1.1530


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 8/10:
Train Loss: 1.0468 | Val Loss: 1.1169


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 9/10:
Train Loss: 0.9769 | Val Loss: 1.0592


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]
[I 2025-05-10 21:14:13,879] Trial 63 finished with value: 1.0349139486040388 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11363569778270065, 'learning_rate': 0.0001484511148348393, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9085 | Val Loss: 1.0349


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 1/10:
Train Loss: 2.2271 | Val Loss: 1.9063


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 2/10:
Train Loss: 1.7745 | Val Loss: 1.6296


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 3/10:
Train Loss: 1.5752 | Val Loss: 1.5580


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 4/10:
Train Loss: 1.4344 | Val Loss: 1.4222


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 5/10:
Train Loss: 1.3207 | Val Loss: 1.2858


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 6/10:
Train Loss: 1.2201 | Val Loss: 1.2357


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 7/10:
Train Loss: 1.1327 | Val Loss: 1.1945


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 8/10:
Train Loss: 1.0539 | Val Loss: 1.1125


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 9/10:
Train Loss: 0.9845 | Val Loss: 1.0932


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]
[I 2025-05-10 21:30:36,558] Trial 64 finished with value: 1.0195828222093128 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11424425033529752, 'learning_rate': 0.00014853102480080058, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9170 | Val Loss: 1.0196


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.60it/s]


Epoch 1/10:
Train Loss: 2.2293 | Val Loss: 1.8993


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 2/10:
Train Loss: 1.7890 | Val Loss: 1.6873


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 3/10:
Train Loss: 1.5982 | Val Loss: 1.5349


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 4/10:
Train Loss: 1.4602 | Val Loss: 1.4304


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 5/10:
Train Loss: 1.3450 | Val Loss: 1.3311


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.66it/s]


Epoch 6/10:
Train Loss: 1.2478 | Val Loss: 1.2613


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 7/10:
Train Loss: 1.1585 | Val Loss: 1.1828


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 8/10:
Train Loss: 1.0768 | Val Loss: 1.1169


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 9/10:
Train Loss: 1.0034 | Val Loss: 1.0784


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]
[I 2025-05-10 21:46:59,319] Trial 65 finished with value: 1.034472158030858 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.12927795530358896, 'learning_rate': 0.000179026196274374, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9361 | Val Loss: 1.0345


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 1/10:
Train Loss: 2.2361 | Val Loss: 1.8844


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 2/10:
Train Loss: 1.7932 | Val Loss: 1.6795


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 3/10:
Train Loss: 1.6078 | Val Loss: 1.5589


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 4/10:
Train Loss: 1.4634 | Val Loss: 1.4307


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 5/10:
Train Loss: 1.3505 | Val Loss: 1.3430


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 6/10:
Train Loss: 1.2532 | Val Loss: 1.2749


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 7/10:
Train Loss: 1.1644 | Val Loss: 1.2234


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 8/10:
Train Loss: 1.0817 | Val Loss: 1.1544


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 9/10:
Train Loss: 1.0085 | Val Loss: 1.0935


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]
[I 2025-05-10 22:03:22,105] Trial 66 finished with value: 1.0629173517227173 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.12836570383236823, 'learning_rate': 0.00018274251106644205, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9403 | Val Loss: 1.0629


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 1/10:
Train Loss: 2.2655 | Val Loss: 1.8904


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 2/10:
Train Loss: 1.7522 | Val Loss: 1.6352


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 3/10:
Train Loss: 1.5559 | Val Loss: 1.5403


Training: 100%|██████████| 250/250 [02:02<00:00,  2.04it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 4/10:
Train Loss: 1.4146 | Val Loss: 1.4026


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.2932 | Val Loss: 1.3253


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.1757 | Val Loss: 1.2428


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 7/10:
Train Loss: 1.0724 | Val Loss: 1.1891


Training: 100%|██████████| 250/250 [02:02<00:00,  2.04it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 0.9775 | Val Loss: 1.1359


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.8933 | Val Loss: 1.0718


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]
[I 2025-05-10 22:25:43,657] Trial 67 finished with value: 1.0397872546362499 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10953636800919857, 'learning_rate': 0.00023598370302518954, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.8146 | Val Loss: 1.0398


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 1/10:
Train Loss: 2.2367 | Val Loss: 1.8810


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 2/10:
Train Loss: 1.7885 | Val Loss: 1.6926


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 3/10:
Train Loss: 1.6083 | Val Loss: 1.5342


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.67it/s]


Epoch 4/10:
Train Loss: 1.4686 | Val Loss: 1.4754


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 5/10:
Train Loss: 1.3588 | Val Loss: 1.3907


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 6/10:
Train Loss: 1.2654 | Val Loss: 1.2873


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 7/10:
Train Loss: 1.1822 | Val Loss: 1.2430


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 8/10:
Train Loss: 1.1131 | Val Loss: 1.1722


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 9/10:
Train Loss: 1.0409 | Val Loss: 1.1837


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]
[I 2025-05-10 22:42:06,307] Trial 68 finished with value: 1.055483895634848 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.13476234819268856, 'learning_rate': 0.0001853201446460407, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9749 | Val Loss: 1.0555


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.85it/s]


Epoch 1/10:
Train Loss: 2.2243 | Val Loss: 1.9599


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 2/10:
Train Loss: 1.7830 | Val Loss: 1.6658


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 3/10:
Train Loss: 1.5886 | Val Loss: 1.5518


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 4/10:
Train Loss: 1.4472 | Val Loss: 1.4376


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 5/10:
Train Loss: 1.3354 | Val Loss: 1.3345


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 6/10:
Train Loss: 1.2351 | Val Loss: 1.2802


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 7/10:
Train Loss: 1.1502 | Val Loss: 1.1841


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 8/10:
Train Loss: 1.0711 | Val Loss: 1.1202


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 9/10:
Train Loss: 0.9978 | Val Loss: 1.1048


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]
[I 2025-05-10 22:58:28,331] Trial 69 finished with value: 1.0337640226833404 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.1227821394455529, 'learning_rate': 0.00016213745709374084, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.9323 | Val Loss: 1.0338


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.99it/s]


Epoch 1/10:
Train Loss: 3.0302 | Val Loss: 2.9955


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.99it/s]


Epoch 2/10:
Train Loss: 2.9828 | Val Loss: 2.9773


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.98it/s]


Epoch 3/10:
Train Loss: 2.9789 | Val Loss: 2.9760


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.98it/s]


Epoch 4/10:
Train Loss: 2.9765 | Val Loss: 3.0221


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.97it/s]


Epoch 5/10:
Train Loss: 2.9738 | Val Loss: 3.0641


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  7.99it/s]
[I 2025-05-10 23:08:15,970] Trial 70 finished with value: 2.9760254753960504 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.12631586636666559, 'learning_rate': 0.0023323997585234096, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 6/10:
Train Loss: 2.9734 | Val Loss: 3.0563
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.76it/s]


Epoch 1/10:
Train Loss: 2.3429 | Val Loss: 1.9348


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.76it/s]


Epoch 2/10:
Train Loss: 1.8215 | Val Loss: 1.7214


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.77it/s]


Epoch 3/10:
Train Loss: 1.6481 | Val Loss: 1.5944


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.77it/s]


Epoch 4/10:
Train Loss: 1.5288 | Val Loss: 1.5303


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.77it/s]


Epoch 5/10:
Train Loss: 1.4320 | Val Loss: 1.4405


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.76it/s]


Epoch 6/10:
Train Loss: 1.3481 | Val Loss: 1.3807


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.79it/s]


Epoch 7/10:
Train Loss: 1.2762 | Val Loss: 1.3439


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.79it/s]


Epoch 8/10:
Train Loss: 1.2083 | Val Loss: 1.3016


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.78it/s]


Epoch 9/10:
Train Loss: 1.1442 | Val Loss: 1.2742


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.79it/s]
[I 2025-05-10 23:24:37,267] Trial 71 finished with value: 1.238491575869303 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11199151975628459, 'learning_rate': 0.000273926153831589, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0845 | Val Loss: 1.2385


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 1/10:
Train Loss: 2.2516 | Val Loss: 1.9012


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 2/10:
Train Loss: 1.7680 | Val Loss: 1.6474


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 3/10:
Train Loss: 1.5623 | Val Loss: 1.5120


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 4/10:
Train Loss: 1.4255 | Val Loss: 1.4064


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 5/10:
Train Loss: 1.3052 | Val Loss: 1.3134


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 6/10:
Train Loss: 1.2082 | Val Loss: 1.2447


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 7/10:
Train Loss: 1.1136 | Val Loss: 1.1525


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 8/10:
Train Loss: 1.0343 | Val Loss: 1.1332


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 9/10:
Train Loss: 0.9576 | Val Loss: 1.0550


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]
[I 2025-05-10 23:41:00,050] Trial 72 finished with value: 1.025072659764971 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10120279662488302, 'learning_rate': 0.0001616305398789018, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.8858 | Val Loss: 1.0251


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.66it/s]


Epoch 1/10:
Train Loss: 2.2747 | Val Loss: 1.9145


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 2/10:
Train Loss: 1.7969 | Val Loss: 1.6868


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.66it/s]


Epoch 3/10:
Train Loss: 1.6281 | Val Loss: 1.5839


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.66it/s]


Epoch 4/10:
Train Loss: 1.5069 | Val Loss: 1.4878


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.67it/s]


Epoch 5/10:
Train Loss: 1.4093 | Val Loss: 1.4644


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 6/10:
Train Loss: 1.3247 | Val Loss: 1.3485


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 7/10:
Train Loss: 1.2310 | Val Loss: 1.2596


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.66it/s]


Epoch 8/10:
Train Loss: 1.1509 | Val Loss: 1.2421


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.66it/s]


Epoch 9/10:
Train Loss: 1.0819 | Val Loss: 1.1608


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.66it/s]
[I 2025-05-10 23:57:22,553] Trial 73 finished with value: 1.1435057322184246 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.1402685276647142, 'learning_rate': 0.00021334665832447436, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 1.0159 | Val Loss: 1.1435


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 1/10:
Train Loss: 2.2127 | Val Loss: 1.8874


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 2/10:
Train Loss: 1.7590 | Val Loss: 1.6811


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 3/10:
Train Loss: 1.5707 | Val Loss: 1.5231


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 4/10:
Train Loss: 1.4233 | Val Loss: 1.3940


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 5/10:
Train Loss: 1.3087 | Val Loss: 1.3055


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 6/10:
Train Loss: 1.2075 | Val Loss: 1.2286


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.64it/s]


Epoch 7/10:
Train Loss: 1.1147 | Val Loss: 1.1710


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


Epoch 8/10:
Train Loss: 1.0335 | Val Loss: 1.1488


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


Epoch 9/10:
Train Loss: 0.9531 | Val Loss: 1.0612


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.65it/s]
[I 2025-05-11 00:13:45,414] Trial 74 finished with value: 1.031551186054472 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10014222171338188, 'learning_rate': 0.00016959363335048995, 'batch_size': 32}. Best is trial 44 with value: 1.0111583121239194.


Epoch 10/10:
Train Loss: 0.8836 | Val Loss: 1.0316


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.81it/s]


Epoch 1/10:
Train Loss: 2.2345 | Val Loss: 1.9112


Training: 100%|██████████| 250/250 [01:29<00:00,  2.79it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.80it/s]


Epoch 2/10:
Train Loss: 1.7859 | Val Loss: 1.6401


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 3/10:
Train Loss: 1.5742 | Val Loss: 1.5057


Training: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 4/10:
Train Loss: 1.4154 | Val Loss: 1.3687


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 5/10:
Train Loss: 1.2885 | Val Loss: 1.2529


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.58it/s]


Epoch 6/10:
Train Loss: 1.1833 | Val Loss: 1.1735


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 7/10:
Train Loss: 1.0942 | Val Loss: 1.1268


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 8/10:
Train Loss: 1.0211 | Val Loss: 1.0676


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.59it/s]


Epoch 9/10:
Train Loss: 0.9505 | Val Loss: 1.0293


Training: 100%|██████████| 250/250 [01:30<00:00,  2.78it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.60it/s]


Epoch 10/10:
Train Loss: 0.8857 | Val Loss: 0.9889


[I 2025-05-11 00:30:07,984] Trial 75 finished with value: 0.9889051308707585 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10172991088923825, 'learning_rate': 0.00010750718938195616, 'batch_size': 32}. Best is trial 75 with value: 0.9889051308707585.


New best model found! Val Loss: 0.9889
Config: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10172991088923825, 'learning_rate': 0.00010750718938195616, 'batch_size': 32}


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 1/10:
Train Loss: 2.2561 | Val Loss: 1.9609


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8313 | Val Loss: 1.6794


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6038 | Val Loss: 1.5593


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 4/10:
Train Loss: 1.4454 | Val Loss: 1.4395


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3075 | Val Loss: 1.3108


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.1970 | Val Loss: 1.2015


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 7/10:
Train Loss: 1.0922 | Val Loss: 1.1253


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0064 | Val Loss: 1.0677


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 9/10:
Train Loss: 0.9287 | Val Loss: 1.0371


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]
[I 2025-05-11 00:52:33,769] Trial 76 finished with value: 1.0083668582023135 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10024713491817741, 'learning_rate': 0.00010463828154341484, 'batch_size': 32}. Best is trial 75 with value: 0.9889051308707585.


Epoch 10/10:
Train Loss: 0.8502 | Val Loss: 1.0084


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2501 | Val Loss: 1.9776


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8504 | Val Loss: 1.7235


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6189 | Val Loss: 1.5648


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 4/10:
Train Loss: 1.4524 | Val Loss: 1.4032


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 5/10:
Train Loss: 1.3218 | Val Loss: 1.2872


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 6/10:
Train Loss: 1.2080 | Val Loss: 1.2377


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.1165 | Val Loss: 1.1375


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0272 | Val Loss: 1.0764


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 9/10:
Train Loss: 0.9470 | Val Loss: 1.0342


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]
[I 2025-05-11 01:14:59,642] Trial 77 finished with value: 1.0093017305646623 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10717516212704956, 'learning_rate': 0.00010013126042674718, 'batch_size': 32}. Best is trial 75 with value: 0.9889051308707585.


Epoch 10/10:
Train Loss: 0.8721 | Val Loss: 1.0093


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2545 | Val Loss: 1.9678


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 2/10:
Train Loss: 1.8380 | Val Loss: 1.7052


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 3/10:
Train Loss: 1.6050 | Val Loss: 1.5452


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 4/10:
Train Loss: 1.4449 | Val Loss: 1.3919


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3097 | Val Loss: 1.3137


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.1955 | Val Loss: 1.2196


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.0971 | Val Loss: 1.1429


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 8/10:
Train Loss: 1.0061 | Val Loss: 1.1070


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 9/10:
Train Loss: 0.9228 | Val Loss: 1.0387


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]
[I 2025-05-11 01:37:25,427] Trial 78 finished with value: 0.9899048634937831 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10037547461365641, 'learning_rate': 0.00010758768594852277, 'batch_size': 32}. Best is trial 75 with value: 0.9889051308707585.


Epoch 10/10:
Train Loss: 0.8432 | Val Loss: 0.9899


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 1/10:
Train Loss: 2.5737 | Val Loss: 2.2377


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 2/10:
Train Loss: 2.1574 | Val Loss: 2.0595


Training: 100%|██████████| 250/250 [01:02<00:00,  3.98it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.28it/s]


Epoch 3/10:
Train Loss: 2.0403 | Val Loss: 1.9670


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 4/10:
Train Loss: 1.9495 | Val Loss: 1.8648


Training: 100%|██████████| 250/250 [01:03<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.29it/s]


Epoch 5/10:
Train Loss: 1.8613 | Val Loss: 1.8113


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.25it/s]


Epoch 6/10:
Train Loss: 1.7866 | Val Loss: 1.7229


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.25it/s]


Epoch 7/10:
Train Loss: 1.7266 | Val Loss: 1.6720


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 8/10:
Train Loss: 1.6739 | Val Loss: 1.6275


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 9/10:
Train Loss: 1.6230 | Val Loss: 1.5618


Training: 100%|██████████| 250/250 [01:03<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]
[I 2025-05-11 01:48:56,398] Trial 79 finished with value: 1.5618364167591883 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10704225112311543, 'learning_rate': 0.00010955862427922805, 'batch_size': 32}. Best is trial 75 with value: 0.9889051308707585.


Epoch 10/10:
Train Loss: 1.5776 | Val Loss: 1.5804


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.71it/s]


Epoch 1/10:
Train Loss: 3.0647 | Val Loss: 2.9783


Training: 100%|██████████| 250/250 [02:02<00:00,  2.04it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.72it/s]


Epoch 2/10:
Train Loss: 2.9822 | Val Loss: 2.9745


Training: 100%|██████████| 250/250 [02:02<00:00,  2.04it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.72it/s]


Epoch 3/10:
Train Loss: 2.9801 | Val Loss: 2.9828


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.72it/s]


Epoch 4/10:
Train Loss: 2.9778 | Val Loss: 3.0270


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.70it/s]
[I 2025-05-11 02:00:06,370] Trial 80 finished with value: 2.9745038388267395 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11906019694558616, 'learning_rate': 0.009858439051025932, 'batch_size': 32}. Best is trial 75 with value: 0.9889051308707585.


Epoch 5/10:
Train Loss: 2.9760 | Val Loss: 3.0823
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 1/10:
Train Loss: 2.2457 | Val Loss: 1.9587


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 2/10:
Train Loss: 1.8377 | Val Loss: 1.7126


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 3/10:
Train Loss: 1.6096 | Val Loss: 1.5533


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4466 | Val Loss: 1.3894


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 5/10:
Train Loss: 1.3127 | Val Loss: 1.3003


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 6/10:
Train Loss: 1.2009 | Val Loss: 1.2210


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 7/10:
Train Loss: 1.0995 | Val Loss: 1.1330


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0064 | Val Loss: 1.0839


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9230 | Val Loss: 1.0273


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 10/10:
Train Loss: 0.8459 | Val Loss: 0.9886


[I 2025-05-11 02:22:32,985] Trial 81 finished with value: 0.9885849157969157 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10133278431891972, 'learning_rate': 0.0001029793648832552, 'batch_size': 32}. Best is trial 81 with value: 0.9885849157969157.


New best model found! Val Loss: 0.9886
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10133278431891972, 'learning_rate': 0.0001029793648832552, 'batch_size': 32}


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 1/10:
Train Loss: 2.2540 | Val Loss: 1.9724


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 2/10:
Train Loss: 1.8436 | Val Loss: 1.7119


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 3/10:
Train Loss: 1.6217 | Val Loss: 1.5523


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4695 | Val Loss: 1.4070


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 5/10:
Train Loss: 1.3310 | Val Loss: 1.3033


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.2211 | Val Loss: 1.2111


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.1222 | Val Loss: 1.1591


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 8/10:
Train Loss: 1.0345 | Val Loss: 1.1012


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9504 | Val Loss: 1.0519


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]
[I 2025-05-11 02:44:58,408] Trial 82 finished with value: 1.0204869612814889 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.1066487787785011, 'learning_rate': 0.00010097962539964846, 'batch_size': 32}. Best is trial 81 with value: 0.9885849157969157.


Epoch 10/10:
Train Loss: 0.8766 | Val Loss: 1.0205


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2395 | Val Loss: 1.9488


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 2/10:
Train Loss: 1.8247 | Val Loss: 1.7129


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 3/10:
Train Loss: 1.6075 | Val Loss: 1.5206


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4456 | Val Loss: 1.3952


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3134 | Val Loss: 1.2922


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 6/10:
Train Loss: 1.1983 | Val Loss: 1.2118


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.0980 | Val Loss: 1.1396


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0077 | Val Loss: 1.0690


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 9/10:
Train Loss: 0.9225 | Val Loss: 1.0323


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]
[I 2025-05-11 03:07:23,567] Trial 83 finished with value: 0.9983736456386627 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10028891485290928, 'learning_rate': 0.0001082722045238341, 'batch_size': 32}. Best is trial 81 with value: 0.9885849157969157.


Epoch 10/10:
Train Loss: 0.8482 | Val Loss: 0.9984


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.75it/s]


Epoch 1/10:
Train Loss: 2.2923 | Val Loss: 1.9921


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.76it/s]


Epoch 2/10:
Train Loss: 1.8481 | Val Loss: 1.7319


Training: 100%|██████████| 250/250 [03:03<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.76it/s]


Epoch 3/10:
Train Loss: 1.6508 | Val Loss: 1.5837


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.75it/s]


Epoch 4/10:
Train Loss: 1.5108 | Val Loss: 1.4778


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.75it/s]


Epoch 5/10:
Train Loss: 1.3946 | Val Loss: 1.3940


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.76it/s]


Epoch 6/10:
Train Loss: 1.2923 | Val Loss: 1.3470


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.75it/s]


Epoch 7/10:
Train Loss: 1.2002 | Val Loss: 1.2759


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.76it/s]


Epoch 8/10:
Train Loss: 1.1137 | Val Loss: 1.2262


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.76it/s]


Epoch 9/10:
Train Loss: 1.0269 | Val Loss: 1.2080


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.75it/s]
[I 2025-05-11 03:40:41,584] Trial 84 finished with value: 1.1427406564591422 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.1089003797258948, 'learning_rate': 0.00011264818399705793, 'batch_size': 32}. Best is trial 81 with value: 0.9885849157969157.


Epoch 10/10:
Train Loss: 0.9379 | Val Loss: 1.1427


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2579 | Val Loss: 1.9554


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 2/10:
Train Loss: 1.8379 | Val Loss: 1.7124


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 3/10:
Train Loss: 1.6166 | Val Loss: 1.5728


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4592 | Val Loss: 1.4258


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3244 | Val Loss: 1.2982


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.2059 | Val Loss: 1.2272


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.1078 | Val Loss: 1.1335


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0158 | Val Loss: 1.0763


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9334 | Val Loss: 1.0360


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 10/10:
Train Loss: 0.8540 | Val Loss: 0.9850


[I 2025-05-11 04:03:07,055] Trial 85 finished with value: 0.9850206346738906 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.1018942343827998, 'learning_rate': 0.00011040693316443509, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


New best model found! Val Loss: 0.9850
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.1018942343827998, 'learning_rate': 0.00011040693316443509, 'batch_size': 32}


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2467 | Val Loss: 1.9957


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 2/10:
Train Loss: 1.8365 | Val Loss: 1.7012


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6086 | Val Loss: 1.5244


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4453 | Val Loss: 1.3965


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3126 | Val Loss: 1.2949


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 6/10:
Train Loss: 1.1994 | Val Loss: 1.1938


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 7/10:
Train Loss: 1.0975 | Val Loss: 1.1278


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0075 | Val Loss: 1.0922


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 9/10:
Train Loss: 0.9272 | Val Loss: 1.0341


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]
[I 2025-05-11 04:25:31,975] Trial 86 finished with value: 0.9984960631718711 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10058826150467526, 'learning_rate': 0.00010104101568522308, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.8505 | Val Loss: 0.9985


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2491 | Val Loss: 2.0343


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8421 | Val Loss: 1.6977


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 3/10:
Train Loss: 1.6138 | Val Loss: 1.5625


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4649 | Val Loss: 1.4119


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3340 | Val Loss: 1.3123


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 6/10:
Train Loss: 1.2215 | Val Loss: 1.2318


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 7/10:
Train Loss: 1.1174 | Val Loss: 1.1465


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0274 | Val Loss: 1.1027


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 9/10:
Train Loss: 0.9423 | Val Loss: 1.0491


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]
[I 2025-05-11 04:47:57,202] Trial 87 finished with value: 1.0110677187404935 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10086025888125509, 'learning_rate': 0.00010099182445663282, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.8666 | Val Loss: 1.0111


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2541 | Val Loss: 2.0246


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8366 | Val Loss: 1.7188


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6152 | Val Loss: 1.5446


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4583 | Val Loss: 1.3997


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3281 | Val Loss: 1.3032


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 6/10:
Train Loss: 1.2159 | Val Loss: 1.2344


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 7/10:
Train Loss: 1.1198 | Val Loss: 1.1595


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 8/10:
Train Loss: 1.0327 | Val Loss: 1.1041


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 9/10:
Train Loss: 0.9494 | Val Loss: 1.0637


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]
[I 2025-05-11 05:10:22,187] Trial 88 finished with value: 1.016906996568044 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10036782292901211, 'learning_rate': 0.00010313994866251683, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.8712 | Val Loss: 1.0169


Training: 100%|██████████| 250/250 [01:02<00:00,  3.98it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.24it/s]


Epoch 1/10:
Train Loss: 2.5402 | Val Loss: 2.1954


Training: 100%|██████████| 250/250 [01:02<00:00,  3.98it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.21it/s]


Epoch 2/10:
Train Loss: 2.1271 | Val Loss: 2.0175


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 3/10:
Train Loss: 2.0022 | Val Loss: 1.9561


Training: 100%|██████████| 250/250 [01:02<00:00,  3.98it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.24it/s]


Epoch 4/10:
Train Loss: 1.9045 | Val Loss: 1.8066


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 5/10:
Train Loss: 1.8129 | Val Loss: 1.7560


Training: 100%|██████████| 250/250 [01:02<00:00,  3.98it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.27it/s]


Epoch 6/10:
Train Loss: 1.7434 | Val Loss: 1.6844


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.28it/s]


Epoch 7/10:
Train Loss: 1.6829 | Val Loss: 1.6182


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.22it/s]


Epoch 8/10:
Train Loss: 1.6276 | Val Loss: 1.5820


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.23it/s]


Epoch 9/10:
Train Loss: 1.5799 | Val Loss: 1.5374


Training: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00, 10.24it/s]
[I 2025-05-11 05:21:52,936] Trial 89 finished with value: 1.4973983859258986 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10760164988649845, 'learning_rate': 0.0001291830458695414, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 1.5363 | Val Loss: 1.4974


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.3623 | Val Loss: 2.1241


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 2.0401 | Val Loss: 1.9609


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.8764 | Val Loss: 1.8872


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.7610 | Val Loss: 1.7877


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 5/10:
Train Loss: 1.6739 | Val Loss: 1.7899


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.5907 | Val Loss: 1.7104


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.5196 | Val Loss: 1.5830


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.4572 | Val Loss: 1.5952


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 1.4058 | Val Loss: 1.5012


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]
[I 2025-05-11 05:44:17,970] Trial 90 finished with value: 1.3828899065653484 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.35748623112521966, 'learning_rate': 0.00011201675102344373, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 1.3569 | Val Loss: 1.3829


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2478 | Val Loss: 1.9881


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 2/10:
Train Loss: 1.8495 | Val Loss: 1.7551


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 3/10:
Train Loss: 1.6279 | Val Loss: 1.5685


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4695 | Val Loss: 1.4629


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 5/10:
Train Loss: 1.3418 | Val Loss: 1.3456


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 6/10:
Train Loss: 1.2296 | Val Loss: 1.2370


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 7/10:
Train Loss: 1.1323 | Val Loss: 1.1614


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 8/10:
Train Loss: 1.0426 | Val Loss: 1.0929


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9600 | Val Loss: 1.0658


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]
[I 2025-05-11 06:06:42,958] Trial 91 finished with value: 1.024545425460452 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10079312320126374, 'learning_rate': 0.00010049332311301042, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.8860 | Val Loss: 1.0245


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2437 | Val Loss: 1.9287


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8185 | Val Loss: 1.7084


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6075 | Val Loss: 1.5493


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4531 | Val Loss: 1.3984


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3266 | Val Loss: 1.3503


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 6/10:
Train Loss: 1.2210 | Val Loss: 1.2633


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.1273 | Val Loss: 1.2117


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0367 | Val Loss: 1.1030


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9588 | Val Loss: 1.1045


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]
[I 2025-05-11 06:29:06,961] Trial 92 finished with value: 1.037025012667217 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.12362479859212677, 'learning_rate': 0.0001324146950120689, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.8828 | Val Loss: 1.0370


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2415 | Val Loss: 1.9801


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8298 | Val Loss: 1.7383


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6190 | Val Loss: 1.5415


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4646 | Val Loss: 1.4244


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3404 | Val Loss: 1.3363


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.58it/s]


Epoch 6/10:
Train Loss: 1.2358 | Val Loss: 1.2735


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.1357 | Val Loss: 1.1934


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0479 | Val Loss: 1.1168


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9636 | Val Loss: 1.0707


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]
[I 2025-05-11 06:51:31,008] Trial 93 finished with value: 1.0374438346378387 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10780854519803312, 'learning_rate': 0.00011801779973905256, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.8855 | Val Loss: 1.0374


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 1/10:
Train Loss: 2.2731 | Val Loss: 1.9738


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8612 | Val Loss: 1.7498


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6619 | Val Loss: 1.6338


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.5123 | Val Loss: 1.4674


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 5/10:
Train Loss: 1.4019 | Val Loss: 1.3701


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.2998 | Val Loss: 1.2942


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.2103 | Val Loss: 1.2305


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.1326 | Val Loss: 1.1787


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 1.0584 | Val Loss: 1.1174


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]
[I 2025-05-11 07:13:55,535] Trial 94 finished with value: 1.0841108361879985 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.14224983704926097, 'learning_rate': 0.00011247276994659905, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.9851 | Val Loss: 1.0841


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2542 | Val Loss: 1.9642


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8388 | Val Loss: 1.7145


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 3/10:
Train Loss: 1.6243 | Val Loss: 1.5288


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4692 | Val Loss: 1.4279


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 5/10:
Train Loss: 1.3503 | Val Loss: 1.3335


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 6/10:
Train Loss: 1.2467 | Val Loss: 1.3088


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 7/10:
Train Loss: 1.1506 | Val Loss: 1.2161


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0651 | Val Loss: 1.1893


Training: 100%|██████████| 250/250 [02:02<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9824 | Val Loss: 1.0810


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]
[I 2025-05-11 07:36:19,324] Trial 95 finished with value: 1.0389853015778556 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.12202106464227555, 'learning_rate': 0.00013642013789512706, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.9076 | Val Loss: 1.0390


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 1/10:
Train Loss: 2.2360 | Val Loss: 1.9359


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 2/10:
Train Loss: 1.8090 | Val Loss: 1.6964


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]


Epoch 3/10:
Train Loss: 1.5973 | Val Loss: 1.5665


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 4/10:
Train Loss: 1.4443 | Val Loss: 1.4109


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 5/10:
Train Loss: 1.3112 | Val Loss: 1.3024


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 6/10:
Train Loss: 1.1945 | Val Loss: 1.1998


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 7/10:
Train Loss: 1.0944 | Val Loss: 1.1612


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 8/10:
Train Loss: 1.0027 | Val Loss: 1.0820


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.60it/s]


Epoch 9/10:
Train Loss: 0.9243 | Val Loss: 1.0290


Training: 100%|██████████| 250/250 [02:03<00:00,  2.03it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.59it/s]
[I 2025-05-11 07:58:43,888] Trial 96 finished with value: 1.0120134959145197 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.10805216681253191, 'learning_rate': 0.000123098037503769, 'batch_size': 32}. Best is trial 85 with value: 0.9850206346738906.


Epoch 10/10:
Train Loss: 0.8479 | Val Loss: 1.0120


Training: 100%|██████████| 250/250 [03:02<00:00,  1.37it/s]
Evaluating: 100%|██████████| 63/63 [00:16<00:00,  3.75it/s]


Epoch 1/10:
Train Loss: 2.3072 | Val Loss: 1.9760


Training:  60%|██████    | 151/250 [01:50<01:12,  1.37it/s]