In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Seed the stdlib and PyTorch generators so repeated runs draw the same numbers
random.seed(42)
torch.manual_seed(42)

# Run on the GPU whenever CUDA is visible, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Data Preparation

def load_data(file_path, max_output_len=200, sample_size=10000, random_state=42):
    """Read (input, output) text pairs from an Excel workbook and subsample them.

    Args:
        file_path: Workbook passed to ``pd.read_excel``; it must provide the
            columns 'input' and 'output'.
        max_output_len: Rows whose 'output' string has more characters than
            this are discarded. Rows with a missing 'output' are discarded
            too, because their length is NaN and the comparison is False.
        sample_size: Maximum number of rows to keep. When more rows survive
            the length filter, exactly this many are drawn at random.
            ``None`` disables subsampling.
        random_state: Seed for the random draw, so the same rows are chosen
            on every run.

    Returns:
        A tuple ``(inputs, outputs)`` of two equally long lists holding the
        values of the 'input' and 'output' columns.
    """
    df = pd.read_excel(file_path)

    # Keep only rows whose target text is at most max_output_len characters
    df = df[df['output'].str.len() <= max_output_len]

    # Cap the dataset size; the fixed seed keeps the subset reproducible
    if sample_size is not None and len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=random_state)

    return df['input'].tolist(), df['output'].tolist()

# Tokenization and Vocabulary
class Vocabulary:
    """Character-level vocabulary: four special tokens followed by string.printable."""

    def __init__(self):
        # Ids of the special tokens; they mirror the token order in _build_vocab
        self.pad_token, self.sos_token, self.eos_token, self.unk_token = 0, 1, 2, 3
        self.char2idx = {}
        self.idx2char = {}
        self._build_vocab()

    def _build_vocab(self):
        """Fill char2idx / idx2char: special tokens first, then every printable character."""
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + list(string.printable)

        forward = {}
        for position, symbol in enumerate(symbols):
            forward[symbol] = position

        self.char2idx = forward
        self.idx2char = {position: symbol for symbol, position in forward.items()}

    def __len__(self):
        """Number of entries in the vocabulary (special tokens included)."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of `text` to its id; unknown characters become unk_token."""
        table = self.char2idx
        fallback = self.unk_token
        return [table[ch] if ch in table else fallback for ch in text]

    def decode(self, indices):
        """Turn ids back into text, dropping PAD/SOS/EOS and writing '<UNK>' for unmapped ids."""
        skipped = frozenset((self.pad_token, self.sos_token, self.eos_token))
        pieces = []
        for idx in indices:
            if idx in skipped:
                continue
            pieces.append(self.idx2char.get(idx, '<UNK>'))
        return ''.join(pieces)

# Dataset Class
class CipherDataset(data.Dataset):
    """Pairs of (input text, output text) encoded as fixed-length id tensors.

    Every sample is encoded as ``<SOS> characters <EOS>`` and right-padded with
    the pad token up to ``max_length`` ids.

    Args:
        inputs: Sequence of source texts (each item is passed through ``str``).
        outputs: Sequence of target texts, aligned index-by-index with ``inputs``.
        vocab: Object providing ``encode(text)`` plus the attributes
            ``sos_token``, ``eos_token`` and ``pad_token``.
        max_length: Length of every returned tensor.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def _encode_fixed(self, text):
        """Encode `text` into exactly max_length ids, always keeping the EOS marker."""
        # Reserve two slots for SOS/EOS and cut the characters *before* adding
        # them; truncating afterwards would chop the EOS off over-long texts.
        room = max(self.max_length - 2, 0)
        body = self.vocab.encode(text)[:room]
        tokens = [self.vocab.sos_token] + body + [self.vocab.eos_token]
        tokens += [self.vocab.pad_token] * (self.max_length - len(tokens))
        # Only relevant for max_length < 2, where SOS/EOS alone do not fit
        return tokens[:self.max_length]

    def __getitem__(self, idx):
        """Return ``(input_ids, output_ids)`` as two int64 tensors of length max_length."""
        input_ids = self._encode_fixed(str(self.inputs[idx]))
        output_ids = self._encode_fixed(str(self.outputs[idx]))
        return (torch.tensor(input_ids, dtype=torch.long),
                torch.tensor(output_ids, dtype=torch.long))

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V and output projections."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        # Width of one attention head
        self.d_k = d_model // num_heads

        # Query, key, value and output projections (all d_model -> d_model)
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Softmax(Q K^T / sqrt(d_k)) V; positions where `mask` equals 0 are suppressed."""
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(self.d_k)
        if mask is not None:
            # A large negative score gives (numerically) zero weight after the softmax
            scores = scores.masked_fill(mask.eq(0), -1e9)
        weights = scores.softmax(dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
        batch, _, length, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch, length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, merge the heads and project the result."""
        q_heads = self.split_heads(self.W_q(Q))
        k_heads = self.split_heads(self.W_k(K))
        v_heads = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
        return self.W_o(self.combine_heads(context))

class PositionWiseFeedForward(nn.Module):
    """Two linear layers with a ReLU in between: d_model -> d_ff -> d_model."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expansion
        self.fc2 = nn.Linear(d_ff, d_model)   # projection back to the model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply fc1, ReLU and fc2 in sequence."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    Args:
        d_model: Embedding width. Even and odd widths are both supported.
        max_seq_length: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Even columns hold the sines, odd columns the cosines
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine columns,
        # so the frequency vector is cut to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Registered as a buffer: moves with the module but is not a trainable parameter
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return `x` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each with residual + LayerNorm."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def _add_and_norm(self, norm, residual, sublayer_out):
        """Residual connection around a sub-layer output followed by `norm`."""
        return norm(residual + self.dropout(sublayer_out))

    def forward(self, x, mask):
        """Run self-attention (restricted by `mask`) and then the feed-forward network."""
        attended = self.self_attn(x, x, x, mask)
        x = self._add_and_norm(self.norm1, x, attended)

        transformed = self.feed_forward(x)
        return self._add_and_norm(self.norm2, x, transformed)

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward, each with residual + LayerNorm."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def _add_and_norm(self, norm, residual, sublayer_out):
        """Residual connection around a sub-layer output followed by `norm`."""
        return norm(residual + self.dropout(sublayer_out))

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Attend over the target (tgt_mask), then over the encoder output (src_mask), then feed-forward."""
        own_context = self.self_attn(x, x, x, tgt_mask)
        x = self._add_and_norm(self.norm1, x, own_context)

        # Queries come from the decoder, keys and values from the encoder
        memory_context = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self._add_and_norm(self.norm2, x, memory_context)

        transformed = self.feed_forward(x)
        return self._add_and_norm(self.norm3, x, transformed)

class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows of the encoder embedding table.
        tgt_vocab_size: Number of rows of the decoder embedding table and
            width of the output logits.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per attention module.
        num_layers: Number of encoder layers and, equally, of decoder layers.
        d_ff: Hidden width of the position-wise feed-forward networks.
        max_seq_length: Number of positions covered by the positional encoding.
        dropout: Dropout probability used on the embeddings and inside the layers.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the attention masks for a batch.

        Token id 0 is treated as padding.

        Returns:
            src_mask: bool tensor (batch, 1, 1, src_len), True at non-padding
                source positions.
            tgt_mask: bool tensor (batch, 1, tgt_len, tgt_len), True where a
                target position may attend: the key is not padding and does
                not lie in the future.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        # Build the causal mask on the same device as the inputs rather than on
        # a notebook-level global, so the model works wherever its inputs live.
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size) for id tensors `src` and `tgt`."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over `train_loader` and return the mean batch loss.

    The decoder is fed the target without its last token and is scored against
    the target without its first token (teacher forcing). Gradients are clipped
    to a total norm of 1.0 before every optimizer step.
    """
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc="Training")
    for src, tgt in progress:
        src = src.to(device)
        tgt = tgt.to(device)
        decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        flat_logits = logits.contiguous().view(-1, logits.size(-1))
        flat_expected = expected.contiguous().view(-1)
        loss = criterion(flat_logits, flat_expected)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Return the mean batch loss over `val_loader`, computed without gradient tracking."""
    model.eval()
    running_loss = 0
    with torch.no_grad():
        progress = tqdm(val_loader, desc="Evaluating")
        for src, tgt in progress:
            src = src.to(device)
            tgt = tgt.to(device)
            # Same shifted input/target pairing as during training
            logits = model(src, tgt[:, :-1])
            flat_logits = logits.contiguous().view(-1, logits.size(-1))
            flat_expected = tgt[:, 1:].contiguous().view(-1)
            running_loss += criterion(flat_logits, flat_expected).item()
    return running_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Fraction of non-padding target tokens the model predicts correctly.

    Predictions are made with the true previous tokens as decoder input.
    Returns 0 when the loader yields no non-padding target token.
    """
    model.eval()
    hits = 0
    counted = 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            labels = tgt[:, 1:]
            guesses = model(src, tgt[:, :-1]).argmax(dim=-1)
            # Padding positions are excluded from numerator and denominator
            is_real = labels != vocab.pad_token
            hits += (guesses.eq(labels) & is_real).sum().item()
            counted += is_real.sum().item()
    if counted > 0:
        return hits / counted
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train with early stopping and leave `model` at its best validation weights.

    Args:
        model: Network to optimise in place.
        train_loader: Batches used by ``train_epoch``.
        val_loader: Batches used by ``evaluate``.
        optimizer: Optimizer stepping the parameters of `model`.
        criterion: Loss function forwarded to ``train_epoch`` / ``evaluate``.
        scheduler: Learning-rate scheduler stepped with the validation loss
            after every epoch.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Stop once this many consecutive epochs failed to improve the
            best validation loss.

    Returns:
        The lowest validation loss seen. On return `model` holds the weights
        of that epoch, so anything saved afterwards matches the returned
        value (previously the weights of the last, possibly worse, epoch
        were left in place).
    """
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Snapshot now: later epochs overwrite the parameters in place
            best_state = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print("Early stopping triggered!")
                break

    # Roll back to the best epoch (nothing to restore if no epoch improved)
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss  # Return the best validation loss from this training run

# Global variables to track best model across all trials
# (all three are rebound by objective() whenever a trial beats best_overall_loss)
best_overall_model = None  # deep copy of the best trial's model.state_dict()
best_overall_loss = float('inf')  # lowest validation loss returned by any trial so far
best_config = None  # hyperparameter dict of the trial that produced best_overall_loss

# Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one Transformer with sampled hyperparameters.

    Reads the notebook-level `train_dataset`, `val_dataset`, `vocab`,
    `max_length` and `device`. Whenever the trial beats the best loss seen so
    far, the global trackers are updated and the weights are written to disk.

    Returns:
        The validation loss reported by ``train_model`` for this trial.
    """
    global best_overall_model, best_overall_loss, best_config

    # Sample the hyperparameters of this trial
    config = {}
    config["d_model"] = trial.suggest_categorical("d_model", [128, 256, 512])
    config["num_heads"] = trial.suggest_categorical("num_heads", [2, 4, 8, 16])
    config["num_layers"] = trial.suggest_categorical("num_layers", [6, 8, 10, 12])
    config["d_ff"] = trial.suggest_categorical("d_ff", [256, 512, 1024])
    config["dropout"] = trial.suggest_float("dropout", 0.1, 0.4)
    config["learning_rate"] = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    config["batch_size"] = trial.suggest_categorical("batch_size", [128])

    # Loaders use the sampled batch size; only the training data is shuffled
    batch_size = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

    # Source and target share one vocabulary
    vocab_size = len(vocab)
    model = Transformer(
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        d_ff=config["d_ff"],
        max_seq_length=max_length,
        dropout=config["dropout"]
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    # Train and collect the best validation loss of this configuration
    current_val_loss = train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10)

    # Nothing more to do unless this trial is the best one so far
    if not (current_val_loss < best_overall_loss):
        return current_val_loss

    best_overall_loss = current_val_loss
    best_overall_model = copy.deepcopy(model.state_dict())
    best_config = config
    torch.save(best_overall_model, '/content/drive/MyDrive/best_vig_key_1.pth')
    print(f"New best model found! Val Loss: {current_val_loss:.4f}")
    print(f"Config: {config}")

    return current_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for a single input text.

    Args:
        model: Callable ``model(src_ids, tgt_ids)`` returning logits of shape
            (1, tgt_len, vocab_size); it is switched to eval mode.
        text: Input to translate (converted with ``str``).
        vocab: Provides ``encode``, ``decode`` and the special-token ids.
        max_length: Length the source is padded/truncated to; also bounds the
            number of generated tokens (at most ``max_length - 1``).
        device: Device on which the tensors are created.

    Returns:
        The decoded string. Generation stops at the first EOS prediction or
        when the length limit is reached.
    """
    model.eval()
    with torch.no_grad():
        # Cut the characters before adding SOS/EOS so that an over-long input
        # still ends with the EOS marker instead of losing it to truncation.
        room = max(max_length - 2, 0)
        body = vocab.encode(str(text))[:room]
        encoded = [vocab.sos_token] + body + [vocab.eos_token]
        encoded = encoded + [vocab.pad_token] * (max_length - len(encoded))
        encoded = torch.tensor(encoded[:max_length], dtype=torch.long, device=device).unsqueeze(0)

        # The decoder input starts with SOS and grows by one token per step
        target = torch.tensor([[vocab.sos_token]], dtype=torch.long, device=device)

        for _ in range(max_length - 1):
            output = model(encoded, target)
            # Greedy choice: most likely token at the last position
            next_token = output.argmax(2)[:, -1].item()
            if next_token == vocab.eos_token:
                break
            step = torch.tensor([[next_token]], dtype=torch.long, device=device)
            target = torch.cat([target, step], dim=1)

        # Plain Python ints are enough for decoding; no numpy round-trip needed
        decrypted = vocab.decode(target[0].tolist())
        return decrypted

# Main Execution
if __name__ == "__main__":
    # Load and prepare data
    inputs, outputs = load_data('/content/Full_training_vigenere_one.xlsx')
    vocab = Vocabulary()
    max_length = 256  # Adjusted for longer sentences

    # Split data (80% train / 20% validation, fixed seed)
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets (objective() reads these names as globals)
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)  # 100 trials, no time limit

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Rebuild the architecture of the best trial and load its saved weights
    final_model = Transformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    # map_location lets a checkpoint written on a GPU be loaded on whatever
    # device is in use now (e.g. a CPU-only runtime)
    final_model.load_state_dict(
        torch.load('/content/drive/MyDrive/best_vig_key_1.pth', map_location=device)
    )

    # Evaluate on full datasets
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption on a few hand-written (prompt, expected plaintext) pairs
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher:xibu", "what")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cuda


[I 2025-05-09 14:45:03,623] A new study created in memory with name: no-name-74add0c7-8c89-4e2e-936f-64365e594515
Training: 100%|██████████| 63/63 [00:42<00:00,  1.49it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.51it/s]


Epoch 1/10:
Train Loss: 3.1002 | Val Loss: 2.9798


Training: 100%|██████████| 63/63 [00:41<00:00,  1.51it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.52it/s]


Epoch 2/10:
Train Loss: 2.9896 | Val Loss: 2.9869


Training: 100%|██████████| 63/63 [00:41<00:00,  1.51it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.47it/s]


Epoch 3/10:
Train Loss: 2.9840 | Val Loss: 2.9770


Training: 100%|██████████| 63/63 [00:41<00:00,  1.50it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.46it/s]


Epoch 4/10:
Train Loss: 2.9796 | Val Loss: 2.9784


Training: 100%|██████████| 63/63 [00:41<00:00,  1.50it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.47it/s]


Epoch 5/10:
Train Loss: 2.9760 | Val Loss: 3.0357


Training: 100%|██████████| 63/63 [00:41<00:00,  1.50it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.51it/s]


Epoch 6/10:
Train Loss: 2.9731 | Val Loss: 3.1766
Early stopping triggered!


[I 2025-05-09 14:49:39,079] Trial 0 finished with value: 2.976990729570389 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.13409674473584066, 'learning_rate': 0.0019444283106789706, 'batch_size': 128}. Best is trial 0 with value: 2.976990729570389.


New best model found! Val Loss: 2.9770
Config: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.13409674473584066, 'learning_rate': 0.0019444283106789706, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:13<00:00,  4.51it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.32it/s]


Epoch 1/10:
Train Loss: 2.9631 | Val Loss: 2.3783


Training: 100%|██████████| 63/63 [00:13<00:00,  4.52it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.27it/s]


Epoch 2/10:
Train Loss: 2.3117 | Val Loss: 2.1917


Training: 100%|██████████| 63/63 [00:13<00:00,  4.52it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.33it/s]


Epoch 3/10:
Train Loss: 2.1755 | Val Loss: 2.0764


Training: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.26it/s]


Epoch 4/10:
Train Loss: 2.0862 | Val Loss: 1.9857


Training: 100%|██████████| 63/63 [00:13<00:00,  4.51it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.32it/s]


Epoch 5/10:
Train Loss: 2.0164 | Val Loss: 1.9779


Training: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.28it/s]


Epoch 6/10:
Train Loss: 1.9538 | Val Loss: 1.8451


Training: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.35it/s]


Epoch 7/10:
Train Loss: 1.8812 | Val Loss: 1.8137


Training: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.35it/s]


Epoch 8/10:
Train Loss: 1.8072 | Val Loss: 1.6728


Training: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.83it/s]


Epoch 9/10:
Train Loss: 1.7283 | Val Loss: 1.6239


Training: 100%|██████████| 63/63 [00:13<00:00,  4.51it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.33it/s]
[I 2025-05-09 14:52:12,908] Trial 1 finished with value: 1.6239424496889114 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.2895921460792136, 'learning_rate': 0.0004207319115056254, 'batch_size': 128}. Best is trial 1 with value: 1.6239424496889114.


Epoch 10/10:
Train Loss: 1.6662 | Val Loss: 1.6276
New best model found! Val Loss: 1.6239
Config: {'d_model': 128, 'num_heads': 2, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.2895921460792136, 'learning_rate': 0.0004207319115056254, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:11<00:00,  5.30it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.65it/s]


Epoch 1/10:
Train Loss: 2.7383 | Val Loss: 2.2860


Training: 100%|██████████| 63/63 [00:11<00:00,  5.37it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.85it/s]


Epoch 2/10:
Train Loss: 2.2137 | Val Loss: 2.0875


Training: 100%|██████████| 63/63 [00:11<00:00,  5.36it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.85it/s]


Epoch 3/10:
Train Loss: 2.0620 | Val Loss: 1.9670


Training: 100%|██████████| 63/63 [00:11<00:00,  5.36it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.73it/s]


Epoch 4/10:
Train Loss: 1.9556 | Val Loss: 1.8611


Training: 100%|██████████| 63/63 [00:11<00:00,  5.32it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.77it/s]


Epoch 5/10:
Train Loss: 1.8511 | Val Loss: 1.7554


Training: 100%|██████████| 63/63 [00:11<00:00,  5.35it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.85it/s]


Epoch 6/10:
Train Loss: 1.7753 | Val Loss: 1.7067


Training: 100%|██████████| 63/63 [00:11<00:00,  5.27it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.87it/s]


Epoch 7/10:
Train Loss: 1.7313 | Val Loss: 1.6664


Training: 100%|██████████| 63/63 [00:11<00:00,  5.36it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.85it/s]


Epoch 8/10:
Train Loss: 1.6778 | Val Loss: 1.6236


Training: 100%|██████████| 63/63 [00:11<00:00,  5.36it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.81it/s]


Epoch 9/10:
Train Loss: 1.6394 | Val Loss: 1.6000


Training: 100%|██████████| 63/63 [00:11<00:00,  5.34it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.60it/s]
[I 2025-05-09 14:54:22,753] Trial 2 finished with value: 1.5758815258741379 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.23699861617761311, 'learning_rate': 0.0010433207939383284, 'batch_size': 128}. Best is trial 2 with value: 1.5758815258741379.


Epoch 10/10:
Train Loss: 1.6048 | Val Loss: 1.5759
New best model found! Val Loss: 1.5759
Config: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.23699861617761311, 'learning_rate': 0.0010433207939383284, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:41<00:00,  1.52it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.49it/s]


Epoch 1/10:
Train Loss: 3.2106 | Val Loss: 2.9984


Training: 100%|██████████| 63/63 [00:41<00:00,  1.53it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.49it/s]


Epoch 2/10:
Train Loss: 2.9887 | Val Loss: 2.9849


Training: 100%|██████████| 63/63 [00:41<00:00,  1.52it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.50it/s]


Epoch 3/10:
Train Loss: 2.9797 | Val Loss: 2.9871


Training: 100%|██████████| 63/63 [00:41<00:00,  1.52it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.49it/s]


Epoch 4/10:
Train Loss: 2.9785 | Val Loss: 2.9808


Training: 100%|██████████| 63/63 [00:41<00:00,  1.52it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.49it/s]


Epoch 5/10:
Train Loss: 2.9776 | Val Loss: 2.9845


Training: 100%|██████████| 63/63 [00:41<00:00,  1.53it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.50it/s]


Epoch 6/10:
Train Loss: 2.9777 | Val Loss: 2.9822


Training: 100%|██████████| 63/63 [00:41<00:00,  1.52it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.49it/s]
[I 2025-05-09 14:59:37,408] Trial 3 finished with value: 2.980793222784996 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1842968883720874, 'learning_rate': 0.009612120728211737, 'batch_size': 128}. Best is trial 2 with value: 1.5758815258741379.


Epoch 7/10:
Train Loss: 2.9768 | Val Loss: 2.9886
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.50it/s]


Epoch 1/10:
Train Loss: 3.0984 | Val Loss: 2.9777


Training: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.51it/s]


Epoch 2/10:
Train Loss: 2.9895 | Val Loss: 2.9795


Training: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.51it/s]


Epoch 3/10:
Train Loss: 2.9859 | Val Loss: 2.9809


Training: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.51it/s]


Epoch 4/10:
Train Loss: 2.9822 | Val Loss: 2.9758


Training: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.48it/s]


Epoch 5/10:
Train Loss: 2.9622 | Val Loss: 4.3086


Training: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.51it/s]


Epoch 6/10:
Train Loss: 2.9317 | Val Loss: 3.8876


Training: 100%|██████████| 63/63 [00:53<00:00,  1.18it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.51it/s]
[I 2025-05-09 15:06:24,692] Trial 4 finished with value: 2.9758480489254 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.18136720471024553, 'learning_rate': 0.001008286123493498, 'batch_size': 128}. Best is trial 2 with value: 1.5758815258741379.


Epoch 7/10:
Train Loss: 2.9155 | Val Loss: 3.8941
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:12<00:00,  4.86it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.27it/s]


Epoch 1/10:
Train Loss: 3.2949 | Val Loss: 2.7086


Training: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.20it/s]


Epoch 2/10:
Train Loss: 2.6138 | Val Loss: 2.4275


Training: 100%|██████████| 63/63 [00:13<00:00,  4.84it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 3/10:
Train Loss: 2.4130 | Val Loss: 2.3228


Training: 100%|██████████| 63/63 [00:13<00:00,  4.81it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.31it/s]


Epoch 4/10:
Train Loss: 2.3082 | Val Loss: 2.2268


Training: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.31it/s]


Epoch 5/10:
Train Loss: 2.2404 | Val Loss: 2.1684


Training: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.25it/s]


Epoch 6/10:
Train Loss: 2.1940 | Val Loss: 2.1392


Training: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.20it/s]


Epoch 7/10:
Train Loss: 2.1526 | Val Loss: 2.0905


Training: 100%|██████████| 63/63 [00:12<00:00,  4.88it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.28it/s]


Epoch 8/10:
Train Loss: 2.1113 | Val Loss: 2.0566


Training: 100%|██████████| 63/63 [00:12<00:00,  4.87it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.29it/s]


Epoch 9/10:
Train Loss: 2.0753 | Val Loss: 2.0028


Training: 100%|██████████| 63/63 [00:12<00:00,  4.87it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 12.30it/s]
[I 2025-05-09 15:08:47,279] Trial 5 finished with value: 1.9614527598023415 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.35595892124167827, 'learning_rate': 0.00011640559002518998, 'batch_size': 128}. Best is trial 2 with value: 1.5758815258741379.


Epoch 10/10:
Train Loss: 2.0333 | Val Loss: 1.9615


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 1/10:
Train Loss: 2.9208 | Val Loss: 2.3631


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.14it/s]


Epoch 2/10:
Train Loss: 2.2681 | Val Loss: 2.1188


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 3/10:
Train Loss: 2.1017 | Val Loss: 2.0147


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 4/10:
Train Loss: 1.9993 | Val Loss: 1.9022


Training: 100%|██████████| 63/63 [00:59<00:00,  1.06it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 5/10:
Train Loss: 1.8863 | Val Loss: 1.7813


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 6/10:
Train Loss: 1.7760 | Val Loss: 1.7357


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.14it/s]


Epoch 7/10:
Train Loss: 1.6751 | Val Loss: 1.6320


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 8/10:
Train Loss: 1.5832 | Val Loss: 1.5209


Training: 100%|██████████| 63/63 [00:59<00:00,  1.06it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 9/10:
Train Loss: 1.4954 | Val Loss: 1.3962


Training: 100%|██████████| 63/63 [00:59<00:00,  1.06it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 10/10:
Train Loss: 1.4013 | Val Loss: 1.3210


[I 2025-05-09 15:19:36,587] Trial 6 finished with value: 1.3209729343652725 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.3524050620930872, 'learning_rate': 0.0001468465765154352, 'batch_size': 128}. Best is trial 6 with value: 1.3209729343652725.


New best model found! Val Loss: 1.3210
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.3524050620930872, 'learning_rate': 0.0001468465765154352, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:26<00:00,  2.41it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Epoch 1/10:
Train Loss: 3.0458 | Val Loss: 2.9820


Training: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Epoch 2/10:
Train Loss: 2.9808 | Val Loss: 2.9818


Training: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Epoch 3/10:
Train Loss: 2.9799 | Val Loss: 2.9824


Training: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.69it/s]


Epoch 4/10:
Train Loss: 2.9786 | Val Loss: 2.9802


Training: 100%|██████████| 63/63 [00:26<00:00,  2.41it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.69it/s]


Epoch 5/10:
Train Loss: 2.9782 | Val Loss: 2.9792


Training: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Epoch 6/10:
Train Loss: 2.9774 | Val Loss: 2.9755


Training: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Epoch 7/10:
Train Loss: 2.9767 | Val Loss: 2.9771


Training: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Epoch 8/10:
Train Loss: 2.9771 | Val Loss: 2.9787


Training: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]
[I 2025-05-09 15:23:52,753] Trial 7 finished with value: 2.9755232632160187 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.1608729968582275, 'learning_rate': 0.007571380321578926, 'batch_size': 128}. Best is trial 6 with value: 1.3209729343652725.


Epoch 9/10:
Train Loss: 2.9772 | Val Loss: 2.9770
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:45<00:00,  1.37it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.08it/s]


Epoch 1/10:
Train Loss: 3.0950 | Val Loss: 2.9977


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.08it/s]


Epoch 2/10:
Train Loss: 2.9865 | Val Loss: 3.0059


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.09it/s]


Epoch 3/10:
Train Loss: 2.9821 | Val Loss: 3.0003


Training: 100%|██████████| 63/63 [00:45<00:00,  1.37it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.04it/s]


Epoch 4/10:
Train Loss: 2.9802 | Val Loss: 2.9920


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.09it/s]


Epoch 5/10:
Train Loss: 2.9784 | Val Loss: 2.9916


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.10it/s]


Epoch 6/10:
Train Loss: 2.9780 | Val Loss: 2.9912


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.09it/s]


Epoch 7/10:
Train Loss: 2.9768 | Val Loss: 3.3588


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.10it/s]


Epoch 8/10:
Train Loss: 2.9749 | Val Loss: 2.9849


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.09it/s]


Epoch 9/10:
Train Loss: 2.9762 | Val Loss: 2.9812


Training: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.09it/s]
[I 2025-05-09 15:32:10,322] Trial 8 finished with value: 2.981249079108238 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.38622786526025776, 'learning_rate': 0.002986946680650425, 'batch_size': 128}. Best is trial 6 with value: 1.3209729343652725.


Epoch 10/10:
Train Loss: 2.9715 | Val Loss: 3.2275


Training: 100%|██████████| 63/63 [00:15<00:00,  3.94it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.13it/s]


Epoch 1/10:
Train Loss: 2.6005 | Val Loss: 2.2360


Training: 100%|██████████| 63/63 [00:15<00:00,  3.95it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.18it/s]


Epoch 2/10:
Train Loss: 2.1482 | Val Loss: 2.0267


Training: 100%|██████████| 63/63 [00:15<00:00,  3.94it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.20it/s]


Epoch 3/10:
Train Loss: 1.9471 | Val Loss: 1.7943


Training: 100%|██████████| 63/63 [00:16<00:00,  3.93it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.14it/s]


Epoch 4/10:
Train Loss: 1.7446 | Val Loss: 1.5772


Training: 100%|██████████| 63/63 [00:15<00:00,  3.95it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.19it/s]


Epoch 5/10:
Train Loss: 1.5578 | Val Loss: 1.3790


Training: 100%|██████████| 63/63 [00:16<00:00,  3.94it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.19it/s]


Epoch 6/10:
Train Loss: 1.3628 | Val Loss: 1.0888


Training: 100%|██████████| 63/63 [00:16<00:00,  3.90it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.16it/s]


Epoch 7/10:
Train Loss: 1.0694 | Val Loss: 0.6832


Training: 100%|██████████| 63/63 [00:16<00:00,  3.93it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.20it/s]


Epoch 8/10:
Train Loss: 0.8139 | Val Loss: 0.5327


Training: 100%|██████████| 63/63 [00:16<00:00,  3.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.23it/s]


Epoch 9/10:
Train Loss: 0.6560 | Val Loss: 0.4428


Training: 100%|██████████| 63/63 [00:15<00:00,  3.95it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.21it/s]
[I 2025-05-09 15:35:05,299] Trial 9 finished with value: 0.29426856245845556 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.22291967495980047, 'learning_rate': 0.000326051498136512, 'batch_size': 128}. Best is trial 9 with value: 0.29426856245845556.


Epoch 10/10:
Train Loss: 0.5291 | Val Loss: 0.2943
New best model found! Val Loss: 0.2943
Config: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.22291967495980047, 'learning_rate': 0.000326051498136512, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:24<00:00,  2.61it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.57it/s]


Epoch 1/10:
Train Loss: 2.6009 | Val Loss: 2.2651


Training: 100%|██████████| 63/63 [00:23<00:00,  2.64it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.58it/s]


Epoch 2/10:
Train Loss: 2.1701 | Val Loss: 2.0516


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.23it/s]


Epoch 3/10:
Train Loss: 2.0367 | Val Loss: 1.9661


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.58it/s]


Epoch 4/10:
Train Loss: 1.9285 | Val Loss: 1.8475


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.62it/s]


Epoch 5/10:
Train Loss: 1.8189 | Val Loss: 1.7909


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.59it/s]


Epoch 6/10:
Train Loss: 1.7147 | Val Loss: 1.5824


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.24it/s]


Epoch 7/10:
Train Loss: 1.5905 | Val Loss: 1.4800


Training: 100%|██████████| 63/63 [00:24<00:00,  2.61it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.56it/s]


Epoch 8/10:
Train Loss: 1.4688 | Val Loss: 1.3467


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.46it/s]


Epoch 9/10:
Train Loss: 1.3440 | Val Loss: 1.2342


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.61it/s]
[I 2025-05-09 15:39:26,551] Trial 10 finished with value: 1.0952891409397125 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.25198633513047064, 'learning_rate': 0.000327825065378556, 'batch_size': 128}. Best is trial 9 with value: 0.29426856245845556.


Epoch 10/10:
Train Loss: 1.2296 | Val Loss: 1.0953


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.61it/s]


Epoch 1/10:
Train Loss: 2.7134 | Val Loss: 2.2811


Training: 100%|██████████| 63/63 [00:23<00:00,  2.64it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.63it/s]


Epoch 2/10:
Train Loss: 2.1922 | Val Loss: 2.0758


Training: 100%|██████████| 63/63 [00:23<00:00,  2.64it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.62it/s]


Epoch 3/10:
Train Loss: 2.0294 | Val Loss: 1.9180


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.58it/s]


Epoch 4/10:
Train Loss: 1.8987 | Val Loss: 1.8058


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.47it/s]


Epoch 5/10:
Train Loss: 1.8080 | Val Loss: 1.7270


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.22it/s]


Epoch 6/10:
Train Loss: 1.7271 | Val Loss: 1.7033


Training: 100%|██████████| 63/63 [00:24<00:00,  2.62it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.47it/s]


Epoch 7/10:
Train Loss: 1.6666 | Val Loss: 1.6007


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.62it/s]


Epoch 8/10:
Train Loss: 1.6142 | Val Loss: 1.6029


Training: 100%|██████████| 63/63 [00:24<00:00,  2.62it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.39it/s]


Epoch 9/10:
Train Loss: 1.5684 | Val Loss: 1.5363


Training: 100%|██████████| 63/63 [00:23<00:00,  2.64it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.61it/s]
[I 2025-05-09 15:43:47,590] Trial 11 finished with value: 1.5092048346996307 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.2627352631828468, 'learning_rate': 0.00038437973421232095, 'batch_size': 128}. Best is trial 9 with value: 0.29426856245845556.


Epoch 10/10:
Train Loss: 1.5272 | Val Loss: 1.5092


Training: 100%|██████████| 63/63 [00:23<00:00,  2.64it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.25it/s]


Epoch 1/10:
Train Loss: 2.6589 | Val Loss: 2.2565


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.62it/s]


Epoch 2/10:
Train Loss: 2.1879 | Val Loss: 2.0615


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.61it/s]


Epoch 3/10:
Train Loss: 2.0295 | Val Loss: 1.9157


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.58it/s]


Epoch 4/10:
Train Loss: 1.8985 | Val Loss: 1.8297


Training: 100%|██████████| 63/63 [00:23<00:00,  2.64it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.58it/s]


Epoch 5/10:
Train Loss: 1.8259 | Val Loss: 1.7284


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.54it/s]


Epoch 6/10:
Train Loss: 1.7225 | Val Loss: 1.6436


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.61it/s]


Epoch 7/10:
Train Loss: 1.6092 | Val Loss: 1.5583


Training: 100%|██████████| 63/63 [00:23<00:00,  2.64it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.61it/s]


Epoch 8/10:
Train Loss: 1.5023 | Val Loss: 1.4397


Training: 100%|██████████| 63/63 [00:24<00:00,  2.61it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.63it/s]


Epoch 9/10:
Train Loss: 1.3910 | Val Loss: 1.2916


Training: 100%|██████████| 63/63 [00:23<00:00,  2.63it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.44it/s]
[I 2025-05-09 15:48:08,610] Trial 12 finished with value: 1.1548495516180992 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.24010938740381896, 'learning_rate': 0.0003161758053672783, 'batch_size': 128}. Best is trial 9 with value: 0.29426856245845556.


Epoch 10/10:
Train Loss: 1.2729 | Val Loss: 1.1548


Training: 100%|██████████| 63/63 [00:21<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.60it/s]


Epoch 1/10:
Train Loss: 2.7189 | Val Loss: 2.2791


Training: 100%|██████████| 63/63 [00:20<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.60it/s]


Epoch 2/10:
Train Loss: 2.2326 | Val Loss: 2.1185


Training: 100%|██████████| 63/63 [00:21<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.18it/s]


Epoch 3/10:
Train Loss: 2.1041 | Val Loss: 2.0937


Training: 100%|██████████| 63/63 [00:20<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.60it/s]


Epoch 4/10:
Train Loss: 1.9950 | Val Loss: 1.8931


Training: 100%|██████████| 63/63 [00:20<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.57it/s]


Epoch 5/10:
Train Loss: 1.8599 | Val Loss: 1.7736


Training: 100%|██████████| 63/63 [00:20<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.55it/s]


Epoch 6/10:
Train Loss: 1.7530 | Val Loss: 1.7339


Training: 100%|██████████| 63/63 [00:21<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.49it/s]


Epoch 7/10:
Train Loss: 1.6648 | Val Loss: 1.5877


Training: 100%|██████████| 63/63 [00:21<00:00,  2.99it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.50it/s]


Epoch 8/10:
Train Loss: 1.5645 | Val Loss: 1.4102


Training: 100%|██████████| 63/63 [00:20<00:00,  3.00it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.59it/s]


Epoch 9/10:
Train Loss: 1.4571 | Val Loss: 1.3048


Training: 100%|██████████| 63/63 [00:20<00:00,  3.01it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  8.60it/s]
[I 2025-05-09 15:51:57,617] Trial 13 finished with value: 1.1785119399428368 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.310468865555549, 'learning_rate': 0.0002329931292310335, 'batch_size': 128}. Best is trial 9 with value: 0.29426856245845556.


Epoch 10/10:
Train Loss: 1.3675 | Val Loss: 1.1785


Training: 100%|██████████| 63/63 [00:18<00:00,  3.44it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.49it/s]


Epoch 1/10:
Train Loss: 2.6253 | Val Loss: 2.2409


Training: 100%|██████████| 63/63 [00:18<00:00,  3.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.51it/s]


Epoch 2/10:
Train Loss: 2.1494 | Val Loss: 2.0106


Training: 100%|██████████| 63/63 [00:18<00:00,  3.45it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.75it/s]


Epoch 3/10:
Train Loss: 1.9754 | Val Loss: 1.8739


Training: 100%|██████████| 63/63 [00:18<00:00,  3.44it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.48it/s]


Epoch 4/10:
Train Loss: 1.8405 | Val Loss: 1.7558


Training: 100%|██████████| 63/63 [00:18<00:00,  3.45it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.75it/s]


Epoch 5/10:
Train Loss: 1.7569 | Val Loss: 1.7289


Training: 100%|██████████| 63/63 [00:18<00:00,  3.43it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.75it/s]


Epoch 6/10:
Train Loss: 1.6857 | Val Loss: 1.6173


Training: 100%|██████████| 63/63 [00:18<00:00,  3.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.73it/s]


Epoch 7/10:
Train Loss: 1.5996 | Val Loss: 1.5376


Training: 100%|██████████| 63/63 [00:18<00:00,  3.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.73it/s]


Epoch 8/10:
Train Loss: 1.4740 | Val Loss: 1.4001


Training: 100%|██████████| 63/63 [00:18<00:00,  3.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.74it/s]


Epoch 9/10:
Train Loss: 1.3356 | Val Loss: 1.2094


Training: 100%|██████████| 63/63 [00:18<00:00,  3.44it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.72it/s]
[I 2025-05-09 15:55:17,101] Trial 14 finished with value: 1.0278542079031467 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.22033505953943017, 'learning_rate': 0.0005108273532519829, 'batch_size': 128}. Best is trial 9 with value: 0.29426856245845556.


Epoch 10/10:
Train Loss: 1.1781 | Val Loss: 1.0279


Training: 100%|██████████| 63/63 [00:15<00:00,  4.16it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.85it/s]


Epoch 1/10:
Train Loss: 2.4974 | Val Loss: 2.1319


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.74it/s]


Epoch 2/10:
Train Loss: 1.9499 | Val Loss: 1.7960


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.83it/s]


Epoch 3/10:
Train Loss: 1.5723 | Val Loss: 1.2456


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.84it/s]


Epoch 4/10:
Train Loss: 0.9989 | Val Loss: 0.6136


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.70it/s]


Epoch 5/10:
Train Loss: 0.6176 | Val Loss: 0.4542


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.83it/s]


Epoch 6/10:
Train Loss: 0.4377 | Val Loss: 0.2963


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.84it/s]


Epoch 7/10:
Train Loss: 0.3131 | Val Loss: 0.2298


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.82it/s]


Epoch 8/10:
Train Loss: 0.2362 | Val Loss: 0.1882


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.81it/s]


Epoch 9/10:
Train Loss: 0.1842 | Val Loss: 0.1192


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.61it/s]
[I 2025-05-09 15:58:01,845] Trial 15 finished with value: 0.11918468028306961 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.10421996274030854, 'learning_rate': 0.0006851815679212533, 'batch_size': 128}. Best is trial 15 with value: 0.11918468028306961.


Epoch 10/10:
Train Loss: 0.1620 | Val Loss: 0.1330
New best model found! Val Loss: 0.1192
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.10421996274030854, 'learning_rate': 0.0006851815679212533, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.65it/s]


Epoch 1/10:
Train Loss: 2.6371 | Val Loss: 2.1878


Training: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.64it/s]


Epoch 2/10:
Train Loss: 2.0620 | Val Loss: 1.9299


Training: 100%|██████████| 63/63 [00:12<00:00,  4.88it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.67it/s]


Epoch 3/10:
Train Loss: 1.7497 | Val Loss: 1.5278


Training: 100%|██████████| 63/63 [00:12<00:00,  4.86it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.57it/s]


Epoch 4/10:
Train Loss: 1.3065 | Val Loss: 0.7929


Training: 100%|██████████| 63/63 [00:12<00:00,  4.85it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.57it/s]


Epoch 5/10:
Train Loss: 0.7309 | Val Loss: 0.4716


Training: 100%|██████████| 63/63 [00:12<00:00,  4.88it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.60it/s]


Epoch 6/10:
Train Loss: 0.4903 | Val Loss: 0.2953


Training: 100%|██████████| 63/63 [00:12<00:00,  4.88it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.67it/s]


Epoch 7/10:
Train Loss: 0.3611 | Val Loss: 0.2170


Training: 100%|██████████| 63/63 [00:12<00:00,  4.88it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.60it/s]


Epoch 8/10:
Train Loss: 0.2825 | Val Loss: 0.1835


Training: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.08it/s]


Epoch 9/10:
Train Loss: 0.2176 | Val Loss: 0.1273


Training: 100%|██████████| 63/63 [00:12<00:00,  4.87it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 13.23it/s]
[I 2025-05-09 16:00:23,025] Trial 16 finished with value: 0.12732498347759247 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.10667832616071012, 'learning_rate': 0.0007259614701715942, 'batch_size': 128}. Best is trial 15 with value: 0.11918468028306961.


Epoch 10/10:
Train Loss: 0.1862 | Val Loss: 0.1426


Training: 100%|██████████| 63/63 [00:28<00:00,  2.22it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.46it/s]


Epoch 1/10:
Train Loss: 3.0637 | Val Loss: 2.9810


Training: 100%|██████████| 63/63 [00:28<00:00,  2.22it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.48it/s]


Epoch 2/10:
Train Loss: 2.9840 | Val Loss: 2.9764


Training: 100%|██████████| 63/63 [00:28<00:00,  2.23it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.46it/s]


Epoch 3/10:
Train Loss: 2.9814 | Val Loss: 2.9755


Training: 100%|██████████| 63/63 [00:28<00:00,  2.23it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.46it/s]


Epoch 4/10:
Train Loss: 2.9797 | Val Loss: 2.9762


Training: 100%|██████████| 63/63 [00:28<00:00,  2.23it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.47it/s]


Epoch 5/10:
Train Loss: 2.9723 | Val Loss: 3.0435


Training: 100%|██████████| 63/63 [00:28<00:00,  2.23it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.43it/s]
[I 2025-05-09 16:03:27,868] Trial 17 finished with value: 2.9755352437496185 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1010969208386506, 'learning_rate': 0.0019000771165884152, 'batch_size': 128}. Best is trial 15 with value: 0.11918468028306961.


Epoch 6/10:
Train Loss: 2.9463 | Val Loss: 3.2746
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.83it/s]


Epoch 1/10:
Train Loss: 2.5714 | Val Loss: 2.1690


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.55it/s]


Epoch 2/10:
Train Loss: 2.0379 | Val Loss: 1.9126


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.65it/s]


Epoch 3/10:
Train Loss: 1.7224 | Val Loss: 1.5368


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.84it/s]


Epoch 4/10:
Train Loss: 1.3481 | Val Loss: 0.9566


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.78it/s]


Epoch 5/10:
Train Loss: 0.8044 | Val Loss: 0.5090


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.83it/s]


Epoch 6/10:
Train Loss: 0.5159 | Val Loss: 0.3244


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.74it/s]


Epoch 7/10:
Train Loss: 0.3583 | Val Loss: 0.2372


Training: 100%|██████████| 63/63 [00:15<00:00,  4.16it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.71it/s]


Epoch 8/10:
Train Loss: 0.2690 | Val Loss: 0.1587


Training: 100%|██████████| 63/63 [00:15<00:00,  4.16it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.80it/s]


Epoch 9/10:
Train Loss: 0.2056 | Val Loss: 0.1507


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.84it/s]
[I 2025-05-09 16:06:12,833] Trial 18 finished with value: 0.10892352042719722 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.10205061074117658, 'learning_rate': 0.000704969990251232, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.1832 | Val Loss: 0.1089
New best model found! Val Loss: 0.1089
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.10205061074117658, 'learning_rate': 0.000704969990251232, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.40it/s]


Epoch 1/10:
Train Loss: 3.0598 | Val Loss: 2.9778


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 2/10:
Train Loss: 2.9844 | Val Loss: 2.9793


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 3/10:
Train Loss: 2.9828 | Val Loss: 2.9771


Training: 100%|██████████| 63/63 [00:15<00:00,  4.15it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 4/10:
Train Loss: 2.9726 | Val Loss: 3.0397


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.94it/s]


Epoch 5/10:
Train Loss: 2.9472 | Val Loss: 3.5187


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]
[I 2025-05-09 16:07:51,528] Trial 19 finished with value: 2.9770925045013428 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.13253573598373022, 'learning_rate': 0.0016511269783723262, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 6/10:
Train Loss: 2.9347 | Val Loss: 3.6610
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:28<00:00,  2.18it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Epoch 1/10:
Train Loss: 3.0765 | Val Loss: 2.9794


Training: 100%|██████████| 63/63 [00:29<00:00,  2.16it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.43it/s]


Epoch 2/10:
Train Loss: 2.9837 | Val Loss: 2.9793


Training: 100%|██████████| 63/63 [00:28<00:00,  2.18it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]


Epoch 3/10:
Train Loss: 2.9810 | Val Loss: 2.9774


Training: 100%|██████████| 63/63 [00:28<00:00,  2.18it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.25it/s]


Epoch 4/10:
Train Loss: 2.9783 | Val Loss: 2.9770


Training: 100%|██████████| 63/63 [00:28<00:00,  2.18it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.50it/s]


Epoch 5/10:
Train Loss: 2.9772 | Val Loss: 2.9799


Training: 100%|██████████| 63/63 [00:28<00:00,  2.18it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.50it/s]


Epoch 6/10:
Train Loss: 2.9756 | Val Loss: 3.2602


Training: 100%|██████████| 63/63 [00:28<00:00,  2.17it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.48it/s]
[I 2025-05-09 16:11:31,737] Trial 20 finished with value: 2.977009207010269 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.14144076635302677, 'learning_rate': 0.003707158636640508, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 7/10:
Train Loss: 2.9702 | Val Loss: 3.4103
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 1/10:
Train Loss: 2.6377 | Val Loss: 2.2092


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.93it/s]


Epoch 2/10:
Train Loss: 2.0647 | Val Loss: 1.8784


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.86it/s]


Epoch 3/10:
Train Loss: 1.7161 | Val Loss: 1.4734


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.84it/s]


Epoch 4/10:
Train Loss: 1.2855 | Val Loss: 0.8626


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.80it/s]


Epoch 5/10:
Train Loss: 0.7466 | Val Loss: 0.5145


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]


Epoch 6/10:
Train Loss: 0.4980 | Val Loss: 0.3095


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.94it/s]


Epoch 7/10:
Train Loss: 0.3441 | Val Loss: 0.2493


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.66it/s]


Epoch 8/10:
Train Loss: 0.2585 | Val Loss: 0.1693


Training: 100%|██████████| 63/63 [00:15<00:00,  4.20it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.92it/s]


Epoch 9/10:
Train Loss: 0.2057 | Val Loss: 0.1475


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.89it/s]
[I 2025-05-09 16:14:15,812] Trial 21 finished with value: 0.12109877448529005 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.103433948769456, 'learning_rate': 0.0006582164508680134, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.1659 | Val Loss: 0.1211


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.92it/s]


Epoch 1/10:
Train Loss: 2.6412 | Val Loss: 2.2157


Training: 100%|██████████| 63/63 [00:15<00:00,  4.16it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.80it/s]


Epoch 2/10:
Train Loss: 2.0648 | Val Loss: 1.8952


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.93it/s]


Epoch 3/10:
Train Loss: 1.8232 | Val Loss: 1.6691


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.57it/s]


Epoch 4/10:
Train Loss: 1.5479 | Val Loss: 1.3554


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.95it/s]


Epoch 5/10:
Train Loss: 1.1693 | Val Loss: 0.8068


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.77it/s]


Epoch 6/10:
Train Loss: 0.7051 | Val Loss: 0.4628


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.92it/s]


Epoch 7/10:
Train Loss: 0.4628 | Val Loss: 0.2829


Training: 100%|██████████| 63/63 [00:15<00:00,  4.15it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.95it/s]


Epoch 8/10:
Train Loss: 0.3182 | Val Loss: 0.1993


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.79it/s]


Epoch 9/10:
Train Loss: 0.2455 | Val Loss: 0.2162


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]
[I 2025-05-09 16:17:00,081] Trial 22 finished with value: 0.12470369879156351 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.10045783325192552, 'learning_rate': 0.0009370790468253248, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.1973 | Val Loss: 0.1247


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.87it/s]


Epoch 1/10:
Train Loss: 2.6155 | Val Loss: 2.2018


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.53it/s]


Epoch 2/10:
Train Loss: 2.0382 | Val Loss: 1.8472


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.85it/s]


Epoch 3/10:
Train Loss: 1.7196 | Val Loss: 1.4656


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.85it/s]


Epoch 4/10:
Train Loss: 1.3271 | Val Loss: 0.8725


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.59it/s]


Epoch 5/10:
Train Loss: 0.8421 | Val Loss: 0.5213


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.96it/s]


Epoch 6/10:
Train Loss: 0.5952 | Val Loss: 0.3604


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.89it/s]


Epoch 7/10:
Train Loss: 0.4483 | Val Loss: 0.2880


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.93it/s]


Epoch 8/10:
Train Loss: 0.3483 | Val Loss: 0.1902


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 9/10:
Train Loss: 0.2832 | Val Loss: 0.1676


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.78it/s]
[I 2025-05-09 16:19:44,373] Trial 23 finished with value: 0.15026299795135856 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.17002923434584055, 'learning_rate': 0.0006649874617523108, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.2301 | Val Loss: 0.1503


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.89it/s]


Epoch 1/10:
Train Loss: 2.6962 | Val Loss: 2.2413


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]


Epoch 2/10:
Train Loss: 2.1550 | Val Loss: 2.0795


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.92it/s]


Epoch 3/10:
Train Loss: 1.9720 | Val Loss: 1.8352


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 4/10:
Train Loss: 1.8135 | Val Loss: 1.6781


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 5/10:
Train Loss: 1.6800 | Val Loss: 1.5503


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]


Epoch 6/10:
Train Loss: 1.5189 | Val Loss: 1.3163


Training: 100%|██████████| 63/63 [00:15<00:00,  4.20it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.94it/s]


Epoch 7/10:
Train Loss: 1.2974 | Val Loss: 0.9956


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.86it/s]


Epoch 8/10:
Train Loss: 1.0153 | Val Loss: 0.7196


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.92it/s]


Epoch 9/10:
Train Loss: 0.8050 | Val Loss: 0.5563


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]
[I 2025-05-09 16:22:28,389] Trial 24 finished with value: 0.4495334289968014 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1239012802152521, 'learning_rate': 0.00020036092739432367, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.6576 | Val Loss: 0.4495


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.82it/s]


Epoch 1/10:
Train Loss: 2.5739 | Val Loss: 2.1605


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.89it/s]


Epoch 2/10:
Train Loss: 1.9948 | Val Loss: 1.7871


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.87it/s]


Epoch 3/10:
Train Loss: 1.6764 | Val Loss: 1.4084


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]


Epoch 4/10:
Train Loss: 1.2071 | Val Loss: 0.7348


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]


Epoch 5/10:
Train Loss: 0.7737 | Val Loss: 0.5154


Training: 100%|██████████| 63/63 [00:15<00:00,  4.15it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.94it/s]


Epoch 6/10:
Train Loss: 0.5557 | Val Loss: 0.3386


Training: 100%|██████████| 63/63 [00:15<00:00,  4.20it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.92it/s]


Epoch 7/10:
Train Loss: 0.4216 | Val Loss: 0.2374


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]


Epoch 8/10:
Train Loss: 0.3261 | Val Loss: 0.1888


Training: 100%|██████████| 63/63 [00:15<00:00,  4.15it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.68it/s]


Epoch 9/10:
Train Loss: 0.2655 | Val Loss: 0.1626


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.72it/s]
[I 2025-05-09 16:25:12,802] Trial 25 finished with value: 0.14045393653213978 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.15659404444360153, 'learning_rate': 0.0006062491976805365, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.2199 | Val Loss: 0.1405


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.88it/s]


Epoch 1/10:
Train Loss: 3.0679 | Val Loss: 2.9796


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.81it/s]


Epoch 2/10:
Train Loss: 2.9865 | Val Loss: 2.9805


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.91it/s]


Epoch 3/10:
Train Loss: 2.9810 | Val Loss: 2.9994


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.95it/s]
[I 2025-05-09 16:26:18,542] Trial 26 finished with value: 2.9795985370874405 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2102991353973655, 'learning_rate': 0.0012827703620344538, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 4/10:
Train Loss: 2.9682 | Val Loss: 3.1713
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.92it/s]


Epoch 1/10:
Train Loss: 3.0845 | Val Loss: 2.9841


Training: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.66it/s]


Epoch 2/10:
Train Loss: 2.9853 | Val Loss: 2.9809


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.76it/s]


Epoch 3/10:
Train Loss: 2.9809 | Val Loss: 2.9777


Training: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.85it/s]


Epoch 4/10:
Train Loss: 2.9763 | Val Loss: 3.1738


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.90it/s]


Epoch 5/10:
Train Loss: 2.9725 | Val Loss: 3.0630


Training: 100%|██████████| 63/63 [00:15<00:00,  4.19it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.89it/s]
[I 2025-05-09 16:27:57,168] Trial 27 finished with value: 2.97770756483078 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.19857111074599992, 'learning_rate': 0.0028832126428403673, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 6/10:
Train Loss: 2.9519 | Val Loss: 3.2286
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:15<00:00,  4.08it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.40it/s]


Epoch 1/10:
Train Loss: 2.8133 | Val Loss: 2.3004


Training: 100%|██████████| 63/63 [00:15<00:00,  4.07it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.62it/s]


Epoch 2/10:
Train Loss: 2.1961 | Val Loss: 2.0976


Training: 100%|██████████| 63/63 [00:15<00:00,  4.07it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.60it/s]


Epoch 3/10:
Train Loss: 2.0297 | Val Loss: 1.9527


Training: 100%|██████████| 63/63 [00:15<00:00,  4.07it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.60it/s]


Epoch 4/10:
Train Loss: 1.8979 | Val Loss: 1.8144


Training: 100%|██████████| 63/63 [00:15<00:00,  4.08it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.56it/s]


Epoch 5/10:
Train Loss: 1.7747 | Val Loss: 1.6655


Training: 100%|██████████| 63/63 [00:15<00:00,  4.07it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.58it/s]


Epoch 6/10:
Train Loss: 1.6309 | Val Loss: 1.5417


Training: 100%|██████████| 63/63 [00:15<00:00,  4.07it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.50it/s]


Epoch 7/10:
Train Loss: 1.4816 | Val Loss: 1.3637


Training: 100%|██████████| 63/63 [00:15<00:00,  4.05it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.51it/s]


Epoch 8/10:
Train Loss: 1.3232 | Val Loss: 1.1819


Training: 100%|██████████| 63/63 [00:15<00:00,  4.08it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.49it/s]


Epoch 9/10:
Train Loss: 1.1802 | Val Loss: 1.0520


Training: 100%|██████████| 63/63 [00:15<00:00,  4.07it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 10.54it/s]
[I 2025-05-09 16:30:47,253] Trial 28 finished with value: 0.735079288482666 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.11900512163090303, 'learning_rate': 0.0007309458175723777, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 1.0137 | Val Loss: 0.7351


Training: 100%|██████████| 63/63 [00:57<00:00,  1.11it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.29it/s]


Epoch 1/10:
Train Loss: 3.0626 | Val Loss: 2.9829


Training: 100%|██████████| 63/63 [00:57<00:00,  1.10it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.28it/s]


Epoch 2/10:
Train Loss: 2.9849 | Val Loss: 2.9779


Training: 100%|██████████| 63/63 [00:57<00:00,  1.10it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.30it/s]


Epoch 3/10:
Train Loss: 2.9801 | Val Loss: 3.0494


Training: 100%|██████████| 63/63 [00:57<00:00,  1.10it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.30it/s]


Epoch 4/10:
Train Loss: 2.9605 | Val Loss: 3.2182


Training: 100%|██████████| 63/63 [00:57<00:00,  1.10it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.30it/s]
[I 2025-05-09 16:35:56,930] Trial 29 finished with value: 2.9778508245944977 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.14563472006196376, 'learning_rate': 0.0013814921771314098, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 5/10:
Train Loss: 2.9360 | Val Loss: 3.4620
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]


Epoch 1/10:
Train Loss: 2.5299 | Val Loss: 2.1032


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.86it/s]


Epoch 2/10:
Train Loss: 1.9626 | Val Loss: 1.7849


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 3/10:
Train Loss: 1.6803 | Val Loss: 1.4854


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 4/10:
Train Loss: 1.3238 | Val Loss: 0.9755


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 5/10:
Train Loss: 0.8193 | Val Loss: 0.5083


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.93it/s]


Epoch 6/10:
Train Loss: 0.5171 | Val Loss: 0.3403


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.87it/s]


Epoch 7/10:
Train Loss: 0.3570 | Val Loss: 0.2471


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]


Epoch 8/10:
Train Loss: 0.2669 | Val Loss: 0.1759


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 9/10:
Train Loss: 0.2115 | Val Loss: 0.1605


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]
[I 2025-05-09 16:41:30,106] Trial 30 finished with value: 0.11965888948179781 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1398207531009602, 'learning_rate': 0.00022071436054560005, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.1691 | Val Loss: 0.1197


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 1/10:
Train Loss: 2.5514 | Val Loss: 2.1304


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 2/10:
Train Loss: 1.9416 | Val Loss: 1.7482


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.88it/s]


Epoch 3/10:
Train Loss: 1.6032 | Val Loss: 1.3436


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 4/10:
Train Loss: 1.1084 | Val Loss: 0.6887


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 5/10:
Train Loss: 0.6157 | Val Loss: 0.3826


Training: 100%|██████████| 63/63 [00:30<00:00,  2.05it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.89it/s]


Epoch 6/10:
Train Loss: 0.3974 | Val Loss: 0.2591


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 7/10:
Train Loss: 0.2804 | Val Loss: 0.1832


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 8/10:
Train Loss: 0.2079 | Val Loss: 0.1422


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 9/10:
Train Loss: 0.1639 | Val Loss: 0.1154


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.91it/s]
[I 2025-05-09 16:47:02,964] Trial 31 finished with value: 0.11539581697434187 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11996730063347485, 'learning_rate': 0.00024455238395148707, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.1354 | Val Loss: 0.1163


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 1/10:
Train Loss: 2.5354 | Val Loss: 2.0907


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 2/10:
Train Loss: 1.9606 | Val Loss: 1.7863


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 3/10:
Train Loss: 1.6896 | Val Loss: 1.5017


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 4/10:
Train Loss: 1.3480 | Val Loss: 0.9811


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.90it/s]


Epoch 5/10:
Train Loss: 0.8241 | Val Loss: 0.5274


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 6/10:
Train Loss: 0.5153 | Val Loss: 0.3447


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 7/10:
Train Loss: 0.3599 | Val Loss: 0.2313


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.92it/s]


Epoch 8/10:
Train Loss: 0.2607 | Val Loss: 0.2066


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 9/10:
Train Loss: 0.2008 | Val Loss: 0.1262


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]
[I 2025-05-09 16:52:35,597] Trial 32 finished with value: 0.11504993634298444 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.12442653790896432, 'learning_rate': 0.00020840158832099176, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.1580 | Val Loss: 0.1150


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.82it/s]


Epoch 1/10:
Train Loss: 2.5553 | Val Loss: 2.1334


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 2/10:
Train Loss: 2.0229 | Val Loss: 1.8661


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.86it/s]


Epoch 3/10:
Train Loss: 1.7739 | Val Loss: 1.6208


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.82it/s]


Epoch 4/10:
Train Loss: 1.5484 | Val Loss: 1.3686


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.92it/s]


Epoch 5/10:
Train Loss: 1.2467 | Val Loss: 0.9291


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.87it/s]


Epoch 6/10:
Train Loss: 0.8082 | Val Loss: 0.5243


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 7/10:
Train Loss: 0.5424 | Val Loss: 0.3757


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 8/10:
Train Loss: 0.3962 | Val Loss: 0.2850


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.88it/s]


Epoch 9/10:
Train Loss: 0.2933 | Val Loss: 0.2021


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]
[I 2025-05-09 16:58:08,520] Trial 33 finished with value: 0.1830203072167933 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11993028092758523, 'learning_rate': 0.00015684182867415576, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.2327 | Val Loss: 0.1830


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 1/10:
Train Loss: 2.6055 | Val Loss: 2.1763


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 2/10:
Train Loss: 2.1111 | Val Loss: 2.0114


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.90it/s]


Epoch 3/10:
Train Loss: 1.9483 | Val Loss: 1.8250


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 4/10:
Train Loss: 1.7954 | Val Loss: 1.7245


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.89it/s]


Epoch 5/10:
Train Loss: 1.6572 | Val Loss: 1.5190


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.89it/s]


Epoch 6/10:
Train Loss: 1.5189 | Val Loss: 1.3746


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 7/10:
Train Loss: 1.3588 | Val Loss: 1.1783


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 8/10:
Train Loss: 1.1512 | Val Loss: 0.8630


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.85it/s]


Epoch 9/10:
Train Loss: 0.8543 | Val Loss: 0.6206


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]
[I 2025-05-09 17:03:41,284] Trial 34 finished with value: 0.4235410075634718 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.15493140820248202, 'learning_rate': 0.00010339832728587635, 'batch_size': 128}. Best is trial 18 with value: 0.10892352042719722.


Epoch 10/10:
Train Loss: 0.6458 | Val Loss: 0.4235


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.93it/s]


Epoch 1/10:
Train Loss: 2.7874 | Val Loss: 2.2297


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 2/10:
Train Loss: 2.0974 | Val Loss: 1.9037


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.93it/s]


Epoch 3/10:
Train Loss: 1.7493 | Val Loss: 1.5153


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 4/10:
Train Loss: 1.3464 | Val Loss: 0.9972


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 5/10:
Train Loss: 0.8093 | Val Loss: 0.4923


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 6/10:
Train Loss: 0.5033 | Val Loss: 0.3299


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]


Epoch 7/10:
Train Loss: 0.3484 | Val Loss: 0.2950


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 8/10:
Train Loss: 0.2489 | Val Loss: 0.1628


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 9/10:
Train Loss: 0.1848 | Val Loss: 0.1309


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 10/10:
Train Loss: 0.1563 | Val Loss: 0.1048


[I 2025-05-09 17:09:14,279] Trial 35 finished with value: 0.10481278039515018 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.17726286798252044, 'learning_rate': 0.00041870138046125414, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


New best model found! Val Loss: 0.1048
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.17726286798252044, 'learning_rate': 0.00041870138046125414, 'batch_size': 128}


Training: 100%|██████████| 63/63 [01:03<00:00,  1.02s/it]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  2.95it/s]


Epoch 1/10:
Train Loss: 3.0879 | Val Loss: 2.9817


Training: 100%|██████████| 63/63 [01:03<00:00,  1.02s/it]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  2.95it/s]


Epoch 2/10:
Train Loss: 2.9867 | Val Loss: 2.9787


Training: 100%|██████████| 63/63 [01:04<00:00,  1.02s/it]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  2.96it/s]


Epoch 3/10:
Train Loss: 2.9852 | Val Loss: 2.9799


Training: 100%|██████████| 63/63 [01:03<00:00,  1.02s/it]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  2.95it/s]


Epoch 4/10:
Train Loss: 2.9835 | Val Loss: 2.9805


Training: 100%|██████████| 63/63 [01:03<00:00,  1.02s/it]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  2.96it/s]
[I 2025-05-09 17:15:02,030] Trial 36 finished with value: 2.9787464141845703 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.1842704410484377, 'learning_rate': 0.00042062787148066955, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


Epoch 5/10:
Train Loss: 2.9722 | Val Loss: 3.0953
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.74it/s]


Epoch 1/10:
Train Loss: 3.0763 | Val Loss: 2.9618


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.72it/s]


Epoch 2/10:
Train Loss: 2.5494 | Val Loss: 2.2453


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 3/10:
Train Loss: 2.0734 | Val Loss: 1.9181


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.74it/s]


Epoch 4/10:
Train Loss: 1.8403 | Val Loss: 1.7237


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 5/10:
Train Loss: 1.6864 | Val Loss: 1.6369


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 6/10:
Train Loss: 1.5723 | Val Loss: 1.5185


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.72it/s]


Epoch 7/10:
Train Loss: 1.4430 | Val Loss: 1.3732


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 8/10:
Train Loss: 1.2896 | Val Loss: 1.2336


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 9/10:
Train Loss: 1.1077 | Val Loss: 1.0759


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.74it/s]
[I 2025-05-09 17:24:05,737] Trial 37 finished with value: 0.8511115647852421 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1722527895993719, 'learning_rate': 0.00026107131809220767, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


Epoch 10/10:
Train Loss: 0.9395 | Val Loss: 0.8511


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.63it/s]


Epoch 1/10:
Train Loss: 2.5386 | Val Loss: 2.1433


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.64it/s]


Epoch 2/10:
Train Loss: 2.0653 | Val Loss: 1.9495


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.55it/s]


Epoch 3/10:
Train Loss: 1.8739 | Val Loss: 1.7320


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.64it/s]


Epoch 4/10:
Train Loss: 1.7109 | Val Loss: 1.5772


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.63it/s]


Epoch 5/10:
Train Loss: 1.5160 | Val Loss: 1.3203


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.58it/s]


Epoch 6/10:
Train Loss: 1.2717 | Val Loss: 0.9734


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.58it/s]


Epoch 7/10:
Train Loss: 0.9149 | Val Loss: 0.6180


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]


Epoch 8/10:
Train Loss: 0.6577 | Val Loss: 0.4206


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.59it/s]


Epoch 9/10:
Train Loss: 0.4927 | Val Loss: 0.3118


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]
[I 2025-05-09 17:30:01,284] Trial 38 finished with value: 0.26319647300988436 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.19764252572888064, 'learning_rate': 0.00013879579486365473, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


Epoch 10/10:
Train Loss: 0.3806 | Val Loss: 0.2632


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 1/10:
Train Loss: 2.4861 | Val Loss: 2.1023


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.92it/s]


Epoch 2/10:
Train Loss: 1.9745 | Val Loss: 1.8987


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 3/10:
Train Loss: 1.7405 | Val Loss: 1.5489


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 4/10:
Train Loss: 1.4767 | Val Loss: 1.2703


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 5/10:
Train Loss: 1.0472 | Val Loss: 0.6631


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 6/10:
Train Loss: 0.6426 | Val Loss: 0.4206


Training: 100%|██████████| 63/63 [00:30<00:00,  2.05it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 7/10:
Train Loss: 0.4386 | Val Loss: 0.2844


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 8/10:
Train Loss: 0.3250 | Val Loss: 0.2135


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.91it/s]


Epoch 9/10:
Train Loss: 0.2510 | Val Loss: 0.1710


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]
[I 2025-05-09 17:35:33,961] Trial 39 finished with value: 0.16256820037961006 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1275140892122229, 'learning_rate': 0.0001812161890064864, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


Epoch 10/10:
Train Loss: 0.2001 | Val Loss: 0.1626


Training: 100%|██████████| 63/63 [00:49<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 1/10:
Train Loss: 3.0891 | Val Loss: 2.9861


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.72it/s]


Epoch 2/10:
Train Loss: 2.9906 | Val Loss: 2.9771


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 3/10:
Train Loss: 2.9868 | Val Loss: 2.9843


Training: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.73it/s]


Epoch 4/10:
Train Loss: 2.9743 | Val Loss: 3.1325


Training: 100%|██████████| 63/63 [00:49<00:00,  1.26it/s]
Evaluating: 100%|██████████| 16/16 [00:04<00:00,  3.74it/s]
[I 2025-05-09 17:40:05,887] Trial 40 finished with value: 2.9771019369363785 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.2807229937271079, 'learning_rate': 0.000512689138819341, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


Epoch 5/10:
Train Loss: 2.9573 | Val Loss: 3.4079
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]


Epoch 1/10:
Train Loss: 3.0859 | Val Loss: 2.9751


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]


Epoch 2/10:
Train Loss: 2.6526 | Val Loss: 2.2361


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.91it/s]


Epoch 3/10:
Train Loss: 2.0845 | Val Loss: 1.9502


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 4/10:
Train Loss: 1.8487 | Val Loss: 1.7643


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 5/10:
Train Loss: 1.6946 | Val Loss: 1.6618


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 6/10:
Train Loss: 1.5928 | Val Loss: 1.5840


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 7/10:
Train Loss: 1.5137 | Val Loss: 1.5203


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 8/10:
Train Loss: 1.4511 | Val Loss: 1.4892


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 9/10:
Train Loss: 1.3936 | Val Loss: 1.4669


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]
[I 2025-05-09 17:45:38,585] Trial 41 finished with value: 1.4463816434144974 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11788710158125773, 'learning_rate': 0.0005081657607920895, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


Epoch 10/10:
Train Loss: 1.3496 | Val Loss: 1.4464


Training: 100%|██████████| 63/63 [00:14<00:00,  4.47it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.62it/s]


Epoch 1/10:
Train Loss: 2.6144 | Val Loss: 2.2197


Training: 100%|██████████| 63/63 [00:14<00:00,  4.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.58it/s]


Epoch 2/10:
Train Loss: 2.1519 | Val Loss: 2.0331


Training: 100%|██████████| 63/63 [00:14<00:00,  4.47it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.34it/s]


Epoch 3/10:
Train Loss: 1.9482 | Val Loss: 1.7877


Training: 100%|██████████| 63/63 [00:14<00:00,  4.47it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.55it/s]


Epoch 4/10:
Train Loss: 1.7510 | Val Loss: 1.5951


Training: 100%|██████████| 63/63 [00:14<00:00,  4.47it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.56it/s]


Epoch 5/10:
Train Loss: 1.5805 | Val Loss: 1.4598


Training: 100%|██████████| 63/63 [00:14<00:00,  4.44it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.53it/s]


Epoch 6/10:
Train Loss: 1.4169 | Val Loss: 1.2117


Training: 100%|██████████| 63/63 [00:14<00:00,  4.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.61it/s]


Epoch 7/10:
Train Loss: 1.1991 | Val Loss: 0.8724


Training: 100%|██████████| 63/63 [00:14<00:00,  4.47it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.62it/s]


Epoch 8/10:
Train Loss: 0.9298 | Val Loss: 0.6524


Training: 100%|██████████| 63/63 [00:14<00:00,  4.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.52it/s]


Epoch 9/10:
Train Loss: 0.7284 | Val Loss: 0.4613


Training: 100%|██████████| 63/63 [00:14<00:00,  4.46it/s]
Evaluating: 100%|██████████| 16/16 [00:01<00:00, 11.54it/s]
[I 2025-05-09 17:48:13,706] Trial 42 finished with value: 0.3311190586537123 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14851861422728052, 'learning_rate': 0.0009538354862658523, 'batch_size': 128}. Best is trial 35 with value: 0.10481278039515018.


Epoch 10/10:
Train Loss: 0.5790 | Val Loss: 0.3311


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.93it/s]


Epoch 1/10:
Train Loss: 2.6029 | Val Loss: 2.1340


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 2/10:
Train Loss: 1.9325 | Val Loss: 1.7233


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 3/10:
Train Loss: 1.4898 | Val Loss: 1.1438


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 4/10:
Train Loss: 0.8374 | Val Loss: 0.4937


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 5/10:
Train Loss: 0.4917 | Val Loss: 0.3026


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 6/10:
Train Loss: 0.3278 | Val Loss: 0.2294


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 7/10:
Train Loss: 0.2342 | Val Loss: 0.1747


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 8/10:
Train Loss: 0.1831 | Val Loss: 0.1389


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 9/10:
Train Loss: 0.1496 | Val Loss: 0.1251


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 10/10:
Train Loss: 0.1215 | Val Loss: 0.0859


[I 2025-05-09 17:53:46,731] Trial 43 finished with value: 0.08592505543492734 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.13185428585110107, 'learning_rate': 0.0003078672443612991, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


New best model found! Val Loss: 0.0859
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.13185428585110107, 'learning_rate': 0.0003078672443612991, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 1/10:
Train Loss: 2.5640 | Val Loss: 2.1170


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 2/10:
Train Loss: 1.9465 | Val Loss: 1.7434


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 3/10:
Train Loss: 1.6036 | Val Loss: 1.3454


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 4/10:
Train Loss: 1.1350 | Val Loss: 0.6913


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.00it/s]


Epoch 5/10:
Train Loss: 0.6648 | Val Loss: 0.4258


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 6/10:
Train Loss: 0.4449 | Val Loss: 0.3594


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 7/10:
Train Loss: 0.3256 | Val Loss: 0.2211


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 8/10:
Train Loss: 0.2479 | Val Loss: 0.1539


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 9/10:
Train Loss: 0.1962 | Val Loss: 0.1372


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]
[I 2025-05-09 17:59:18,974] Trial 44 finished with value: 0.11301642912440002 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1660188200616312, 'learning_rate': 0.000279238984783106, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.1570 | Val Loss: 0.1130


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.59it/s]


Epoch 1/10:
Train Loss: 2.9252 | Val Loss: 2.3426


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.61it/s]


Epoch 2/10:
Train Loss: 2.1834 | Val Loss: 2.0222


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.59it/s]


Epoch 3/10:
Train Loss: 1.9320 | Val Loss: 1.8068


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.56it/s]


Epoch 4/10:
Train Loss: 1.7414 | Val Loss: 1.6640


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.61it/s]


Epoch 5/10:
Train Loss: 1.5374 | Val Loss: 1.4231


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.59it/s]


Epoch 6/10:
Train Loss: 1.3003 | Val Loss: 1.1431


Training: 100%|██████████| 63/63 [00:40<00:00,  1.57it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.59it/s]


Epoch 7/10:
Train Loss: 0.9838 | Val Loss: 0.6594


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.59it/s]


Epoch 8/10:
Train Loss: 0.6102 | Val Loss: 0.4190


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.58it/s]


Epoch 9/10:
Train Loss: 0.4044 | Val Loss: 0.2802


Training: 100%|██████████| 63/63 [00:40<00:00,  1.56it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.59it/s]
[I 2025-05-09 18:06:37,041] Trial 45 finished with value: 0.2095473473891616 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.1683081767709153, 'learning_rate': 0.00030189802588164556, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.2820 | Val Loss: 0.2095


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]


Epoch 1/10:
Train Loss: 2.7059 | Val Loss: 2.2051


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]


Epoch 2/10:
Train Loss: 2.0892 | Val Loss: 1.9221


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]


Epoch 3/10:
Train Loss: 1.7836 | Val Loss: 1.5888


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.63it/s]


Epoch 4/10:
Train Loss: 1.4602 | Val Loss: 1.2087


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.61it/s]


Epoch 5/10:
Train Loss: 0.9798 | Val Loss: 0.5837


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]


Epoch 6/10:
Train Loss: 0.5927 | Val Loss: 0.4425


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]


Epoch 7/10:
Train Loss: 0.4045 | Val Loss: 0.2751


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.63it/s]


Epoch 8/10:
Train Loss: 0.2852 | Val Loss: 0.1892


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.63it/s]


Epoch 9/10:
Train Loss: 0.2162 | Val Loss: 0.1379


Training: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.62it/s]
[I 2025-05-09 18:12:32,497] Trial 46 finished with value: 0.13790942076593637 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.19986676870710085, 'learning_rate': 0.0003691249883352613, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.1696 | Val Loss: 0.2232


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.14it/s]


Epoch 1/10:
Train Loss: 3.0220 | Val Loss: 2.4748


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 2/10:
Train Loss: 2.2533 | Val Loss: 2.1150


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 3/10:
Train Loss: 1.9925 | Val Loss: 1.8534


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 4/10:
Train Loss: 1.8016 | Val Loss: 1.7053


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 5/10:
Train Loss: 1.6388 | Val Loss: 1.5614


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.13it/s]


Epoch 6/10:
Train Loss: 1.4599 | Val Loss: 1.3418


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 7/10:
Train Loss: 1.2693 | Val Loss: 1.1758


Training: 100%|██████████| 63/63 [00:59<00:00,  1.06it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]


Epoch 8/10:
Train Loss: 1.0482 | Val Loss: 0.8171


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.13it/s]


Epoch 9/10:
Train Loss: 0.6701 | Val Loss: 0.4404


Training: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s]
Evaluating: 100%|██████████| 16/16 [00:05<00:00,  3.11it/s]
[I 2025-05-09 18:23:21,429] Trial 47 finished with value: 0.3103277003392577 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.13552074118404336, 'learning_rate': 0.00018215702620863182, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.4312 | Val Loss: 0.3103


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.24it/s]


Epoch 1/10:
Train Loss: 2.8785 | Val Loss: 2.3044


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.16it/s]


Epoch 2/10:
Train Loss: 2.1533 | Val Loss: 1.9853


Training: 100%|██████████| 63/63 [00:25<00:00,  2.51it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.26it/s]


Epoch 3/10:
Train Loss: 1.9019 | Val Loss: 1.8030


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.24it/s]


Epoch 4/10:
Train Loss: 1.7433 | Val Loss: 1.6497


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.17it/s]


Epoch 5/10:
Train Loss: 1.5329 | Val Loss: 1.3690


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.23it/s]


Epoch 6/10:
Train Loss: 1.2472 | Val Loss: 0.9276


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.22it/s]


Epoch 7/10:
Train Loss: 0.7841 | Val Loss: 0.5068


Training: 100%|██████████| 63/63 [00:25<00:00,  2.51it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.26it/s]


Epoch 8/10:
Train Loss: 0.5246 | Val Loss: 0.3923


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.19it/s]


Epoch 9/10:
Train Loss: 0.3922 | Val Loss: 0.2688


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.27it/s]
[I 2025-05-09 18:27:55,589] Trial 48 finished with value: 0.24470712803304195 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.16095275837798984, 'learning_rate': 0.0004635054509079536, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.2903 | Val Loss: 0.2447


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.71it/s]


Epoch 1/10:
Train Loss: 2.8995 | Val Loss: 2.3740


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.65it/s]


Epoch 2/10:
Train Loss: 2.2984 | Val Loss: 2.1722


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.72it/s]


Epoch 3/10:
Train Loss: 2.1579 | Val Loss: 2.0568


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.72it/s]


Epoch 4/10:
Train Loss: 2.0607 | Val Loss: 1.9926


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.74it/s]


Epoch 5/10:
Train Loss: 1.9867 | Val Loss: 1.9057


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.57it/s]


Epoch 6/10:
Train Loss: 1.8980 | Val Loss: 1.8246


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.65it/s]


Epoch 7/10:
Train Loss: 1.8223 | Val Loss: 1.7355


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.75it/s]


Epoch 8/10:
Train Loss: 1.7549 | Val Loss: 1.6517


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.74it/s]


Epoch 9/10:
Train Loss: 1.6921 | Val Loss: 1.5972


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.76it/s]
[I 2025-05-09 18:33:28,334] Trial 49 finished with value: 1.5356021374464035 and parameters: {'d_model': 128, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.18105295409754935, 'learning_rate': 0.00029165165052686293, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 1.6427 | Val Loss: 1.5356


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 1/10:
Train Loss: 2.7405 | Val Loss: 2.2485


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 2/10:
Train Loss: 2.1301 | Val Loss: 1.9662


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.93it/s]


Epoch 3/10:
Train Loss: 1.8558 | Val Loss: 1.6727


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 4/10:
Train Loss: 1.5997 | Val Loss: 1.3825


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 5/10:
Train Loss: 1.3140 | Val Loss: 0.9247


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 6/10:
Train Loss: 0.9158 | Val Loss: 0.6620


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 7/10:
Train Loss: 0.6751 | Val Loss: 0.4762


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 8/10:
Train Loss: 0.5243 | Val Loss: 0.3122


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 9/10:
Train Loss: 0.4146 | Val Loss: 0.3312


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]
[I 2025-05-09 18:39:00,824] Trial 50 finished with value: 0.26790922321379185 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.3376300300122432, 'learning_rate': 0.00038417549124764375, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.3456 | Val Loss: 0.2679


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 1/10:
Train Loss: 2.5586 | Val Loss: 2.0897


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 2/10:
Train Loss: 1.9298 | Val Loss: 1.7323


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 3/10:
Train Loss: 1.5829 | Val Loss: 1.3829


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.91it/s]


Epoch 4/10:
Train Loss: 1.0854 | Val Loss: 0.6746


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 5/10:
Train Loss: 0.5948 | Val Loss: 0.4037


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]


Epoch 6/10:
Train Loss: 0.3791 | Val Loss: 0.2326


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.88it/s]


Epoch 7/10:
Train Loss: 0.2653 | Val Loss: 0.1714


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 8/10:
Train Loss: 0.1938 | Val Loss: 0.1502


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 9/10:
Train Loss: 0.1558 | Val Loss: 0.1195


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]
[I 2025-05-09 18:44:33,295] Trial 51 finished with value: 0.08617484872229397 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11374461787161659, 'learning_rate': 0.00024571702076428944, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.1297 | Val Loss: 0.0862


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 1/10:
Train Loss: 2.5596 | Val Loss: 2.1324


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 2/10:
Train Loss: 2.0663 | Val Loss: 1.9370


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 3/10:
Train Loss: 1.8602 | Val Loss: 1.7366


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 4/10:
Train Loss: 1.6713 | Val Loss: 1.5261


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.94it/s]


Epoch 5/10:
Train Loss: 1.4513 | Val Loss: 1.2732


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 6/10:
Train Loss: 1.1347 | Val Loss: 0.7953


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 7/10:
Train Loss: 0.7488 | Val Loss: 0.5112


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.91it/s]


Epoch 8/10:
Train Loss: 0.5300 | Val Loss: 0.3534


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.00it/s]


Epoch 9/10:
Train Loss: 0.3973 | Val Loss: 0.2935


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]
[I 2025-05-09 18:50:05,774] Trial 52 finished with value: 0.2133147157728672 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.13576841668044326, 'learning_rate': 0.00013830441963466592, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.3049 | Val Loss: 0.2133


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 1/10:
Train Loss: 2.5657 | Val Loss: 2.1916


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.90it/s]


Epoch 2/10:
Train Loss: 1.9537 | Val Loss: 1.7398


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 3/10:
Train Loss: 1.5745 | Val Loss: 1.3085


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.93it/s]


Epoch 4/10:
Train Loss: 1.0343 | Val Loss: 0.6095


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 5/10:
Train Loss: 0.5681 | Val Loss: 0.3559


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 6/10:
Train Loss: 0.3713 | Val Loss: 0.2514


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 7/10:
Train Loss: 0.2619 | Val Loss: 0.1653


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]


Epoch 8/10:
Train Loss: 0.1923 | Val Loss: 0.1405


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 9/10:
Train Loss: 0.1510 | Val Loss: 0.1176


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.99it/s]
[I 2025-05-09 18:55:38,251] Trial 53 finished with value: 0.09507956216111779 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11928224882704155, 'learning_rate': 0.000266692369881299, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.1248 | Val Loss: 0.0951


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.98it/s]


Epoch 1/10:
Train Loss: 2.6155 | Val Loss: 2.1832


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.99it/s]


Epoch 2/10:
Train Loss: 2.0494 | Val Loss: 1.9070


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.98it/s]


Epoch 3/10:
Train Loss: 1.7560 | Val Loss: 1.5842


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.99it/s]


Epoch 4/10:
Train Loss: 1.4277 | Val Loss: 1.2154


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  5.01it/s]


Epoch 5/10:
Train Loss: 0.9133 | Val Loss: 0.5714


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.99it/s]


Epoch 6/10:
Train Loss: 0.5018 | Val Loss: 0.3215


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.98it/s]


Epoch 7/10:
Train Loss: 0.3160 | Val Loss: 0.1965


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  5.01it/s]


Epoch 8/10:
Train Loss: 0.2101 | Val Loss: 0.2143


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  5.01it/s]


Epoch 9/10:
Train Loss: 0.1633 | Val Loss: 0.1262


Training: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.94it/s]
[I 2025-05-09 19:02:22,009] Trial 54 finished with value: 0.12620573677122593 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.10937467954567658, 'learning_rate': 0.00026835221026642306, 'batch_size': 128}. Best is trial 43 with value: 0.08592505543492734.


Epoch 10/10:
Train Loss: 0.1259 | Val Loss: 0.1297


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.91it/s]


Epoch 1/10:
Train Loss: 2.6638 | Val Loss: 2.1983


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.97it/s]


Epoch 2/10:
Train Loss: 2.0096 | Val Loss: 1.7891


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 3/10:
Train Loss: 1.5426 | Val Loss: 1.1540


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 4/10:
Train Loss: 0.8339 | Val Loss: 0.5014


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.95it/s]


Epoch 5/10:
Train Loss: 0.4597 | Val Loss: 0.3111


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 6/10:
Train Loss: 0.2970 | Val Loss: 0.1987


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 7/10:
Train Loss: 0.2029 | Val Loss: 0.1531


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 8/10:
Train Loss: 0.1555 | Val Loss: 0.1301


Training: 100%|██████████| 63/63 [00:30<00:00,  2.07it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.96it/s]


Epoch 9/10:
Train Loss: 0.1260 | Val Loss: 0.0972


Training: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]


Epoch 10/10:
Train Loss: 0.1065 | Val Loss: 0.0822


[I 2025-05-09 19:07:54,671] Trial 55 finished with value: 0.08217304898425937 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1125331367392462, 'learning_rate': 0.0003515867391592575, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


New best model found! Val Loss: 0.0822
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1125331367392462, 'learning_rate': 0.0003515867391592575, 'batch_size': 128}


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.08it/s]


Epoch 1/10:
Train Loss: 2.6371 | Val Loss: 2.2006


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.21it/s]


Epoch 2/10:
Train Loss: 1.9934 | Val Loss: 1.7212


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.24it/s]


Epoch 3/10:
Train Loss: 1.4451 | Val Loss: 0.9229


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.22it/s]


Epoch 4/10:
Train Loss: 0.7342 | Val Loss: 0.4891


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.24it/s]


Epoch 5/10:
Train Loss: 0.4533 | Val Loss: 0.3101


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.15it/s]


Epoch 6/10:
Train Loss: 0.3177 | Val Loss: 0.2196


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.23it/s]


Epoch 7/10:
Train Loss: 0.2250 | Val Loss: 0.1666


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.20it/s]


Epoch 8/10:
Train Loss: 0.1821 | Val Loss: 0.1316


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.24it/s]


Epoch 9/10:
Train Loss: 0.1482 | Val Loss: 0.1102


Training: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.23it/s]
[I 2025-05-09 19:12:28,965] Trial 56 finished with value: 0.10087786987423897 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1100732557943388, 'learning_rate': 0.0003567183197205478, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 10/10:
Train Loss: 0.1215 | Val Loss: 0.1009


Training: 100%|██████████| 63/63 [00:41<00:00,  1.53it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.53it/s]


Epoch 1/10:
Train Loss: 3.0772 | Val Loss: 2.9777


Training: 100%|██████████| 63/63 [00:41<00:00,  1.53it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.55it/s]


Epoch 2/10:
Train Loss: 2.9988 | Val Loss: 2.9773


Training: 100%|██████████| 63/63 [00:41<00:00,  1.53it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.54it/s]


Epoch 3/10:
Train Loss: 2.9824 | Val Loss: 2.9806


Training: 100%|██████████| 63/63 [00:41<00:00,  1.53it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.56it/s]


Epoch 4/10:
Train Loss: 2.9821 | Val Loss: 2.9787


Training: 100%|██████████| 63/63 [00:41<00:00,  1.53it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.56it/s]
[I 2025-05-09 19:16:12,394] Trial 57 finished with value: 2.9773357063531876 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1112895253798106, 'learning_rate': 0.0003427070667620144, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 5/10:
Train Loss: 2.9770 | Val Loss: 2.9986
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:31<00:00,  1.98it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.84it/s]


Epoch 1/10:
Train Loss: 3.0801 | Val Loss: 2.9779


Training: 100%|██████████| 63/63 [00:31<00:00,  1.98it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.82it/s]


Epoch 2/10:
Train Loss: 2.9866 | Val Loss: 2.9830


Training: 100%|██████████| 63/63 [00:31<00:00,  1.98it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.83it/s]


Epoch 3/10:
Train Loss: 2.9853 | Val Loss: 2.9822


Training: 100%|██████████| 63/63 [00:31<00:00,  1.98it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.87it/s]
[I 2025-05-09 19:18:30,793] Trial 58 finished with value: 2.97793772816658 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.14940962849213762, 'learning_rate': 0.0005612868373122483, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 4/10:
Train Loss: 2.9804 | Val Loss: 3.0165
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:33<00:00,  1.90it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.61it/s]


Epoch 1/10:
Train Loss: 3.0839 | Val Loss: 2.9799


Training: 100%|██████████| 63/63 [00:33<00:00,  1.90it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.58it/s]


Epoch 2/10:
Train Loss: 2.9853 | Val Loss: 2.9778


Training: 100%|██████████| 63/63 [00:33<00:00,  1.89it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.52it/s]


Epoch 3/10:
Train Loss: 2.9862 | Val Loss: 2.9790


Training: 100%|██████████| 63/63 [00:33<00:00,  1.90it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.46it/s]


Epoch 4/10:
Train Loss: 2.9847 | Val Loss: 2.9791


Training: 100%|██████████| 63/63 [00:33<00:00,  1.90it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  5.60it/s]
[I 2025-05-09 19:21:31,530] Trial 59 finished with value: 2.977843940258026 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.13142540448439888, 'learning_rate': 0.0004393468613530328, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 5/10:
Train Loss: 2.9765 | Val Loss: 3.0341
Early stopping triggered!


Training: 100%|██████████| 63/63 [00:27<00:00,  2.30it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Epoch 1/10:
Train Loss: 2.4934 | Val Loss: 2.0808


Training: 100%|██████████| 63/63 [00:27<00:00,  2.31it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Epoch 2/10:
Train Loss: 1.9234 | Val Loss: 1.7914


Training: 100%|██████████| 63/63 [00:27<00:00,  2.31it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Epoch 3/10:
Train Loss: 1.5796 | Val Loss: 1.2534


Training: 100%|██████████| 63/63 [00:27<00:00,  2.30it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Epoch 4/10:
Train Loss: 1.0186 | Val Loss: 0.6221


Training: 100%|██████████| 63/63 [00:27<00:00,  2.31it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.74it/s]


Epoch 5/10:
Train Loss: 0.6075 | Val Loss: 0.3892


Training: 100%|██████████| 63/63 [00:27<00:00,  2.31it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Epoch 6/10:
Train Loss: 0.4274 | Val Loss: 0.2956


Training: 100%|██████████| 63/63 [00:27<00:00,  2.31it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.69it/s]


Epoch 7/10:
Train Loss: 0.3241 | Val Loss: 0.2383


Training: 100%|██████████| 63/63 [00:27<00:00,  2.30it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Epoch 8/10:
Train Loss: 0.2589 | Val Loss: 0.2040


Training: 100%|██████████| 63/63 [00:27<00:00,  2.30it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Epoch 9/10:
Train Loss: 0.2121 | Val Loss: 0.2419


Training: 100%|██████████| 63/63 [00:27<00:00,  2.30it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]
[I 2025-05-09 19:26:29,161] Trial 60 finished with value: 0.12985380785539746 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.11643200122579085, 'learning_rate': 0.0001627951831864995, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 10/10:
Train Loss: 0.1771 | Val Loss: 0.1299


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 1/10:
Train Loss: 2.6821 | Val Loss: 2.1646


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 2/10:
Train Loss: 2.0203 | Val Loss: 1.8486


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.68it/s]


Epoch 3/10:
Train Loss: 1.6054 | Val Loss: 1.3493


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 4/10:
Train Loss: 0.9920 | Val Loss: 0.5472


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 5/10:
Train Loss: 0.5015 | Val Loss: 0.3401


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 6/10:
Train Loss: 0.2976 | Val Loss: 0.2118


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 7/10:
Train Loss: 0.1990 | Val Loss: 0.1234


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 8/10:
Train Loss: 0.1414 | Val Loss: 0.1161


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.63it/s]


Epoch 9/10:
Train Loss: 0.1079 | Val Loss: 0.1217


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.66it/s]
[I 2025-05-09 19:33:35,917] Trial 61 finished with value: 0.10686289705336094 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.10021660355215999, 'learning_rate': 0.000347132363789722, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 10/10:
Train Loss: 0.0906 | Val Loss: 0.1069


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 1/10:
Train Loss: 2.6567 | Val Loss: 2.1929


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 2/10:
Train Loss: 2.0211 | Val Loss: 1.8279


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 3/10:
Train Loss: 1.6540 | Val Loss: 1.4160


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.67it/s]


Epoch 4/10:
Train Loss: 1.1102 | Val Loss: 0.6612


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 5/10:
Train Loss: 0.5585 | Val Loss: 0.3663


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 6/10:
Train Loss: 0.3348 | Val Loss: 0.2176


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 7/10:
Train Loss: 0.2259 | Val Loss: 0.1725


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 8/10:
Train Loss: 0.1678 | Val Loss: 0.1275


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 9/10:
Train Loss: 0.1234 | Val Loss: 0.0941


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]
[I 2025-05-09 19:40:42,682] Trial 62 finished with value: 0.09311852790415287 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1001764557801352, 'learning_rate': 0.0003333313958300876, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 10/10:
Train Loss: 0.0989 | Val Loss: 0.0931


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 1/10:
Train Loss: 2.5068 | Val Loss: 2.0863


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 2/10:
Train Loss: 1.9591 | Val Loss: 1.8015


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.66it/s]


Epoch 3/10:
Train Loss: 1.6851 | Val Loss: 1.5193


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 4/10:
Train Loss: 1.3805 | Val Loss: 1.1119


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.68it/s]


Epoch 5/10:
Train Loss: 0.8610 | Val Loss: 0.5142


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 6/10:
Train Loss: 0.4960 | Val Loss: 0.3158


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 7/10:
Train Loss: 0.3232 | Val Loss: 0.2120


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 8/10:
Train Loss: 0.2277 | Val Loss: 0.1566


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.69it/s]


Epoch 9/10:
Train Loss: 0.1745 | Val Loss: 0.1280


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]
[I 2025-05-09 19:47:49,442] Trial 63 finished with value: 0.09668544982559979 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11196629212412189, 'learning_rate': 0.00023772266110638055, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 10/10:
Train Loss: 0.1317 | Val Loss: 0.0967


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 1/10:
Train Loss: 2.5350 | Val Loss: 2.1286


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 2/10:
Train Loss: 1.9735 | Val Loss: 1.7992


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 3/10:
Train Loss: 1.6859 | Val Loss: 1.5579


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 4/10:
Train Loss: 1.3561 | Val Loss: 1.0439


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.69it/s]


Epoch 5/10:
Train Loss: 0.8119 | Val Loss: 0.4865


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 6/10:
Train Loss: 0.4766 | Val Loss: 0.3544


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 7/10:
Train Loss: 0.3132 | Val Loss: 0.2304


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 8/10:
Train Loss: 0.2220 | Val Loss: 0.1685


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 9/10:
Train Loss: 0.1693 | Val Loss: 0.1223


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]
[I 2025-05-09 19:54:56,120] Trial 64 finished with value: 0.11061024572700262 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11147006732536316, 'learning_rate': 0.00023604357592705403, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 10/10:
Train Loss: 0.1376 | Val Loss: 0.1106


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 1/10:
Train Loss: 2.5740 | Val Loss: 2.1859


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.71it/s]


Epoch 2/10:
Train Loss: 2.0812 | Val Loss: 1.9663


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.69it/s]


Epoch 3/10:
Train Loss: 1.9029 | Val Loss: 1.7757


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 4/10:
Train Loss: 1.7402 | Val Loss: 1.6354


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 5/10:
Train Loss: 1.6073 | Val Loss: 1.4841


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.67it/s]


Epoch 6/10:
Train Loss: 1.4800 | Val Loss: 1.3584


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.72it/s]


Epoch 7/10:
Train Loss: 1.3447 | Val Loss: 1.2031


Training: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.70it/s]


Epoch 8/10:
Train Loss: 1.1647 | Val Loss: 0.9905


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.68it/s]


Epoch 9/10:
Train Loss: 0.8818 | Val Loss: 0.6287


Training: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]
Evaluating: 100%|██████████| 16/16 [00:03<00:00,  4.69it/s]
[I 2025-05-09 20:02:02,776] Trial 65 finished with value: 0.44829947873950005 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1315420336099651, 'learning_rate': 0.00012277474690283982, 'batch_size': 128}. Best is trial 55 with value: 0.08217304898425937.


Epoch 10/10:
Train Loss: 0.6386 | Val Loss: 0.4483


Training: 100%|██████████| 63/63 [00:23<00:00,  2.73it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.44it/s]


Epoch 1/10:
Train Loss: 2.9138 | Val Loss: 2.3684


Training: 100%|██████████| 63/63 [00:23<00:00,  2.73it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.47it/s]


Epoch 2/10:
Train Loss: 2.2835 | Val Loss: 2.1614


Training: 100%|██████████| 63/63 [00:23<00:00,  2.73it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.43it/s]


Epoch 3/10:
Train Loss: 2.1259 | Val Loss: 2.0255


Training: 100%|██████████| 63/63 [00:23<00:00,  2.72it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.45it/s]


Epoch 4/10:
Train Loss: 2.0141 | Val Loss: 1.9259


Training: 100%|██████████| 63/63 [00:23<00:00,  2.72it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.44it/s]


Epoch 5/10:
Train Loss: 1.9175 | Val Loss: 1.8359


Training: 100%|██████████| 63/63 [00:23<00:00,  2.72it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.44it/s]


Epoch 6/10:
Train Loss: 1.8399 | Val Loss: 1.7500


Training: 100%|██████████| 63/63 [00:23<00:00,  2.73it/s]
Evaluating: 100%|██████████| 16/16 [00:02<00:00,  7.44it/s]


Epoch 7/10:
Train Loss: 1.7622 | Val Loss: 1.6861


Training:  84%|████████▍ | 53/63 [00:19<00:03,  2.69it/s]
[W 2025-05-09 20:05:19,503] Trial 66 failed with parameters: {'d_model': 128, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14189016758343986, 'learning_rate': 0.00032315386363982064, 'batch_size': 128} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-1-9b10502612f3>", line 338, in objective
    current_val_loss = train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-9b10502612f3>", line 278, in train_model
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
                 ^^

KeyboardInterrupt: 