In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Set random seeds for reproducibility
# (only PyTorch's and Python's `random` generators are seeded here)
torch.manual_seed(42)
random.seed(42)

# Device configuration
# Prefer CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data Preparation

def load_data(file_path):
    """Read cipher input/output pairs from an Excel workbook.

    Rows whose ``output`` text is longer than 200 characters are discarded.
    When more than 2000 rows remain, a fixed-seed random sample of 2000 rows
    is kept instead.

    Args:
        file_path: Location of the workbook; it must provide ``input`` and
            ``output`` columns.

    Returns:
        A ``(inputs, outputs)`` tuple of two equally long lists.
    """
    frame = pd.read_excel(file_path)

    # Keep only the pairs whose target text has at most 200 characters.
    short_enough = frame['output'].str.len() <= 200
    frame = frame[short_enough]

    # Cap the dataset size; the fixed random_state keeps the subset stable.
    if len(frame) > 2000:
        frame = frame.sample(n=2000, random_state=42)

    return frame['input'].tolist(), frame['output'].tolist()

# Tokenization and Vocabulary
class Vocabulary:
    """Character-level vocabulary over ``string.printable`` plus four specials.

    Ids 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``;
    the printable characters follow in ``string.printable`` order.
    """

    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        self.pad_token = 0
        self.sos_token = 1
        self.eos_token = 2
        self.unk_token = 3
        self._build_vocab()

    def _build_vocab(self):
        """Populate the symbol->id and id->symbol lookup tables."""
        # Special tokens come first so their ids match the *_token attributes.
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + list(string.printable)
        self.char2idx = {symbol: position for position, symbol in enumerate(symbols)}
        self.idx2char = {position: symbol for symbol, position in self.char2idx.items()}

    def __len__(self):
        """Return the number of known symbols (specials included)."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of *text* to its id; unknown ones become ``<UNK>``."""
        ids = []
        for ch in text:
            ids.append(self.char2idx.get(ch, self.unk_token))
        return ids

    def decode(self, indices):
        """Join the symbols for *indices*, skipping PAD/SOS/EOS ids.

        Ids missing from the table are rendered as the literal ``<UNK>``.
        """
        skipped = {self.pad_token, self.sos_token, self.eos_token}
        pieces = []
        for idx in indices:
            if idx in skipped:
                continue
            pieces.append(self.idx2char.get(idx, '<UNK>'))
        return ''.join(pieces)

# Dataset Class
class CipherDataset(data.Dataset):
    """Dataset of (input, output) text pairs encoded as fixed-length id tensors.

    Every text is converted to ``<SOS> char-ids <EOS>`` and right-padded with
    the pad id up to ``max_length``. Texts that do not fit are truncated
    *before* the end marker is appended, so ``<EOS>`` is never cut off.

    Args:
        inputs: Sequence of source texts (each item is passed through ``str``).
        outputs: Sequence of target texts, aligned with ``inputs``.
        vocab: Object providing ``encode`` plus ``pad_token``, ``sos_token``
            and ``eos_token`` ids.
        max_length: Length of every returned tensor.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.inputs)

    def _encode_fixed(self, text):
        """Encode *text* into a ``max_length`` long tensor of token ids."""
        # Reserve two slots for <SOS>/<EOS>; truncating afterwards would drop
        # <EOS> from over-long texts and the model would never see them end.
        room = max(self.max_length - 2, 0)
        ids = [self.vocab.sos_token] + self.vocab.encode(text)[:room] + [self.vocab.eos_token]
        ids += [self.vocab.pad_token] * (self.max_length - len(ids))
        # The final slice only matters for degenerate max_length < 2.
        return torch.tensor(ids[:self.max_length], dtype=torch.long)

    def __getitem__(self, idx):
        """Return the ``(input_ids, output_ids)`` tensors for pair *idx*."""
        input_text = str(self.inputs[idx])
        output_text = str(self.outputs[idx])
        return self._encode_fixed(input_text), self._encode_fixed(output_text)

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned projections.

    ``d_model`` is divided evenly among ``num_heads`` heads, each of width
    ``d_k = d_model // num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query, key, value and output projections.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend from Q over K/V; positions where ``mask == 0`` are suppressed."""
        scale = math.sqrt(self.d_k)
        scores = (Q @ K.transpose(-2, -1)) / scale
        if mask is not None:
            # A large negative score drives the softmax weight to ~0.
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to (batch, seq, d_model)."""
        batch, _, length, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch, length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, and project the merged result."""
        q_heads = self.split_heads(self.W_q(Q))
        k_heads = self.split_heads(self.W_k(K))
        v_heads = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
        return self.W_o(self.combine_heads(context))

class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network: ``d_model -> d_ff -> d_model`` with ReLU."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU, then project back to ``d_model``."""
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        return self.fc2(hidden)

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    Even feature columns hold sines and odd columns hold cosines of the
    position scaled by geometrically spaced frequencies. The table is stored
    as the non-trainable buffer ``pe`` of shape ``(1, max_seq_length, d_model)``.

    Args:
        d_model: Embedding width (odd values are supported).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run *x* through the block; *mask* is forwarded to self-attention."""
        # Sub-layer 1: self-attention with residual + norm.
        attended = self.self_attn(x, x, x, mask)
        attended = self.dropout(attended)
        x = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward with residual + norm.
        transformed = self.feed_forward(x)
        transformed = self.dropout(transformed)
        return self.norm2(x + transformed)

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run *x* through the block.

        ``tgt_mask`` is applied to self-attention over *x*; ``src_mask`` is
        applied to cross-attention, whose keys and values are ``enc_output``.
        """
        # Sub-layer 1: self-attention over the decoder states.
        self_attended = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.norm1(x + self_attended)

        # Sub-layer 2: attention from decoder states over the encoder output.
        cross_attended = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + cross_attended)

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm3(x + transformed)

class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target logits.

    Args:
        src_vocab_size: Number of rows in the encoder embedding table.
        tgt_vocab_size: Number of rows in the decoder embedding table and
            width of the output logits.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_length: Number of positions in the positional-encoding table.
        dropout: Dropout probability used on embeddings and inside layers.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the attention masks for a batch of id tensors.

        Id 0 is treated as padding. Returns ``(src_mask, tgt_mask)`` where
        ``src_mask`` has shape ``(batch, 1, 1, src_len)`` and ``tgt_mask`` has
        shape ``(batch, 1, tgt_len, tgt_len)``; the latter combines the
        padding mask with a lower-triangular (no look-ahead) mask.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        # Build the causal mask on the same device as the inputs rather than
        # on a module-level global, so the model works wherever it is placed.
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        # Encoder stack.
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Decoder stack; every layer attends over the final encoder output.
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one teacher-forced training pass and return the mean batch loss.

    The decoder receives every target token except the last and is scored
    against the target shifted left by one. Gradients are clipped to a total
    norm of 1.0 before each optimizer step.
    """
    model.train()
    running_loss = 0
    for src, tgt in tqdm(train_loader, desc="Training"):
        src = src.to(device)
        tgt = tgt.to(device)
        decoder_input, labels = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the criterion.
        loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Return the mean teacher-forced batch loss over *val_loader*.

    The model is switched to eval mode and no gradients are recorded.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc="Evaluating"):
            src = src.to(device)
            tgt = tgt.to(device)
            decoder_input, labels = tgt[:, :-1], tgt[:, 1:]

            logits = model(src, decoder_input)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the criterion.
            batch_loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
            running_loss += batch_loss.item()
    return running_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return teacher-forced token accuracy, ignoring padded positions.

    For each batch the argmax prediction at every step is compared with the
    target shifted left by one; positions whose label is ``vocab.pad_token``
    are excluded from both numerator and denominator. Returns 0 when there is
    nothing to score.
    """
    model.eval()
    hits = 0
    counted = 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            guesses = model(src, tgt[:, :-1]).argmax(dim=-1)

            labels = tgt[:, 1:]
            is_real = labels != vocab.pad_token
            hits += ((guesses == labels) & is_real).sum().item()
            counted += is_real.sum().item()
    if counted > 0:
        return hits / counted
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train with early stopping and leave *model* at its best checkpoint.

    Each epoch runs ``train_epoch`` and ``evaluate``, then steps *scheduler*
    with the validation loss. Training stops early once the validation loss
    has failed to improve for *patience* consecutive epochs.

    Args:
        model: Network to optimise (modified in place).
        train_loader: Batches used for training.
        val_loader: Batches used for validation.
        optimizer: Optimizer bound to ``model.parameters()``.
        criterion: Loss function forwarded to the epoch helpers.
        scheduler: LR scheduler whose ``step`` accepts the validation loss.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.

    Returns:
        The lowest validation loss observed. On return the model carries the
        weights from that epoch, so saving it afterwards stores the state
        that actually achieved the returned loss.
    """
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot the weights; later epochs may be worse than this one.
            best_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered!")
                break

    # Roll back to the best epoch so the returned loss matches the weights.
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss  # Return the best validation loss from this training run

# Global variables to track best model across all trials.
# `objective` rebinds all three whenever a trial beats `best_overall_loss`.
best_overall_model = None  # deep copy of the best trial's model state_dict
best_overall_loss = float('inf')  # lowest validation loss returned by any trial so far
best_config = None  # hyperparameter dict of the trial that set best_overall_loss

# Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one sampled configuration, return its val loss.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``vocab`` and
    ``max_length``. When the trial beats the best loss seen so far, the
    module-level trackers are updated and the model's state dict is written
    to the checkpoint path on Google Drive.
    """
    global best_overall_model, best_overall_loss, best_config

    # Sample the hyperparameters (the call order is kept stable on purpose).
    config = {}
    config["d_model"] = trial.suggest_categorical("d_model", [128, 256, 512])
    config["num_heads"] = trial.suggest_categorical("num_heads", [2, 4, 8, 16])
    config["num_layers"] = trial.suggest_categorical("num_layers", [6, 8, 10, 12])
    config["d_ff"] = trial.suggest_categorical("d_ff", [256, 512, 1024])
    config["dropout"] = trial.suggest_float("dropout", 0.1, 0.4)
    config["learning_rate"] = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    config["batch_size"] = trial.suggest_categorical("batch_size", [32])

    # Data loaders sized by the sampled batch size.
    batch_size = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

    # Fresh model for this trial; source and target share one vocabulary.
    vocab_size = len(vocab)
    model = Transformer(
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        d_ff=config["d_ff"],
        max_seq_length=max_length,
        dropout=config["dropout"],
    )
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    # Train and get the best validation loss for this configuration.
    current_val_loss = train_model(
        model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10
    )

    # Nothing more to do unless this trial is the best one so far.
    if not current_val_loss < best_overall_loss:
        return current_val_loss

    best_overall_loss = current_val_loss
    best_overall_model = copy.deepcopy(model.state_dict())
    best_config = config
    torch.save(best_overall_model, '/content/drive/MyDrive/best_mono_key_5.pth')
    print(f"New best model found! Val Loss: {current_val_loss:.4f}")
    print(f"Config: {config}")

    return current_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for a single input text.

    The text is wrapped in SOS/EOS ids and padded or truncated to
    ``max_length``. Decoding starts from a lone SOS id and appends the
    arg-max token of the last position until EOS is predicted or
    ``max_length - 1`` steps have run.

    Returns:
        The string produced by ``vocab.decode`` for the generated ids.
    """
    model.eval()
    with torch.no_grad():
        # Fixed-length encoder input with a leading batch dimension.
        source_ids = [vocab.sos_token, *vocab.encode(str(text)), vocab.eos_token]
        source_ids.extend([vocab.pad_token] * (max_length - len(source_ids)))
        encoded = torch.tensor(source_ids[:max_length]).unsqueeze(0).to(device)

        target = torch.tensor([[vocab.sos_token]]).to(device)

        for _step in range(max_length - 1):
            logits = model(encoded, target)
            # Only the prediction for the newest position matters.
            next_token = logits.argmax(2)[:, -1].item()
            if next_token == vocab.eos_token:
                break
            appended = torch.tensor([[next_token]]).to(device)
            target = torch.cat([target, appended], dim=1)

        generated = target[0].cpu().numpy()
        return vocab.decode(generated)

# Main Execution
# Script entry point: load the data, run the Optuna search, then reload the
# best checkpoint and report loss, accuracy and a few sample decryptions.
if __name__ == "__main__":
    # Load and prepare data
    inputs, outputs = load_data('/content/Full_training_mono_5.xlsx')
    vocab = Vocabulary()
    max_length = 256  # Adjusted for longer sentences

    # Split data
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets (read as module-level globals by `objective`)
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)  # 50 trials, no time limit

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # `objective` only sets best_config when a trial improves on +inf, so a
    # search where every trial yields a NaN loss leaves nothing to reload.
    if best_config is None:
        raise RuntimeError("No trial produced a usable model; nothing to evaluate.")

    # Load the best model found during the search
    final_model = Transformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    # map_location lets a checkpoint saved on GPU be reloaded on a CPU runtime.
    final_model.load_state_dict(
        torch.load('/content/drive/MyDrive/best_mono_key_5.pth', map_location=device)
    )

    # Evaluate on full datasets
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption
    # NOTE(review): the third prompt has no space after "cipher:" unlike the
    # other two -- confirm whether that is intentional.
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher:xibu", "what")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cuda


[I 2025-05-09 12:43:42,406] A new study created in memory with name: no-name-74b64105-7156-49bd-9d17-dc5df0ca17b3
Training: 100%|██████████| 50/50 [00:09<00:00,  5.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.85it/s]


Epoch 1/10:
Train Loss: 3.0001 | Val Loss: 2.4297


Training: 100%|██████████| 50/50 [00:08<00:00,  6.24it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.82it/s]


Epoch 2/10:
Train Loss: 2.3450 | Val Loss: 2.2102


Training: 100%|██████████| 50/50 [00:08<00:00,  6.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.87it/s]


Epoch 3/10:
Train Loss: 2.1865 | Val Loss: 2.0932


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.88it/s]


Epoch 4/10:
Train Loss: 2.0723 | Val Loss: 1.9941


Training: 100%|██████████| 50/50 [00:08<00:00,  6.24it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.83it/s]


Epoch 5/10:
Train Loss: 1.9688 | Val Loss: 1.9087


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.81it/s]


Epoch 6/10:
Train Loss: 1.8258 | Val Loss: 1.6893


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.88it/s]


Epoch 7/10:
Train Loss: 1.6560 | Val Loss: 1.5424


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.86it/s]


Epoch 8/10:
Train Loss: 1.4702 | Val Loss: 1.3583


Training: 100%|██████████| 50/50 [00:08<00:00,  6.24it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.80it/s]


Epoch 9/10:
Train Loss: 1.2519 | Val Loss: 1.0749


Training: 100%|██████████| 50/50 [00:08<00:00,  6.24it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.87it/s]


Epoch 10/10:
Train Loss: 1.0849 | Val Loss: 0.8793


[I 2025-05-09 12:45:17,239] Trial 0 finished with value: 0.8792857711131756 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.3150478112674391, 'learning_rate': 0.00023269305557873076, 'batch_size': 32}. Best is trial 0 with value: 0.8792857711131756.


New best model found! Val Loss: 0.8793
Config: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.3150478112674391, 'learning_rate': 0.00023269305557873076, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:06<00:00,  8.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.25it/s]


Epoch 1/10:
Train Loss: 3.1354 | Val Loss: 2.5251


Training: 100%|██████████| 50/50 [00:06<00:00,  8.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.50it/s]


Epoch 2/10:
Train Loss: 2.4513 | Val Loss: 2.2707


Training: 100%|██████████| 50/50 [00:06<00:00,  8.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.50it/s]


Epoch 3/10:
Train Loss: 2.2834 | Val Loss: 2.1753


Training: 100%|██████████| 50/50 [00:06<00:00,  8.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.17it/s]


Epoch 4/10:
Train Loss: 2.2091 | Val Loss: 2.1064


Training: 100%|██████████| 50/50 [00:06<00:00,  8.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.56it/s]


Epoch 5/10:
Train Loss: 2.1427 | Val Loss: 2.0429


Training: 100%|██████████| 50/50 [00:06<00:00,  8.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 31.92it/s]


Epoch 6/10:
Train Loss: 2.0962 | Val Loss: 2.0300


Training: 100%|██████████| 50/50 [00:06<00:00,  8.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.16it/s]


Epoch 7/10:
Train Loss: 2.0633 | Val Loss: 1.9769


Training: 100%|██████████| 50/50 [00:06<00:00,  8.20it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.34it/s]


Epoch 8/10:
Train Loss: 2.0152 | Val Loss: 1.9525


Training: 100%|██████████| 50/50 [00:06<00:00,  8.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.43it/s]


Epoch 9/10:
Train Loss: 1.9671 | Val Loss: 1.9227


Training: 100%|██████████| 50/50 [00:06<00:00,  8.20it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.48it/s]
[I 2025-05-09 12:46:22,512] Trial 1 finished with value: 1.8487728375654955 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2803917864388922, 'learning_rate': 0.00012486176082041945, 'batch_size': 32}. Best is trial 0 with value: 0.8792857711131756.


Epoch 10/10:
Train Loss: 1.9172 | Val Loss: 1.8488


Training: 100%|██████████| 50/50 [00:08<00:00,  5.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.10it/s]


Epoch 1/10:
Train Loss: 3.0957 | Val Loss: 2.9794


Training: 100%|██████████| 50/50 [00:08<00:00,  5.64it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.57it/s]


Epoch 2/10:
Train Loss: 2.9992 | Val Loss: 2.9695


Training: 100%|██████████| 50/50 [00:08<00:00,  5.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.54it/s]


Epoch 3/10:
Train Loss: 2.9910 | Val Loss: 2.9698


Training: 100%|██████████| 50/50 [00:08<00:00,  5.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.49it/s]


Epoch 4/10:
Train Loss: 2.9860 | Val Loss: 2.9688


Training: 100%|██████████| 50/50 [00:08<00:00,  5.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.48it/s]


Epoch 5/10:
Train Loss: 2.9835 | Val Loss: 2.9743


Training: 100%|██████████| 50/50 [00:08<00:00,  5.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.47it/s]


Epoch 6/10:
Train Loss: 2.9815 | Val Loss: 2.9722


Training: 100%|██████████| 50/50 [00:08<00:00,  5.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.15it/s]
[I 2025-05-09 12:47:28,434] Trial 2 finished with value: 2.9687948410327616 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.16194888073978017, 'learning_rate': 0.004428606704583406, 'batch_size': 32}. Best is trial 0 with value: 0.8792857711131756.


Epoch 7/10:
Train Loss: 2.9822 | Val Loss: 2.9691
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:02<00:00, 19.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 59.91it/s]


Epoch 1/10:
Train Loss: 2.9853 | Val Loss: 2.4017


Training: 100%|██████████| 50/50 [00:02<00:00, 19.25it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 60.97it/s]


Epoch 2/10:
Train Loss: 2.3978 | Val Loss: 2.2745


Training: 100%|██████████| 50/50 [00:02<00:00, 19.33it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 56.86it/s]


Epoch 3/10:
Train Loss: 2.2957 | Val Loss: 2.1991


Training: 100%|██████████| 50/50 [00:02<00:00, 19.34it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 60.15it/s]


Epoch 4/10:
Train Loss: 2.2184 | Val Loss: 2.1260


Training: 100%|██████████| 50/50 [00:02<00:00, 19.33it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 60.85it/s]


Epoch 5/10:
Train Loss: 2.1597 | Val Loss: 2.0677


Training: 100%|██████████| 50/50 [00:02<00:00, 19.30it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 60.94it/s]


Epoch 6/10:
Train Loss: 2.1020 | Val Loss: 1.9957


Training: 100%|██████████| 50/50 [00:02<00:00, 19.43it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 60.93it/s]


Epoch 7/10:
Train Loss: 2.0504 | Val Loss: 1.9496


Training: 100%|██████████| 50/50 [00:02<00:00, 19.63it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 57.63it/s]


Epoch 8/10:
Train Loss: 1.9877 | Val Loss: 1.8815


Training: 100%|██████████| 50/50 [00:02<00:00, 19.57it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 60.77it/s]


Epoch 9/10:
Train Loss: 1.9245 | Val Loss: 1.8033


Training: 100%|██████████| 50/50 [00:02<00:00, 18.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 60.43it/s]
[I 2025-05-09 12:47:56,595] Trial 3 finished with value: 1.7180115167911236 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.341479322572031, 'learning_rate': 0.0006186610492265709, 'batch_size': 32}. Best is trial 0 with value: 0.8792857711131756.


Epoch 10/10:
Train Loss: 1.8717 | Val Loss: 1.7180


Training: 100%|██████████| 50/50 [00:05<00:00,  9.10it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.16it/s]


Epoch 1/10:
Train Loss: 2.8896 | Val Loss: 2.3429


Training: 100%|██████████| 50/50 [00:05<00:00,  9.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.88it/s]


Epoch 2/10:
Train Loss: 2.3069 | Val Loss: 2.2153


Training: 100%|██████████| 50/50 [00:05<00:00,  9.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.32it/s]


Epoch 3/10:
Train Loss: 2.1473 | Val Loss: 2.0436


Training: 100%|██████████| 50/50 [00:05<00:00,  9.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.20it/s]


Epoch 4/10:
Train Loss: 2.0185 | Val Loss: 1.9655


Training: 100%|██████████| 50/50 [00:05<00:00,  9.05it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.30it/s]


Epoch 5/10:
Train Loss: 1.9142 | Val Loss: 1.8689


Training: 100%|██████████| 50/50 [00:05<00:00,  9.05it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.74it/s]


Epoch 6/10:
Train Loss: 1.7845 | Val Loss: 1.6997


Training: 100%|██████████| 50/50 [00:05<00:00,  9.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.05it/s]


Epoch 7/10:
Train Loss: 1.5823 | Val Loss: 1.4660


Training: 100%|██████████| 50/50 [00:05<00:00,  9.11it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.13it/s]


Epoch 8/10:
Train Loss: 1.3475 | Val Loss: 1.2971


Training: 100%|██████████| 50/50 [00:05<00:00,  9.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.33it/s]


Epoch 9/10:
Train Loss: 1.1159 | Val Loss: 0.9639


Training: 100%|██████████| 50/50 [00:05<00:00,  9.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.12it/s]
[I 2025-05-09 12:48:55,499] Trial 4 finished with value: 0.7704469011380122 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.15534663113015706, 'learning_rate': 0.0005080773373284311, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 10/10:
Train Loss: 0.8837 | Val Loss: 0.7704
New best model found! Val Loss: 0.7704
Config: {'d_model': 256, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.15534663113015706, 'learning_rate': 0.0005080773373284311, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.35it/s]


Epoch 1/10:
Train Loss: 3.0920 | Val Loss: 2.9745


Training: 100%|██████████| 50/50 [00:09<00:00,  5.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.37it/s]


Epoch 2/10:
Train Loss: 2.9960 | Val Loss: 2.9743


Training: 100%|██████████| 50/50 [00:09<00:00,  5.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.45it/s]


Epoch 3/10:
Train Loss: 2.9899 | Val Loss: 2.9690


Training: 100%|██████████| 50/50 [00:09<00:00,  5.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.36it/s]


Epoch 4/10:
Train Loss: 2.9877 | Val Loss: 2.9740


Training: 100%|██████████| 50/50 [00:09<00:00,  5.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.36it/s]


Epoch 5/10:
Train Loss: 2.9879 | Val Loss: 2.9687


Training: 100%|██████████| 50/50 [00:09<00:00,  5.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.43it/s]


Epoch 6/10:
Train Loss: 2.9879 | Val Loss: 2.9715


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.39it/s]


Epoch 7/10:
Train Loss: 2.9808 | Val Loss: 2.9622


Training: 100%|██████████| 50/50 [00:09<00:00,  5.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.36it/s]


Epoch 8/10:
Train Loss: 2.9783 | Val Loss: 2.9607


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.42it/s]


Epoch 9/10:
Train Loss: 2.9787 | Val Loss: 2.9616


Training: 100%|██████████| 50/50 [00:09<00:00,  5.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.39it/s]
[I 2025-05-09 12:50:39,208] Trial 5 finished with value: 2.9606942580296445 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.12503810591714795, 'learning_rate': 0.001330592779571503, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 10/10:
Train Loss: 2.9774 | Val Loss: 2.9624


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.49it/s]


Epoch 1/10:
Train Loss: 3.1018 | Val Loss: 3.0002


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.47it/s]


Epoch 2/10:
Train Loss: 2.9994 | Val Loss: 2.9906


Training: 100%|██████████| 50/50 [00:11<00:00,  4.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.49it/s]


Epoch 3/10:
Train Loss: 2.9905 | Val Loss: 2.9868


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.48it/s]


Epoch 4/10:
Train Loss: 2.9868 | Val Loss: 2.9886


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.46it/s]


Epoch 5/10:
Train Loss: 2.9845 | Val Loss: 2.9799


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.47it/s]


Epoch 6/10:
Train Loss: 2.9825 | Val Loss: 2.9814


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.46it/s]


Epoch 7/10:
Train Loss: 2.9824 | Val Loss: 2.9799


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.49it/s]


Epoch 8/10:
Train Loss: 2.9814 | Val Loss: 2.9851


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.45it/s]


Epoch 9/10:
Train Loss: 2.9793 | Val Loss: 2.9742


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.49it/s]
[I 2025-05-09 12:52:41,649] Trial 6 finished with value: 2.9741641924931455 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.37966222313990194, 'learning_rate': 0.0035244093028499886, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 10/10:
Train Loss: 2.9778 | Val Loss: 2.9758


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.63it/s]


Epoch 1/10:
Train Loss: 3.1606 | Val Loss: 2.9972


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.65it/s]


Epoch 2/10:
Train Loss: 3.0069 | Val Loss: 2.9933


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.66it/s]


Epoch 3/10:
Train Loss: 2.9978 | Val Loss: 2.9804


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.66it/s]


Epoch 4/10:
Train Loss: 2.9908 | Val Loss: 2.9760


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.66it/s]


Epoch 5/10:
Train Loss: 2.9874 | Val Loss: 2.9825


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.65it/s]


Epoch 6/10:
Train Loss: 2.9847 | Val Loss: 2.9841


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.60it/s]
[I 2025-05-09 12:54:21,341] Trial 7 finished with value: 2.975977659225464 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.3686662714333505, 'learning_rate': 0.0020645414234068946, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 7/10:
Train Loss: 2.9842 | Val Loss: 2.9801
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:06<00:00,  7.50it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.58it/s]


Epoch 1/10:
Train Loss: 3.3211 | Val Loss: 2.6820


Training: 100%|██████████| 50/50 [00:06<00:00,  7.46it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.69it/s]


Epoch 2/10:
Train Loss: 2.5636 | Val Loss: 2.3677


Training: 100%|██████████| 50/50 [00:06<00:00,  7.52it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.57it/s]


Epoch 3/10:
Train Loss: 2.3560 | Val Loss: 2.2392


Training: 100%|██████████| 50/50 [00:06<00:00,  7.50it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.67it/s]


Epoch 4/10:
Train Loss: 2.2550 | Val Loss: 2.1597


Training: 100%|██████████| 50/50 [00:06<00:00,  7.51it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.62it/s]


Epoch 5/10:
Train Loss: 2.1843 | Val Loss: 2.1008


Training: 100%|██████████| 50/50 [00:06<00:00,  7.49it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.63it/s]


Epoch 6/10:
Train Loss: 2.1292 | Val Loss: 2.0849


Training: 100%|██████████| 50/50 [00:06<00:00,  7.47it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.64it/s]


Epoch 7/10:
Train Loss: 2.0879 | Val Loss: 2.0209


Training: 100%|██████████| 50/50 [00:06<00:00,  7.49it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.32it/s]


Epoch 8/10:
Train Loss: 2.0496 | Val Loss: 2.0032


Training: 100%|██████████| 50/50 [00:06<00:00,  7.49it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.66it/s]


Epoch 9/10:
Train Loss: 2.0150 | Val Loss: 1.9541


Training: 100%|██████████| 50/50 [00:06<00:00,  7.49it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.65it/s]
[I 2025-05-09 12:55:34,224] Trial 8 finished with value: 1.9260413921796358 and parameters: {'d_model': 128, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.1704482265351851, 'learning_rate': 0.00019880816210242583, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 10/10:
Train Loss: 1.9777 | Val Loss: 1.9260


Training: 100%|██████████| 50/50 [00:04<00:00, 12.04it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 32.98it/s]


Epoch 1/10:
Train Loss: 3.0674 | Val Loss: 2.5074


Training: 100%|██████████| 50/50 [00:04<00:00, 12.10it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.48it/s]


Epoch 2/10:
Train Loss: 2.4329 | Val Loss: 2.2965


Training: 100%|██████████| 50/50 [00:04<00:00, 11.49it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.50it/s]


Epoch 3/10:
Train Loss: 2.2699 | Val Loss: 2.1722


Training: 100%|██████████| 50/50 [00:04<00:00, 12.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.41it/s]


Epoch 4/10:
Train Loss: 2.1826 | Val Loss: 2.0962


Training: 100%|██████████| 50/50 [00:04<00:00, 11.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.47it/s]


Epoch 5/10:
Train Loss: 2.1042 | Val Loss: 2.0425


Training: 100%|██████████| 50/50 [00:04<00:00, 12.01it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.08it/s]


Epoch 6/10:
Train Loss: 2.0430 | Val Loss: 1.9876


Training: 100%|██████████| 50/50 [00:04<00:00, 11.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.27it/s]


Epoch 7/10:
Train Loss: 1.9813 | Val Loss: 1.9290


Training: 100%|██████████| 50/50 [00:04<00:00, 11.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.61it/s]


Epoch 8/10:
Train Loss: 1.8998 | Val Loss: 1.8366


Training: 100%|██████████| 50/50 [00:04<00:00, 12.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.36it/s]


Epoch 9/10:
Train Loss: 1.8260 | Val Loss: 1.7682


Training: 100%|██████████| 50/50 [00:04<00:00, 12.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.22it/s]
[I 2025-05-09 12:56:20,085] Trial 9 finished with value: 1.697761948292072 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.10764780868619649, 'learning_rate': 0.0002347595643384507, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 10/10:
Train Loss: 1.7473 | Val Loss: 1.6978


Training: 100%|██████████| 50/50 [00:06<00:00,  7.31it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.11it/s]


Epoch 1/10:
Train Loss: 3.1380 | Val Loss: 2.9869


Training: 100%|██████████| 50/50 [00:06<00:00,  7.32it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.18it/s]


Epoch 2/10:
Train Loss: 3.0000 | Val Loss: 2.9807


Training: 100%|██████████| 50/50 [00:06<00:00,  7.34it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.22it/s]


Epoch 3/10:
Train Loss: 2.9900 | Val Loss: 2.9741


Training: 100%|██████████| 50/50 [00:06<00:00,  7.32it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.20it/s]


Epoch 4/10:
Train Loss: 2.9853 | Val Loss: 2.9688


Training: 100%|██████████| 50/50 [00:06<00:00,  7.34it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 29.97it/s]


Epoch 5/10:
Train Loss: 2.9843 | Val Loss: 2.9727


Training: 100%|██████████| 50/50 [00:06<00:00,  7.34it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.13it/s]


Epoch 6/10:
Train Loss: 2.9831 | Val Loss: 2.9766


Training: 100%|██████████| 50/50 [00:06<00:00,  7.33it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.12it/s]
[I 2025-05-09 12:57:11,074] Trial 10 finished with value: 2.9688096596644473 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.2204794669478093, 'learning_rate': 0.008964316735597272, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 7/10:
Train Loss: 2.9832 | Val Loss: 2.9757
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:08<00:00,  6.24it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.79it/s]


Epoch 1/10:
Train Loss: 3.1192 | Val Loss: 2.9773


Training: 100%|██████████| 50/50 [00:08<00:00,  6.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.84it/s]


Epoch 2/10:
Train Loss: 3.0029 | Val Loss: 2.9849


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.76it/s]


Epoch 3/10:
Train Loss: 3.0008 | Val Loss: 2.9655


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.82it/s]


Epoch 4/10:
Train Loss: 2.9950 | Val Loss: 2.9735


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.86it/s]


Epoch 5/10:
Train Loss: 2.9908 | Val Loss: 2.9672


Training: 100%|██████████| 50/50 [00:08<00:00,  6.21it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.88it/s]
[I 2025-05-09 12:58:03,812] Trial 11 finished with value: 2.9655242149646464 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.2672571887420552, 'learning_rate': 0.0005242413026319762, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 6/10:
Train Loss: 2.9911 | Val Loss: 2.9754
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:05<00:00,  8.83it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.07it/s]


Epoch 1/10:
Train Loss: 3.0266 | Val Loss: 2.4675


Training: 100%|██████████| 50/50 [00:05<00:00,  8.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.98it/s]


Epoch 2/10:
Train Loss: 2.3823 | Val Loss: 2.2733


Training: 100%|██████████| 50/50 [00:05<00:00,  8.80it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.97it/s]


Epoch 3/10:
Train Loss: 2.2499 | Val Loss: 2.1955


Training: 100%|██████████| 50/50 [00:05<00:00,  8.82it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.02it/s]


Epoch 4/10:
Train Loss: 2.1589 | Val Loss: 2.1183


Training: 100%|██████████| 50/50 [00:05<00:00,  8.81it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.02it/s]


Epoch 5/10:
Train Loss: 2.0908 | Val Loss: 2.0512


Training: 100%|██████████| 50/50 [00:05<00:00,  8.82it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.88it/s]


Epoch 6/10:
Train Loss: 2.0390 | Val Loss: 2.0080


Training: 100%|██████████| 50/50 [00:05<00:00,  8.77it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.96it/s]


Epoch 7/10:
Train Loss: 1.9966 | Val Loss: 1.9680


Training: 100%|██████████| 50/50 [00:05<00:00,  8.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.10it/s]


Epoch 8/10:
Train Loss: 1.9503 | Val Loss: 1.9301


Training: 100%|██████████| 50/50 [00:05<00:00,  8.80it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.05it/s]


Epoch 9/10:
Train Loss: 1.9180 | Val Loss: 1.9283


Training: 100%|██████████| 50/50 [00:05<00:00,  8.82it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.87it/s]
[I 2025-05-09 12:59:05,860] Trial 12 finished with value: 1.9050105901864858 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.31437915581599496, 'learning_rate': 0.00041106348336042734, 'batch_size': 32}. Best is trial 4 with value: 0.7704469011380122.


Epoch 10/10:
Train Loss: 1.8873 | Val Loss: 1.9050


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.83it/s]


Epoch 1/10:
Train Loss: 2.7888 | Val Loss: 2.2853


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.78it/s]


Epoch 2/10:
Train Loss: 2.2419 | Val Loss: 2.1394


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.75it/s]


Epoch 3/10:
Train Loss: 2.1050 | Val Loss: 2.0151


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.77it/s]


Epoch 4/10:
Train Loss: 1.9749 | Val Loss: 1.8816


Training: 100%|██████████| 50/50 [00:08<00:00,  6.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.81it/s]


Epoch 5/10:
Train Loss: 1.8283 | Val Loss: 1.6852


Training: 100%|██████████| 50/50 [00:08<00:00,  6.24it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.78it/s]


Epoch 6/10:
Train Loss: 1.6562 | Val Loss: 1.5211


Training: 100%|██████████| 50/50 [00:08<00:00,  6.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.75it/s]


Epoch 7/10:
Train Loss: 1.4794 | Val Loss: 1.2928


Training: 100%|██████████| 50/50 [00:08<00:00,  6.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.65it/s]


Epoch 8/10:
Train Loss: 1.2640 | Val Loss: 1.0744


Training: 100%|██████████| 50/50 [00:08<00:00,  6.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.37it/s]


Epoch 9/10:
Train Loss: 1.0610 | Val Loss: 0.8584


Training: 100%|██████████| 50/50 [00:08<00:00,  6.20it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.75it/s]


Epoch 10/10:
Train Loss: 0.8771 | Val Loss: 0.6785


[I 2025-05-09 13:00:33,925] Trial 13 finished with value: 0.6785048544406891 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.21538419301800565, 'learning_rate': 0.00010002571242999882, 'batch_size': 32}. Best is trial 13 with value: 0.6785048544406891.


New best model found! Val Loss: 0.6785
Config: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.21538419301800565, 'learning_rate': 0.00010002571242999882, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.98it/s]


Epoch 1/10:
Train Loss: 2.7734 | Val Loss: 2.2780


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.04it/s]


Epoch 2/10:
Train Loss: 2.2359 | Val Loss: 2.1252


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.97it/s]


Epoch 3/10:
Train Loss: 2.1154 | Val Loss: 2.0902


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.04it/s]


Epoch 4/10:
Train Loss: 2.0177 | Val Loss: 1.9704


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.06it/s]


Epoch 5/10:
Train Loss: 1.9087 | Val Loss: 1.8195


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.07it/s]


Epoch 6/10:
Train Loss: 1.7728 | Val Loss: 1.6570


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.06it/s]


Epoch 7/10:
Train Loss: 1.6037 | Val Loss: 1.4796


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.04it/s]


Epoch 8/10:
Train Loss: 1.4159 | Val Loss: 1.2529


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.04it/s]


Epoch 9/10:
Train Loss: 1.2159 | Val Loss: 1.1115


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.00it/s]
[I 2025-05-09 13:02:30,877] Trial 14 finished with value: 0.904225794168619 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.21021201533455733, 'learning_rate': 0.00010768505311220902, 'batch_size': 32}. Best is trial 13 with value: 0.6785048544406891.


Epoch 10/10:
Train Loss: 1.0145 | Val Loss: 0.9042


Training: 100%|██████████| 50/50 [00:05<00:00,  9.21it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.74it/s]


Epoch 1/10:
Train Loss: 3.1011 | Val Loss: 2.9688


Training: 100%|██████████| 50/50 [00:05<00:00,  9.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.73it/s]


Epoch 2/10:
Train Loss: 2.9930 | Val Loss: 2.9711


Training: 100%|██████████| 50/50 [00:05<00:00,  9.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.87it/s]


Epoch 3/10:
Train Loss: 2.9925 | Val Loss: 2.9723


Training: 100%|██████████| 50/50 [00:05<00:00,  9.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.63it/s]


Epoch 4/10:
Train Loss: 2.9914 | Val Loss: 2.9678


Training: 100%|██████████| 50/50 [00:05<00:00,  9.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.68it/s]


Epoch 5/10:
Train Loss: 2.9859 | Val Loss: 2.9788


Training: 100%|██████████| 50/50 [00:05<00:00,  9.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.64it/s]


Epoch 6/10:
Train Loss: 2.9788 | Val Loss: 3.0479


Training: 100%|██████████| 50/50 [00:05<00:00,  9.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.49it/s]
[I 2025-05-09 13:03:11,803] Trial 15 finished with value: 2.9678082832923303 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.16888796679591075, 'learning_rate': 0.000929406569870346, 'batch_size': 32}. Best is trial 13 with value: 0.6785048544406891.


Epoch 7/10:
Train Loss: 2.9717 | Val Loss: 3.0835
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:05<00:00,  8.78it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.93it/s]


Epoch 1/10:
Train Loss: 2.7718 | Val Loss: 2.2915


Training: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.93it/s]


Epoch 2/10:
Train Loss: 2.2044 | Val Loss: 2.0421


Training: 100%|██████████| 50/50 [00:05<00:00,  8.81it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.98it/s]


Epoch 3/10:
Train Loss: 1.9240 | Val Loss: 1.6922


Training: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.87it/s]


Epoch 4/10:
Train Loss: 1.5603 | Val Loss: 1.2884


Training: 100%|██████████| 50/50 [00:05<00:00,  8.81it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.00it/s]


Epoch 5/10:
Train Loss: 1.1512 | Val Loss: 0.8904


Training: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.84it/s]


Epoch 6/10:
Train Loss: 0.8088 | Val Loss: 0.5535


Training: 100%|██████████| 50/50 [00:05<00:00,  8.80it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.90it/s]


Epoch 7/10:
Train Loss: 0.5846 | Val Loss: 0.4389


Training: 100%|██████████| 50/50 [00:05<00:00,  8.78it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.98it/s]


Epoch 8/10:
Train Loss: 0.4623 | Val Loss: 0.3645


Training: 100%|██████████| 50/50 [00:05<00:00,  8.81it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.94it/s]


Epoch 9/10:
Train Loss: 0.3696 | Val Loss: 0.3194


Training: 100%|██████████| 50/50 [00:05<00:00,  8.82it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.91it/s]


Epoch 10/10:
Train Loss: 0.3098 | Val Loss: 0.2741


[I 2025-05-09 13:04:14,230] Trial 16 finished with value: 0.2741393790795253 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.21983729110648678, 'learning_rate': 0.0003134263502032441, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


New best model found! Val Loss: 0.2741
Config: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.21983729110648678, 'learning_rate': 0.0003134263502032441, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:05<00:00,  8.83it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.09it/s]


Epoch 1/10:
Train Loss: 2.7226 | Val Loss: 2.2841


Training: 100%|██████████| 50/50 [00:05<00:00,  8.78it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.87it/s]


Epoch 2/10:
Train Loss: 2.2074 | Val Loss: 2.0932


Training: 100%|██████████| 50/50 [00:05<00:00,  8.81it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.09it/s]


Epoch 3/10:
Train Loss: 2.0286 | Val Loss: 1.8942


Training: 100%|██████████| 50/50 [00:05<00:00,  8.78it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.57it/s]


Epoch 4/10:
Train Loss: 1.8192 | Val Loss: 1.6648


Training: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.01it/s]


Epoch 5/10:
Train Loss: 1.5954 | Val Loss: 1.3236


Training: 100%|██████████| 50/50 [00:05<00:00,  8.80it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.60it/s]


Epoch 6/10:
Train Loss: 1.3073 | Val Loss: 1.0255


Training: 100%|██████████| 50/50 [00:05<00:00,  8.81it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.05it/s]


Epoch 7/10:
Train Loss: 1.0065 | Val Loss: 0.7840


Training: 100%|██████████| 50/50 [00:05<00:00,  8.81it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.01it/s]


Epoch 8/10:
Train Loss: 0.7750 | Val Loss: 0.6539


Training: 100%|██████████| 50/50 [00:05<00:00,  8.77it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.97it/s]


Epoch 9/10:
Train Loss: 0.6084 | Val Loss: 0.4676


Training: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.04it/s]
[I 2025-05-09 13:05:16,381] Trial 17 finished with value: 0.41015340043948245 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2290676562795207, 'learning_rate': 0.00016253310330070967, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.4850 | Val Loss: 0.4102


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 1/10:
Train Loss: 2.7879 | Val Loss: 2.2542


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]


Epoch 2/10:
Train Loss: 2.1993 | Val Loss: 2.0718


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.80it/s]


Epoch 3/10:
Train Loss: 2.0136 | Val Loss: 1.8740


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.64it/s]


Epoch 4/10:
Train Loss: 1.7591 | Val Loss: 1.5603


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]


Epoch 5/10:
Train Loss: 1.4425 | Val Loss: 1.2032


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 6/10:
Train Loss: 1.1101 | Val Loss: 0.8654


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.89it/s]


Epoch 7/10:
Train Loss: 0.8303 | Val Loss: 0.6892


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.82it/s]


Epoch 8/10:
Train Loss: 0.6436 | Val Loss: 0.5216


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s]


Epoch 9/10:
Train Loss: 0.5120 | Val Loss: 0.4971


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.69it/s]
[I 2025-05-09 13:06:30,254] Trial 18 finished with value: 0.40738963851561916 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.24465313967553773, 'learning_rate': 0.00030669603190078043, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.4248 | Val Loss: 0.4074


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 1/10:
Train Loss: 2.8122 | Val Loss: 2.2547


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 2/10:
Train Loss: 2.2035 | Val Loss: 2.0652


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.92it/s]


Epoch 3/10:
Train Loss: 2.0057 | Val Loss: 1.8591


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.68it/s]


Epoch 4/10:
Train Loss: 1.7515 | Val Loss: 1.5659


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 5/10:
Train Loss: 1.4372 | Val Loss: 1.1804


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]


Epoch 6/10:
Train Loss: 1.1013 | Val Loss: 0.8963


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]


Epoch 7/10:
Train Loss: 0.8326 | Val Loss: 0.6640


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.70it/s]


Epoch 8/10:
Train Loss: 0.6565 | Val Loss: 0.6177


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.66it/s]


Epoch 9/10:
Train Loss: 0.5283 | Val Loss: 0.5138


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.66it/s]
[I 2025-05-09 13:07:44,108] Trial 19 finished with value: 0.4216865576230563 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.24526870715848578, 'learning_rate': 0.00031926998067436605, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.4365 | Val Loss: 0.4217


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.89it/s]


Epoch 1/10:
Train Loss: 3.1185 | Val Loss: 2.9698


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]


Epoch 2/10:
Train Loss: 3.0034 | Val Loss: 2.9758


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.64it/s]


Epoch 3/10:
Train Loss: 3.0009 | Val Loss: 2.9733


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.70it/s]
[I 2025-05-09 13:08:13,788] Trial 20 finished with value: 2.96981719823984 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.28779187240516557, 'learning_rate': 0.0008865815229900904, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 4/10:
Train Loss: 2.9957 | Val Loss: 2.9702
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 1/10:
Train Loss: 2.7242 | Val Loss: 2.2484


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.88it/s]


Epoch 2/10:
Train Loss: 2.2078 | Val Loss: 2.0990


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s]


Epoch 3/10:
Train Loss: 2.0786 | Val Loss: 2.0301


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]


Epoch 4/10:
Train Loss: 1.9670 | Val Loss: 1.9134


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.63it/s]


Epoch 5/10:
Train Loss: 1.8254 | Val Loss: 1.7364


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.66it/s]


Epoch 6/10:
Train Loss: 1.6550 | Val Loss: 1.4875


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.91it/s]


Epoch 7/10:
Train Loss: 1.4402 | Val Loss: 1.2406


Training: 100%|██████████| 50/50 [00:06<00:00,  7.36it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]


Epoch 8/10:
Train Loss: 1.1910 | Val Loss: 1.0329


Training: 100%|██████████| 50/50 [00:06<00:00,  7.36it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.86it/s]


Epoch 9/10:
Train Loss: 0.9644 | Val Loss: 0.7663


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]
[I 2025-05-09 13:09:27,761] Trial 21 finished with value: 0.620722390138186 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.23579015412381102, 'learning_rate': 0.00015997710306920384, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.7810 | Val Loss: 0.6207


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]


Epoch 1/10:
Train Loss: 2.7679 | Val Loss: 2.2200


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 2/10:
Train Loss: 2.1725 | Val Loss: 2.0384


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 3/10:
Train Loss: 1.9385 | Val Loss: 1.7834


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 4/10:
Train Loss: 1.6192 | Val Loss: 1.3734


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 5/10:
Train Loss: 1.2183 | Val Loss: 0.9940


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.67it/s]


Epoch 6/10:
Train Loss: 0.8596 | Val Loss: 0.6656


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.80it/s]


Epoch 7/10:
Train Loss: 0.6271 | Val Loss: 0.5527


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.82it/s]


Epoch 8/10:
Train Loss: 0.5005 | Val Loss: 0.5484


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.66it/s]


Epoch 9/10:
Train Loss: 0.4093 | Val Loss: 0.3853


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.87it/s]
[I 2025-05-09 13:10:41,701] Trial 22 finished with value: 0.3481279027003508 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.19059969660014736, 'learning_rate': 0.0003202338795433898, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3268 | Val Loss: 0.3481


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s]


Epoch 1/10:
Train Loss: 2.7732 | Val Loss: 2.2741


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.84it/s]


Epoch 2/10:
Train Loss: 2.1787 | Val Loss: 2.0610


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.58it/s]


Epoch 3/10:
Train Loss: 1.9542 | Val Loss: 1.8239


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 4/10:
Train Loss: 1.6376 | Val Loss: 1.4116


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 5/10:
Train Loss: 1.2352 | Val Loss: 1.0029


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.51it/s]


Epoch 6/10:
Train Loss: 0.8741 | Val Loss: 0.7632


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.88it/s]


Epoch 7/10:
Train Loss: 0.6477 | Val Loss: 0.5502


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 8/10:
Train Loss: 0.4834 | Val Loss: 0.4922


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 9/10:
Train Loss: 0.3967 | Val Loss: 0.3670


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]
[I 2025-05-09 13:11:55,619] Trial 23 finished with value: 0.34894911371744597 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.18634869063331058, 'learning_rate': 0.0003161862700508268, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3220 | Val Loss: 0.3489


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 1/10:
Train Loss: 2.8631 | Val Loss: 2.2778


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]


Epoch 2/10:
Train Loss: 2.2177 | Val Loss: 2.0905


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 3/10:
Train Loss: 2.0036 | Val Loss: 1.8541


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.77it/s]


Epoch 4/10:
Train Loss: 1.7105 | Val Loss: 1.5371


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 5/10:
Train Loss: 1.3288 | Val Loss: 1.0509


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.80it/s]


Epoch 6/10:
Train Loss: 0.9562 | Val Loss: 0.7786


Training: 100%|██████████| 50/50 [00:06<00:00,  7.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.86it/s]


Epoch 7/10:
Train Loss: 0.6789 | Val Loss: 0.5913


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 8/10:
Train Loss: 0.5248 | Val Loss: 0.4992


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]


Epoch 9/10:
Train Loss: 0.4155 | Val Loss: 0.4309


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]
[I 2025-05-09 13:13:09,541] Trial 24 finished with value: 0.3896801953132336 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.18676627636028117, 'learning_rate': 0.0003371878464164983, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3587 | Val Loss: 0.3897


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 1/10:
Train Loss: 3.1111 | Val Loss: 2.9724


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.62it/s]


Epoch 2/10:
Train Loss: 3.0032 | Val Loss: 2.9692


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.88it/s]


Epoch 3/10:
Train Loss: 2.9994 | Val Loss: 2.9723


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.90it/s]


Epoch 4/10:
Train Loss: 2.9925 | Val Loss: 2.9702


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.80it/s]


Epoch 5/10:
Train Loss: 2.9921 | Val Loss: 2.9654


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.70it/s]


Epoch 6/10:
Train Loss: 2.9917 | Val Loss: 2.9711


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.52it/s]


Epoch 7/10:
Train Loss: 2.9883 | Val Loss: 2.9709


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]
[I 2025-05-09 13:14:08,759] Trial 25 finished with value: 2.965437357242291 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1919073440950927, 'learning_rate': 0.0007054805454438007, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 8/10:
Train Loss: 2.9864 | Val Loss: 2.9664
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:03<00:00, 13.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.03it/s]


Epoch 1/10:
Train Loss: 2.8288 | Val Loss: 2.3728


Training: 100%|██████████| 50/50 [00:03<00:00, 13.63it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.08it/s]


Epoch 2/10:
Train Loss: 2.3490 | Val Loss: 2.2324


Training: 100%|██████████| 50/50 [00:03<00:00, 13.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.35it/s]


Epoch 3/10:
Train Loss: 2.2372 | Val Loss: 2.1308


Training: 100%|██████████| 50/50 [00:03<00:00, 13.65it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.82it/s]


Epoch 4/10:
Train Loss: 2.1466 | Val Loss: 2.0693


Training: 100%|██████████| 50/50 [00:03<00:00, 13.61it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.37it/s]


Epoch 5/10:
Train Loss: 2.0758 | Val Loss: 2.0286


Training: 100%|██████████| 50/50 [00:03<00:00, 13.68it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.22it/s]


Epoch 6/10:
Train Loss: 2.0008 | Val Loss: 1.9477


Training: 100%|██████████| 50/50 [00:03<00:00, 13.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.02it/s]


Epoch 7/10:
Train Loss: 1.9302 | Val Loss: 1.8625


Training: 100%|██████████| 50/50 [00:03<00:00, 13.68it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.52it/s]


Epoch 8/10:
Train Loss: 1.8454 | Val Loss: 1.8031


Training: 100%|██████████| 50/50 [00:03<00:00, 13.69it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.51it/s]


Epoch 9/10:
Train Loss: 1.7565 | Val Loss: 1.7064


Training: 100%|██████████| 50/50 [00:03<00:00, 13.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.72it/s]
[I 2025-05-09 13:14:49,006] Trial 26 finished with value: 1.5617558497648973 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.13756776828201078, 'learning_rate': 0.00044118606270243606, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.6452 | Val Loss: 1.5618


Training: 100%|██████████| 50/50 [00:11<00:00,  4.46it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.62it/s]


Epoch 1/10:
Train Loss: 3.1803 | Val Loss: 2.9792


Training: 100%|██████████| 50/50 [00:10<00:00,  4.55it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.65it/s]


Epoch 2/10:
Train Loss: 3.0047 | Val Loss: 2.9741


Training: 100%|██████████| 50/50 [00:11<00:00,  4.54it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.62it/s]


Epoch 3/10:
Train Loss: 2.9993 | Val Loss: 2.9776


Training: 100%|██████████| 50/50 [00:11<00:00,  4.54it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.59it/s]


Epoch 4/10:
Train Loss: 2.9952 | Val Loss: 2.9746


Training: 100%|██████████| 50/50 [00:11<00:00,  4.54it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.60it/s]


Epoch 5/10:
Train Loss: 2.9922 | Val Loss: 2.9721


Training: 100%|██████████| 50/50 [00:11<00:00,  4.54it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.60it/s]


Epoch 6/10:
Train Loss: 2.9895 | Val Loss: 2.9722


Training: 100%|██████████| 50/50 [00:11<00:00,  4.54it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.63it/s]


Epoch 7/10:
Train Loss: 2.9857 | Val Loss: 2.9652


Training: 100%|██████████| 50/50 [00:10<00:00,  4.55it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.63it/s]


Epoch 8/10:
Train Loss: 2.9850 | Val Loss: 2.9669


Training: 100%|██████████| 50/50 [00:10<00:00,  4.55it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.65it/s]


Epoch 9/10:
Train Loss: 2.9827 | Val Loss: 2.9685


Training: 100%|██████████| 50/50 [00:11<00:00,  4.54it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.61it/s]
[I 2025-05-09 13:16:49,291] Trial 27 finished with value: 2.965205889481765 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.1899092686600321, 'learning_rate': 0.0018200801894870057, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 2.9805 | Val Loss: 2.9667
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:16<00:00,  2.99it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.11it/s]


Epoch 1/10:
Train Loss: 3.1107 | Val Loss: 2.9701


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.10it/s]


Epoch 2/10:
Train Loss: 3.0024 | Val Loss: 2.9485


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.11it/s]


Epoch 3/10:
Train Loss: 2.9680 | Val Loss: 2.9404


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.10it/s]


Epoch 4/10:
Train Loss: 2.9600 | Val Loss: 2.9379


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.09it/s]


Epoch 5/10:
Train Loss: 2.7877 | Val Loss: 2.4126


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.09it/s]


Epoch 6/10:
Train Loss: 2.3295 | Val Loss: 2.2159


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.07it/s]


Epoch 7/10:
Train Loss: 2.1847 | Val Loss: 2.1289


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.08it/s]


Epoch 8/10:
Train Loss: 2.0693 | Val Loss: 2.0487


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.03it/s]


Epoch 9/10:
Train Loss: 1.9557 | Val Loss: 1.9461


Training: 100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.11it/s]
[I 2025-05-09 13:19:50,969] Trial 28 finished with value: 1.8985285208775446 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.14378444600544033, 'learning_rate': 0.00025076519171831637, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.8619 | Val Loss: 1.8985


Training: 100%|██████████| 50/50 [00:05<00:00,  8.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.74it/s]


Epoch 1/10:
Train Loss: 2.7297 | Val Loss: 2.2505


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.56it/s]


Epoch 2/10:
Train Loss: 2.1819 | Val Loss: 2.1144


Training: 100%|██████████| 50/50 [00:05<00:00,  8.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.75it/s]


Epoch 3/10:
Train Loss: 2.0256 | Val Loss: 1.9030


Training: 100%|██████████| 50/50 [00:05<00:00,  8.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.62it/s]


Epoch 4/10:
Train Loss: 1.8291 | Val Loss: 1.6706


Training: 100%|██████████| 50/50 [00:05<00:00,  8.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.70it/s]


Epoch 5/10:
Train Loss: 1.5927 | Val Loss: 1.3645


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.74it/s]


Epoch 6/10:
Train Loss: 1.2984 | Val Loss: 1.0514


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.66it/s]


Epoch 7/10:
Train Loss: 1.0005 | Val Loss: 0.7383


Training: 100%|██████████| 50/50 [00:05<00:00,  8.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.69it/s]


Epoch 8/10:
Train Loss: 0.7820 | Val Loss: 0.5917


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.59it/s]


Epoch 9/10:
Train Loss: 0.5896 | Val Loss: 0.4818


Training: 100%|██████████| 50/50 [00:05<00:00,  8.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.67it/s]
[I 2025-05-09 13:20:56,051] Trial 29 finished with value: 0.3955779064160127 and parameters: {'d_model': 512, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.20501626959769023, 'learning_rate': 0.0001724118701794731, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.4705 | Val Loss: 0.3956


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.11it/s]


Epoch 1/10:
Train Loss: 3.1473 | Val Loss: 3.0007


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.27it/s]


Epoch 2/10:
Train Loss: 3.0026 | Val Loss: 2.9798


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.19it/s]


Epoch 3/10:
Train Loss: 3.0001 | Val Loss: 2.9677


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.20it/s]


Epoch 4/10:
Train Loss: 2.9955 | Val Loss: 2.9738


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.10it/s]


Epoch 5/10:
Train Loss: 2.9880 | Val Loss: 2.9735


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.15it/s]
[I 2025-05-09 13:21:49,533] Trial 30 finished with value: 2.967732337804941 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.1099081410562688, 'learning_rate': 0.0013039849897736443, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 6/10:
Train Loss: 2.9901 | Val Loss: 2.9695
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 1/10:
Train Loss: 2.7890 | Val Loss: 2.2806


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 2/10:
Train Loss: 2.1843 | Val Loss: 2.0592


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 3/10:
Train Loss: 1.9540 | Val Loss: 1.8094


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]


Epoch 4/10:
Train Loss: 1.6351 | Val Loss: 1.3465


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.69it/s]


Epoch 5/10:
Train Loss: 1.2435 | Val Loss: 0.9936


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.77it/s]


Epoch 6/10:
Train Loss: 0.8810 | Val Loss: 0.8958


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 7/10:
Train Loss: 0.6548 | Val Loss: 0.5267


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]


Epoch 8/10:
Train Loss: 0.4890 | Val Loss: 0.4470


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.80it/s]


Epoch 9/10:
Train Loss: 0.3972 | Val Loss: 0.3732


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.80it/s]
[I 2025-05-09 13:23:03,427] Trial 31 finished with value: 0.32912770830667937 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.20191960359161437, 'learning_rate': 0.00035590688070474217, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3190 | Val Loss: 0.3291


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.75it/s]


Epoch 1/10:
Train Loss: 2.7518 | Val Loss: 2.2426


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]


Epoch 2/10:
Train Loss: 2.1762 | Val Loss: 2.0635


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.63it/s]


Epoch 3/10:
Train Loss: 1.9994 | Val Loss: 1.8663


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.84it/s]


Epoch 4/10:
Train Loss: 1.7910 | Val Loss: 1.6698


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 5/10:
Train Loss: 1.5296 | Val Loss: 1.3619


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.85it/s]


Epoch 6/10:
Train Loss: 1.2230 | Val Loss: 0.9960


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 7/10:
Train Loss: 0.9375 | Val Loss: 0.7738


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 8/10:
Train Loss: 0.7342 | Val Loss: 0.5986


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]


Epoch 9/10:
Train Loss: 0.5880 | Val Loss: 0.5410


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s]
[I 2025-05-09 13:24:17,352] Trial 32 finished with value: 0.4617319909425882 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2616485972263927, 'learning_rate': 0.0002825213841488526, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.4923 | Val Loss: 0.4617


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.51it/s]


Epoch 1/10:
Train Loss: 2.8586 | Val Loss: 2.2940


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.84it/s]


Epoch 2/10:
Train Loss: 2.2362 | Val Loss: 2.1177


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.85it/s]


Epoch 3/10:
Train Loss: 2.0461 | Val Loss: 1.9088


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s]


Epoch 4/10:
Train Loss: 1.7348 | Val Loss: 1.5562


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.77it/s]


Epoch 5/10:
Train Loss: 1.3600 | Val Loss: 1.0958


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.82it/s]


Epoch 6/10:
Train Loss: 0.9730 | Val Loss: 0.7851


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 7/10:
Train Loss: 0.7105 | Val Loss: 0.6057


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.65it/s]


Epoch 8/10:
Train Loss: 0.5569 | Val Loss: 0.5363


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]


Epoch 9/10:
Train Loss: 0.4531 | Val Loss: 0.4797


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]
[I 2025-05-09 13:25:31,253] Trial 33 finished with value: 0.43362253445845383 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.18727753999654015, 'learning_rate': 0.00038238713405214177, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3874 | Val Loss: 0.4336


Training: 100%|██████████| 50/50 [00:05<00:00,  8.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.55it/s]


Epoch 1/10:
Train Loss: 3.1239 | Val Loss: 2.9745


Training: 100%|██████████| 50/50 [00:05<00:00,  8.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.66it/s]


Epoch 2/10:
Train Loss: 3.0007 | Val Loss: 2.9684


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.50it/s]


Epoch 3/10:
Train Loss: 3.0006 | Val Loss: 2.9768


Training: 100%|██████████| 50/50 [00:05<00:00,  8.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.62it/s]


Epoch 4/10:
Train Loss: 2.9952 | Val Loss: 2.9734


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.45it/s]
[I 2025-05-09 13:26:03,910] Trial 34 finished with value: 2.9683692088493934 and parameters: {'d_model': 512, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.201902965236205, 'learning_rate': 0.0006632569823831755, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 5/10:
Train Loss: 3.1024 | Val Loss: 3.4560
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:06<00:00,  8.05it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.46it/s]


Epoch 1/10:
Train Loss: 3.2885 | Val Loss: 2.6500


Training: 100%|██████████| 50/50 [00:06<00:00,  8.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.70it/s]


Epoch 2/10:
Train Loss: 2.5361 | Val Loss: 2.3382


Training: 100%|██████████| 50/50 [00:06<00:00,  8.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.70it/s]


Epoch 3/10:
Train Loss: 2.3428 | Val Loss: 2.2302


Training: 100%|██████████| 50/50 [00:06<00:00,  8.07it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.76it/s]


Epoch 4/10:
Train Loss: 2.2432 | Val Loss: 2.1360


Training: 100%|██████████| 50/50 [00:06<00:00,  8.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.85it/s]


Epoch 5/10:
Train Loss: 2.1731 | Val Loss: 2.0874


Training: 100%|██████████| 50/50 [00:06<00:00,  7.82it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.83it/s]


Epoch 6/10:
Train Loss: 2.1174 | Val Loss: 2.0638


Training: 100%|██████████| 50/50 [00:06<00:00,  8.07it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.75it/s]


Epoch 7/10:
Train Loss: 2.0693 | Val Loss: 2.0052


Training: 100%|██████████| 50/50 [00:06<00:00,  8.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.67it/s]


Epoch 8/10:
Train Loss: 2.0278 | Val Loss: 1.9614


Training: 100%|██████████| 50/50 [00:06<00:00,  8.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.55it/s]


Epoch 9/10:
Train Loss: 1.9884 | Val Loss: 1.9598


Training: 100%|██████████| 50/50 [00:06<00:00,  8.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 22.86it/s]
[I 2025-05-09 13:27:11,945] Trial 35 finished with value: 1.9063926751797016 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.1684646591020423, 'learning_rate': 0.00021114124869728009, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.9532 | Val Loss: 1.9064


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.75it/s]


Epoch 1/10:
Train Loss: 2.8157 | Val Loss: 2.2974


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.85it/s]


Epoch 2/10:
Train Loss: 2.2567 | Val Loss: 2.1407


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]


Epoch 3/10:
Train Loss: 2.1296 | Val Loss: 2.0827


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.66it/s]


Epoch 4/10:
Train Loss: 2.0425 | Val Loss: 1.9766


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.67it/s]


Epoch 5/10:
Train Loss: 1.9395 | Val Loss: 1.8426


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.87it/s]


Epoch 6/10:
Train Loss: 1.8099 | Val Loss: 1.6935


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.62it/s]


Epoch 7/10:
Train Loss: 1.6716 | Val Loss: 1.5376


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 8/10:
Train Loss: 1.5211 | Val Loss: 1.3445


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.88it/s]


Epoch 9/10:
Train Loss: 1.3340 | Val Loss: 1.1549


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.89it/s]
[I 2025-05-09 13:28:25,866] Trial 36 finished with value: 0.9670928808359 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.29792506640641614, 'learning_rate': 0.00014966218417990584, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.1649 | Val Loss: 0.9671


Training: 100%|██████████| 50/50 [00:05<00:00,  8.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.81it/s]


Epoch 1/10:
Train Loss: 3.1057 | Val Loss: 2.9722


Training: 100%|██████████| 50/50 [00:05<00:00,  8.78it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.85it/s]


Epoch 2/10:
Train Loss: 3.0008 | Val Loss: 2.9774


Training: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.92it/s]


Epoch 3/10:
Train Loss: 2.9994 | Val Loss: 2.9744


Training: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.77it/s]


Epoch 4/10:
Train Loss: 2.8577 | Val Loss: 2.6685


Training: 100%|██████████| 50/50 [00:05<00:00,  8.76it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.91it/s]


Epoch 5/10:
Train Loss: 2.7842 | Val Loss: 2.9974


Training: 100%|██████████| 50/50 [00:05<00:00,  8.80it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.95it/s]


Epoch 6/10:
Train Loss: 2.9884 | Val Loss: 2.9664


Training: 100%|██████████| 50/50 [00:05<00:00,  8.78it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.99it/s]
[I 2025-05-09 13:29:09,550] Trial 37 finished with value: 2.6684791308182936 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2617719889327361, 'learning_rate': 0.000561248945456702, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 7/10:
Train Loss: 2.9885 | Val Loss: 2.9643
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.55it/s]


Epoch 1/10:
Train Loss: 3.1218 | Val Loss: 2.9835


Training: 100%|██████████| 50/50 [00:09<00:00,  5.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.53it/s]


Epoch 2/10:
Train Loss: 2.9962 | Val Loss: 2.9764


Training: 100%|██████████| 50/50 [00:09<00:00,  5.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.29it/s]


Epoch 3/10:
Train Loss: 2.9932 | Val Loss: 2.9695


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.55it/s]


Epoch 4/10:
Train Loss: 2.9927 | Val Loss: 2.9677


Training: 100%|██████████| 50/50 [00:09<00:00,  5.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.53it/s]


Epoch 5/10:
Train Loss: 2.9906 | Val Loss: 2.9665


Training: 100%|██████████| 50/50 [00:09<00:00,  5.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.45it/s]


Epoch 6/10:
Train Loss: 2.9895 | Val Loss: 2.9652


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.55it/s]


Epoch 7/10:
Train Loss: 2.9874 | Val Loss: 2.9680


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.56it/s]


Epoch 8/10:
Train Loss: 2.9883 | Val Loss: 2.9693


Training: 100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.53it/s]
[I 2025-05-09 13:30:44,514] Trial 38 finished with value: 2.9652136655954213 and parameters: {'d_model': 512, 'num_heads': 4, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.150538531113119, 'learning_rate': 0.0004507026349570243, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 9/10:
Train Loss: 2.9752 | Val Loss: 3.0493
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.59it/s]


Epoch 1/10:
Train Loss: 2.9966 | Val Loss: 2.4352


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.53it/s]


Epoch 2/10:
Train Loss: 2.3690 | Val Loss: 2.2111


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.55it/s]


Epoch 3/10:
Train Loss: 2.2085 | Val Loss: 2.1056


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.57it/s]


Epoch 4/10:
Train Loss: 2.1105 | Val Loss: 2.0436


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.60it/s]


Epoch 5/10:
Train Loss: 2.0369 | Val Loss: 1.9809


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.50it/s]


Epoch 6/10:
Train Loss: 1.9749 | Val Loss: 1.9607


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.60it/s]


Epoch 7/10:
Train Loss: 1.9248 | Val Loss: 1.8972


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.55it/s]


Epoch 8/10:
Train Loss: 1.8702 | Val Loss: 1.8732


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.59it/s]


Epoch 9/10:
Train Loss: 1.8081 | Val Loss: 1.8244


Training: 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.46it/s]
[I 2025-05-09 13:33:18,548] Trial 39 finished with value: 1.7321841166569636 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.12568852211812118, 'learning_rate': 0.00013583701089204473, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.7377 | Val Loss: 1.7322


Training: 100%|██████████| 50/50 [00:03<00:00, 13.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.32it/s]


Epoch 1/10:
Train Loss: 3.1258 | Val Loss: 2.5183


Training: 100%|██████████| 50/50 [00:03<00:00, 13.68it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.55it/s]


Epoch 2/10:
Train Loss: 2.4701 | Val Loss: 2.3151


Training: 100%|██████████| 50/50 [00:03<00:00, 13.64it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.65it/s]


Epoch 3/10:
Train Loss: 2.3192 | Val Loss: 2.2030


Training: 100%|██████████| 50/50 [00:03<00:00, 13.62it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.24it/s]


Epoch 4/10:
Train Loss: 2.2311 | Val Loss: 2.1339


Training: 100%|██████████| 50/50 [00:03<00:00, 13.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.50it/s]


Epoch 5/10:
Train Loss: 2.1600 | Val Loss: 2.0840


Training: 100%|██████████| 50/50 [00:03<00:00, 13.57it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.24it/s]


Epoch 6/10:
Train Loss: 2.1139 | Val Loss: 2.0376


Training: 100%|██████████| 50/50 [00:03<00:00, 13.64it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.12it/s]


Epoch 7/10:
Train Loss: 2.0649 | Val Loss: 2.0045


Training: 100%|██████████| 50/50 [00:03<00:00, 13.63it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.14it/s]


Epoch 8/10:
Train Loss: 2.0292 | Val Loss: 1.9626


Training: 100%|██████████| 50/50 [00:03<00:00, 13.60it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.99it/s]


Epoch 9/10:
Train Loss: 1.9786 | Val Loss: 1.8964


Training: 100%|██████████| 50/50 [00:03<00:00, 13.65it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.87it/s]
[I 2025-05-09 13:33:58,869] Trial 40 finished with value: 1.8489121473752534 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.22720702690905337, 'learning_rate': 0.0002548774803554707, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.9204 | Val Loss: 1.8489


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.75it/s]


Epoch 1/10:
Train Loss: 2.7921 | Val Loss: 2.2515


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.87it/s]


Epoch 2/10:
Train Loss: 2.1884 | Val Loss: 2.0717


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 3/10:
Train Loss: 1.9713 | Val Loss: 1.8443


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 4/10:
Train Loss: 1.6525 | Val Loss: 1.4132


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]


Epoch 5/10:
Train Loss: 1.2434 | Val Loss: 0.9700


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 6/10:
Train Loss: 0.8600 | Val Loss: 0.6846


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.64it/s]


Epoch 7/10:
Train Loss: 0.6055 | Val Loss: 0.5420


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 8/10:
Train Loss: 0.4776 | Val Loss: 0.4426


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]


Epoch 9/10:
Train Loss: 0.3835 | Val Loss: 0.3580


Training: 100%|██████████| 50/50 [00:06<00:00,  7.36it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.64it/s]
[I 2025-05-09 13:35:12,816] Trial 41 finished with value: 0.3514665078658324 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.18285765384914693, 'learning_rate': 0.000316954535547952, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3090 | Val Loss: 0.3515


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 1/10:
Train Loss: 2.7109 | Val Loss: 2.2057


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.70it/s]


Epoch 2/10:
Train Loss: 2.1665 | Val Loss: 2.0531


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s]


Epoch 3/10:
Train Loss: 1.9954 | Val Loss: 1.8959


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.67it/s]


Epoch 4/10:
Train Loss: 1.7856 | Val Loss: 1.6682


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.75it/s]


Epoch 5/10:
Train Loss: 1.5263 | Val Loss: 1.3254


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]


Epoch 6/10:
Train Loss: 1.2056 | Val Loss: 0.9685


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.85it/s]


Epoch 7/10:
Train Loss: 0.8913 | Val Loss: 0.7892


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s]


Epoch 8/10:
Train Loss: 0.6672 | Val Loss: 0.5339


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 9/10:
Train Loss: 0.4982 | Val Loss: 0.4292


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.72it/s]
[I 2025-05-09 13:36:26,741] Trial 42 finished with value: 0.3593586190388753 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.17748903069367078, 'learning_rate': 0.00019784183499008785, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3835 | Val Loss: 0.3594


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.75it/s]


Epoch 1/10:
Train Loss: 2.8514 | Val Loss: 2.3257


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 2/10:
Train Loss: 2.2280 | Val Loss: 2.0959


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.75it/s]


Epoch 3/10:
Train Loss: 1.9981 | Val Loss: 1.8289


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 4/10:
Train Loss: 1.6309 | Val Loss: 1.4106


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.82it/s]


Epoch 5/10:
Train Loss: 1.2081 | Val Loss: 1.0225


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.68it/s]


Epoch 6/10:
Train Loss: 0.8291 | Val Loss: 0.7422


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.73it/s]


Epoch 7/10:
Train Loss: 0.5924 | Val Loss: 0.5166


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 8/10:
Train Loss: 0.4663 | Val Loss: 0.4387


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]


Epoch 9/10:
Train Loss: 0.3628 | Val Loss: 0.4137


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.76it/s]
[I 2025-05-09 13:37:40,721] Trial 43 finished with value: 0.3670113877608226 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.15717338226784736, 'learning_rate': 0.00036289756150266163, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.3171 | Val Loss: 0.3670


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.84it/s]


Epoch 1/10:
Train Loss: 3.1100 | Val Loss: 2.9701


Training: 100%|██████████| 50/50 [00:06<00:00,  7.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.69it/s]


Epoch 2/10:
Train Loss: 3.0019 | Val Loss: 2.9708


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 3/10:
Train Loss: 2.9987 | Val Loss: 2.9726


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.85it/s]


Epoch 4/10:
Train Loss: 2.9916 | Val Loss: 2.9668


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]


Epoch 5/10:
Train Loss: 2.9912 | Val Loss: 2.9692


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.80it/s]


Epoch 6/10:
Train Loss: 3.0755 | Val Loss: 2.9707


Training: 100%|██████████| 50/50 [00:06<00:00,  7.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.68it/s]


Epoch 7/10:
Train Loss: 2.9889 | Val Loss: 2.9660


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.78it/s]


Epoch 8/10:
Train Loss: 2.9866 | Val Loss: 2.9704


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.57it/s]


Epoch 9/10:
Train Loss: 2.9822 | Val Loss: 3.0704


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]
[I 2025-05-09 13:38:54,666] Trial 44 finished with value: 2.96597145153926 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.20060275582711562, 'learning_rate': 0.0006752841232516713, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 2.9656 | Val Loss: 3.1664
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:04<00:00, 12.10it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.68it/s]


Epoch 1/10:
Train Loss: 2.7287 | Val Loss: 2.2923


Training: 100%|██████████| 50/50 [00:04<00:00, 12.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.82it/s]


Epoch 2/10:
Train Loss: 2.2508 | Val Loss: 2.1183


Training: 100%|██████████| 50/50 [00:04<00:00, 12.06it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.22it/s]


Epoch 3/10:
Train Loss: 2.1063 | Val Loss: 1.9944


Training: 100%|██████████| 50/50 [00:04<00:00, 12.04it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.56it/s]


Epoch 4/10:
Train Loss: 1.9407 | Val Loss: 1.7805


Training: 100%|██████████| 50/50 [00:04<00:00, 12.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.11it/s]


Epoch 5/10:
Train Loss: 1.7157 | Val Loss: 1.5376


Training: 100%|██████████| 50/50 [00:04<00:00, 12.03it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.86it/s]


Epoch 6/10:
Train Loss: 1.4867 | Val Loss: 1.2600


Training: 100%|██████████| 50/50 [00:04<00:00, 12.11it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.78it/s]


Epoch 7/10:
Train Loss: 1.2190 | Val Loss: 1.0055


Training: 100%|██████████| 50/50 [00:04<00:00, 12.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.98it/s]


Epoch 8/10:
Train Loss: 0.9881 | Val Loss: 0.7972


Training: 100%|██████████| 50/50 [00:04<00:00, 12.07it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.35it/s]


Epoch 9/10:
Train Loss: 0.7879 | Val Loss: 0.6570


Training: 100%|██████████| 50/50 [00:04<00:00, 12.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.87it/s]
[I 2025-05-09 13:39:39,305] Trial 45 finished with value: 0.4912764567595262 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.17993163153967587, 'learning_rate': 0.0004799814812883458, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.6285 | Val Loss: 0.4913


Training: 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.89it/s]


Epoch 1/10:
Train Loss: 3.1099 | Val Loss: 2.9808


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.85it/s]


Epoch 2/10:
Train Loss: 2.9951 | Val Loss: 2.9496


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.88it/s]


Epoch 3/10:
Train Loss: 2.9740 | Val Loss: 2.9047


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.81it/s]


Epoch 4/10:
Train Loss: 2.5188 | Val Loss: 2.2658


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.83it/s]


Epoch 5/10:
Train Loss: 2.2442 | Val Loss: 2.1697


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.84it/s]


Epoch 6/10:
Train Loss: 2.1223 | Val Loss: 2.0693


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.86it/s]


Epoch 7/10:
Train Loss: 2.0271 | Val Loss: 2.0011


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.86it/s]


Epoch 8/10:
Train Loss: 1.9875 | Val Loss: 1.9663


Training: 100%|██████████| 50/50 [00:13<00:00,  3.59it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.83it/s]


Epoch 9/10:
Train Loss: 1.8690 | Val Loss: 1.8940


Training: 100%|██████████| 50/50 [00:13<00:00,  3.58it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.87it/s]
[I 2025-05-09 13:42:11,439] Trial 46 finished with value: 1.8811398561184223 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.20688148256931863, 'learning_rate': 0.0002838267726793432, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.7832 | Val Loss: 1.8811


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.77it/s]


Epoch 1/10:
Train Loss: 2.6997 | Val Loss: 2.2055


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.85it/s]


Epoch 2/10:
Train Loss: 2.1785 | Val Loss: 2.0491


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.81it/s]


Epoch 3/10:
Train Loss: 2.0293 | Val Loss: 1.9270


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.67it/s]


Epoch 4/10:
Train Loss: 1.8566 | Val Loss: 1.7315


Training: 100%|██████████| 50/50 [00:06<00:00,  7.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.55it/s]


Epoch 5/10:
Train Loss: 1.6350 | Val Loss: 1.4630


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]


Epoch 6/10:
Train Loss: 1.3698 | Val Loss: 1.1683


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.85it/s]


Epoch 7/10:
Train Loss: 1.1090 | Val Loss: 0.8942


Training: 100%|██████████| 50/50 [00:06<00:00,  7.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.82it/s]


Epoch 8/10:
Train Loss: 0.8738 | Val Loss: 0.7445


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s]


Epoch 9/10:
Train Loss: 0.6856 | Val Loss: 0.5724


Training: 100%|██████████| 50/50 [00:06<00:00,  7.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 21.86it/s]
[I 2025-05-09 13:43:25,362] Trial 47 finished with value: 0.4880194618151738 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.22448093515690337, 'learning_rate': 0.00020269134875390755, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 0.5527 | Val Loss: 0.4880


Training: 100%|██████████| 50/50 [00:05<00:00,  9.59it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.77it/s]


Epoch 1/10:
Train Loss: 3.0601 | Val Loss: 2.4403


Training: 100%|██████████| 50/50 [00:05<00:00,  9.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.86it/s]


Epoch 2/10:
Train Loss: 2.3906 | Val Loss: 2.3037


Training: 100%|██████████| 50/50 [00:05<00:00,  9.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.89it/s]


Epoch 3/10:
Train Loss: 2.2724 | Val Loss: 2.2003


Training: 100%|██████████| 50/50 [00:05<00:00,  9.68it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 34.00it/s]


Epoch 4/10:
Train Loss: 2.1868 | Val Loss: 2.1027


Training: 100%|██████████| 50/50 [00:05<00:00,  9.82it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.61it/s]


Epoch 5/10:
Train Loss: 2.1048 | Val Loss: 2.0537


Training: 100%|██████████| 50/50 [00:05<00:00,  9.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 34.33it/s]


Epoch 6/10:
Train Loss: 2.0438 | Val Loss: 1.9821


Training: 100%|██████████| 50/50 [00:05<00:00,  9.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.66it/s]


Epoch 7/10:
Train Loss: 1.9845 | Val Loss: 1.9813


Training: 100%|██████████| 50/50 [00:05<00:00,  9.68it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 34.05it/s]


Epoch 8/10:
Train Loss: 1.9408 | Val Loss: 1.9251


Training: 100%|██████████| 50/50 [00:05<00:00,  9.56it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 34.09it/s]


Epoch 9/10:
Train Loss: 1.8927 | Val Loss: 1.8804


Training: 100%|██████████| 50/50 [00:05<00:00,  9.79it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 33.23it/s]
[I 2025-05-09 13:44:20,993] Trial 48 finished with value: 1.8476687027857854 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.15761009572910933, 'learning_rate': 0.00040372281292432635, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 10/10:
Train Loss: 1.8516 | Val Loss: 1.8477


Training: 100%|██████████| 50/50 [00:05<00:00,  8.36it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.76it/s]


Epoch 1/10:
Train Loss: 3.1394 | Val Loss: 2.9757


Training: 100%|██████████| 50/50 [00:05<00:00,  8.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.35it/s]


Epoch 2/10:
Train Loss: 3.0136 | Val Loss: 2.9818


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.41it/s]


Epoch 3/10:
Train Loss: 3.0018 | Val Loss: 2.9731


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.54it/s]


Epoch 4/10:
Train Loss: 2.9985 | Val Loss: 2.9735


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.44it/s]


Epoch 5/10:
Train Loss: 2.9947 | Val Loss: 2.9739


Training: 100%|██████████| 50/50 [00:05<00:00,  8.39it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.65it/s]


Epoch 6/10:
Train Loss: 2.9868 | Val Loss: 2.9686


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.76it/s]


Epoch 7/10:
Train Loss: 2.9857 | Val Loss: 3.0189


Training: 100%|██████████| 50/50 [00:05<00:00,  8.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.59it/s]


Epoch 8/10:
Train Loss: 2.9712 | Val Loss: 3.0893


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.43it/s]
[I 2025-05-09 13:45:19,664] Trial 49 finished with value: 2.968587490228506 and parameters: {'d_model': 512, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.39704343231189176, 'learning_rate': 0.0007598923290018808, 'batch_size': 32}. Best is trial 16 with value: 0.2741393790795253.


Epoch 9/10:
Train Loss: 2.9588 | Val Loss: 3.9448
Early stopping triggered!

Best trial:
  Validation Loss: 0.2741
  Params: 
    d_model: 512
    num_heads: 2
    num_layers: 6
    d_ff: 256
    dropout: 0.21983729110648678
    learning_rate: 0.0003134263502032441
    batch_size: 32


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 25.07it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.05it/s]



Final Evaluation:
Train Loss: 0.1557 | Val Loss: 0.2741
Train Accuracy: 0.9519 | Val Accuracy: 0.9217

Test Decryptions:
Input: 'Please decrypt the following using Caesar cipher: gfbs' | Output: 'ZERKNY SHE PRI MIG MORTAMING MY.' | Expected: 'fear' | ✗
Input: 'Please decrypt the following using Caesar cipher: dpnqvufs' | Output: 'ZERKNY SHE PRI MIG MORTABY. HAVER TO DEM.' | Expected: 'computer' | ✗
Input: 'Please decrypt the following using Caesar cipher:xibu' | Output: 'ZERKEB JE' | Expected: 'what' | ✗
