In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Seed both RNG sources used by this script so repeated runs start from the
# same random state.
random.seed(42)
torch.manual_seed(42)

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Data Preparation

def load_data(file_path):
    """Read the spreadsheet at ``file_path`` and return ``(inputs, outputs)``.

    Only rows whose ``output`` string is at most 200 characters long are
    kept. When more than 2000 rows remain, a random subset of 2000 rows is
    drawn with a fixed ``random_state`` so the selection is repeatable.

    Returns:
        A pair of lists taken from the ``input`` and ``output`` columns,
        aligned row by row.
    """
    frame = pd.read_excel(file_path)

    # Keep only rows whose target text is short enough
    short_enough = frame['output'].str.len() <= 200
    frame = frame[short_enough]

    # Cap the working set at 2000 rows
    if len(frame) > 2000:
        frame = frame.sample(n=2000, random_state=42)

    return frame['input'].tolist(), frame['output'].tolist()

# Tokenization and Vocabulary
class Vocabulary:
    """Character-level vocabulary: four special tokens plus ``string.printable``.

    Ids 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``;
    every printable character follows in ``string.printable`` order.
    """

    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        self.pad_token = 0
        self.sos_token = 1
        self.eos_token = 2
        self.unk_token = 3
        self._build_vocab()

    def _build_vocab(self):
        """Populate the symbol->id and id->symbol lookup tables."""
        special_tokens = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
        # Special tokens first so their positions match the *_token ids above
        symbols = special_tokens + list(string.printable)

        self.char2idx = {symbol: position for position, symbol in enumerate(symbols)}
        self.idx2char = {position: symbol for symbol, position in self.char2idx.items()}

    def __len__(self):
        """Number of distinct symbols, special tokens included."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of ``text`` to its id (unknown characters -> ``<UNK>`` id)."""
        token_ids = []
        for char in text:
            token_ids.append(self.char2idx.get(char, self.unk_token))
        return token_ids

    def decode(self, indices):
        """Turn ids back into text, skipping PAD/SOS/EOS; unknown ids render as ``<UNK>``."""
        skipped = {self.pad_token, self.sos_token, self.eos_token}
        pieces = []
        for idx in indices:
            if idx in skipped:
                continue
            pieces.append(self.idx2char.get(idx, '<UNK>'))
        return ''.join(pieces)

# Dataset Class
class CipherDataset(data.Dataset):
    """Pairs of (input text, output text) encoded as fixed-length id tensors.

    Each item is wrapped as ``[SOS] + ids + [EOS]`` and right-padded with the
    pad id to exactly ``max_length`` entries.

    Args:
        inputs: Sequence of source texts (each is passed through ``str``).
        outputs: Sequence of target texts, aligned with ``inputs``.
        vocab: Object providing ``encode(text)`` and the ``sos_token``,
            ``eos_token`` and ``pad_token`` ids.
        max_length: Length of every returned tensor.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def _encode_fixed(self, text):
        """Encode ``text`` to exactly ``max_length`` ids, always keeping EOS.

        Over-long texts are cut *before* SOS/EOS are added. Truncating the
        finished sequence instead would silently drop the EOS marker.
        """
        body_limit = max(self.max_length - 2, 0)
        body = self.vocab.encode(text)[:body_limit]
        token_ids = [self.vocab.sos_token] + body + [self.vocab.eos_token]
        # Right-pad up to max_length (a non-positive count adds nothing)
        token_ids = token_ids + [self.vocab.pad_token] * (self.max_length - len(token_ids))
        # Only has an effect for degenerate max_length values below 2
        return token_ids[:self.max_length]

    def __getitem__(self, idx):
        """Return ``(input_ids, output_ids)`` as two integer tensors of length ``max_length``."""
        input_ids = self._encode_fixed(str(self.inputs[idx]))
        output_ids = self._encode_fixed(str(self.outputs[idx]))
        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(output_ids, dtype=torch.long),
        )

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections.

    ``d_model`` is split evenly across ``num_heads`` heads of width ``d_k``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Projections for queries, keys, values and the merged head output
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend over ``K``/``V`` with queries ``Q``; positions where ``mask == 0`` are suppressed."""
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Large negative score -> (near) zero weight after the softmax
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, d_k)``."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: merge the heads back into ``(batch, seq, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        merged = x.permute(0, 2, 1, 3).contiguous()
        return merged.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge and project the result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward block: ``d_model -> d_ff -> d_model`` with a ReLU in between."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``d_ff``, apply the ReLU, and project back to ``d_model``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is stored as the non-trainable buffer
    ``pe`` with shape ``(1, max_seq_length, d_model)``.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_seq_length: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency rather than fail on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer's output goes through dropout, is added back to its input
    and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is forwarded to the self-attention."""
        # Sub-layer 1: self-attention + residual + norm
        attended = self.self_attn(x, x, x, mask)
        after_attn = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward + residual + norm
        transformed = self.feed_forward(after_attn)
        return self.norm2(after_attn + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention over the encoder output, feed-forward.

    Each sub-layer's output goes through dropout, is added back to its input
    and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` is applied in the self-attention, ``src_mask`` in the
        cross-attention.
        """
        # Sub-layer 1: self-attention over the decoder input
        self_context = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_context))

        # Sub-layer 2: queries from the decoder, keys/values from the encoder
        cross_context = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_context))

        # Sub-layer 3: position-wise feed-forward
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the encoder embedding table.
        tgt_vocab_size: Number of rows in the decoder embedding table and
            width of the output logits.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of each feed-forward sub-layer.
        max_seq_length: Size of the positional-encoding table.
        dropout: Dropout probability used on the embeddings and inside every layer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the attention masks for a batch of ``(batch, seq)`` id tensors.

        Token id 0 is treated as padding. ``src_mask`` hides padded source
        positions; ``tgt_mask`` combines the target padding mask with a
        lower-triangular (causal) mask so a position cannot attend to later
        ones.

        Returns:
            ``(src_mask, tgt_mask)`` boolean tensors with shapes
            ``(batch, 1, 1, src_len)`` and ``(batch, 1, tgt_len, tgt_len)``.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        # Create the causal mask on the same device as the input rather than
        # on a module-level global, so the model works wherever it is placed.
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)`` for the given id tensors."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    The decoder is fed the target without its last token and is scored
    against the target without its first token. Gradients are clipped to a
    total norm of 1.0 before every optimiser step.
    """
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc="Training")
    for src, tgt in progress:
        src = src.to(device)
        tgt = tgt.to(device)
        decoder_input, labels = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        # Flatten to (tokens, vocab) vs (tokens,) for the criterion
        flat_logits = logits.contiguous().view(-1, logits.size(-1))
        flat_labels = labels.contiguous().view(-1)
        loss = criterion(flat_logits, flat_labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Return the mean batch loss over ``val_loader`` with gradients disabled.

    Uses the same shifted decoder-input / label scheme as training.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        progress = tqdm(val_loader, desc="Evaluating")
        for src, tgt in progress:
            src = src.to(device)
            tgt = tgt.to(device)
            decoder_input, labels = tgt[:, :-1], tgt[:, 1:]

            logits = model(src, decoder_input)
            flat_logits = logits.contiguous().view(-1, logits.size(-1))
            flat_labels = labels.contiguous().view(-1)
            total_loss += criterion(flat_logits, flat_labels).item()
    return total_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return the fraction of non-padding target tokens the model predicts correctly.

    The model is fed the target without its last token and its arg-max
    predictions are compared with the target shifted by one. Positions whose
    label equals ``vocab.pad_token`` are ignored. Returns 0 when there are no
    non-padding labels.
    """
    model.eval()
    hits = 0
    counted = 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            labels = tgt[:, 1:]
            is_real = labels != vocab.pad_token

            guesses = model(src, tgt[:, :-1]).argmax(dim=-1)
            hits += ((guesses == labels) & is_real).sum().item()
            counted += is_real.sum().item()
    if counted > 0:
        return hits / counted
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train for up to ``epochs`` epochs with early stopping and return the best validation loss.

    After every epoch the validation loss is passed to ``scheduler.step``.
    Training stops early once the validation loss has failed to improve for
    ``patience`` consecutive epochs.

    On return, ``model`` holds the weights from the epoch with the lowest
    validation loss, so the returned loss describes the weights a caller
    would save. Previously the model kept whatever the last epoch produced.

    Args:
        model: Module to train; its weights are modified in place.
        train_loader: Batches used for optimisation.
        val_loader: Batches used for validation.
        optimizer: Optimiser driving the updates.
        criterion: Loss function.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.

    Returns:
        The lowest validation loss observed (``inf`` if no epoch improved on it).
    """
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot the weights: later epochs may make the model worse
            best_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            # ">=" also stops correctly for patience values below 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered!")
                break

    # Roll back to the best epoch so the weights match the returned loss
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss  # Return the best validation loss from this training run

# Module-level record of the best result seen across all Optuna trials:
# the winning state_dict, its validation loss, and the config that produced it.
best_overall_model = None
best_config = None
best_overall_loss = math.inf

# Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one sampled configuration and return its best validation loss.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``vocab``,
    ``max_length`` and ``device``. When the trial beats the best loss recorded
    so far, the module-level trackers are updated and the model's state_dict
    is written to the checkpoint path below.
    """
    global best_overall_model, best_overall_loss, best_config

    # Sample the hyperparameters for this trial
    config = {
        "d_model": trial.suggest_categorical("d_model", [128, 256, 512]),
        "num_heads": trial.suggest_categorical("num_heads", [2, 4, 8, 16]),
        "num_layers": trial.suggest_categorical("num_layers", [6, 8, 10, 12]),
        "d_ff": trial.suggest_categorical("d_ff", [256, 512, 1024]),
        "dropout": trial.suggest_float("dropout", 0.1, 0.4),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32]),
    }

    # Data loaders sized by the sampled batch size
    batch_size = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

    # Source and target share one vocabulary
    vocab_size = len(vocab)
    model = Transformer(
        vocab_size,
        vocab_size,
        config["d_model"],
        config["num_heads"],
        config["num_layers"],
        config["d_ff"],
        max_length,
        config["dropout"],
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    current_val_loss = train_model(
        model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10
    )

    # Nothing more to record unless this trial is strictly better
    if not (current_val_loss < best_overall_loss):
        return current_val_loss

    best_overall_loss = current_val_loss
    best_overall_model = copy.deepcopy(model.state_dict())
    best_config = config
    torch.save(best_overall_model, '/content/drive/MyDrive/best_mono_key_1.pth')
    print(f"New best model found! Val Loss: {current_val_loss:.4f}")
    print(f"Config: {config}")

    return current_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for ``text`` and return it as a string.

    The source is encoded as ``[SOS] + ids + [EOS]``, padded/truncated to
    ``max_length``. Starting from a lone SOS token, the highest-scoring next
    token is appended one step at a time until EOS is predicted or
    ``max_length - 1`` tokens have been generated.
    """
    model.eval()
    with torch.no_grad():
        # Fixed-length source sequence, batch of one
        source_ids = [vocab.sos_token] + vocab.encode(str(text)) + [vocab.eos_token]
        padding_needed = max_length - len(source_ids)
        source_ids = source_ids + [vocab.pad_token] * padding_needed
        encoded = torch.tensor(source_ids[:max_length]).unsqueeze(0).to(device)

        target = torch.tensor([[vocab.sos_token]]).to(device)

        for _ in range(1, max_length):
            logits = model(encoded, target)
            # Prediction for the most recent position only
            next_token = logits[:, -1].argmax(dim=-1).item()
            if next_token == vocab.eos_token:
                break
            step = torch.tensor([[next_token]]).to(device)
            target = torch.cat([target, step], dim=1)

        generated = target[0].cpu().numpy()
        return vocab.decode(generated)

# Main Execution
if __name__ == "__main__":
    # Load and prepare data
    inputs, outputs = load_data('/content/Full_training_mono_one.xlsx')
    vocab = Vocabulary()
    # Fixed length of every encoded sequence and size of the model's
    # positional-encoding table; read as a global by objective().
    max_length = 256  # Adjusted for longer sentences

    # Split data (80% train / 20% validation, fixed seed)
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets (read as globals by objective())
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization.
    # The sampler is seeded so the sequence of suggested configurations is
    # repeatable, in line with the seeding done at the top of the file.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=50)

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Rebuild the architecture of the best trial (best_config is filled in by
    # objective()) and load the checkpoint it saved.
    final_model = Transformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    # map_location lets a checkpoint written on a GPU be loaded on whichever
    # device this run is using (e.g. a CPU-only runtime).
    final_model.load_state_dict(
        torch.load('/content/drive/MyDrive/best_mono_key_1.pth', map_location=device)
    )

    # Evaluate on full datasets (no shuffling needed for evaluation)
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption: (prompt, expected plaintext) pairs
    # NOTE(review): the third prompt has no space after "cipher:", unlike the
    # first two — confirm whether that is intentional.
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher:xibu", "what")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cuda


[I 2025-05-09 12:35:10,533] A new study created in memory with name: no-name-de230b32-87ef-4e78-816b-621a26e25328
Training: 100%|██████████| 50/50 [00:10<00:00,  4.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.50it/s]


Epoch 1/10:
Train Loss: 2.8700 | Val Loss: 2.3192


Training: 100%|██████████| 50/50 [00:10<00:00,  4.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.53it/s]


Epoch 2/10:
Train Loss: 2.2138 | Val Loss: 2.0697


Training: 100%|██████████| 50/50 [00:10<00:00,  4.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.56it/s]


Epoch 3/10:
Train Loss: 2.0429 | Val Loss: 1.9272


Training: 100%|██████████| 50/50 [00:10<00:00,  4.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.55it/s]


Epoch 4/10:
Train Loss: 1.9430 | Val Loss: 1.8455


Training: 100%|██████████| 50/50 [00:10<00:00,  4.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.56it/s]


Epoch 5/10:
Train Loss: 1.8539 | Val Loss: 1.7363


Training: 100%|██████████| 50/50 [00:10<00:00,  4.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.56it/s]


Epoch 6/10:
Train Loss: 1.7437 | Val Loss: 1.6059


Training: 100%|██████████| 50/50 [00:10<00:00,  4.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.54it/s]


Epoch 7/10:
Train Loss: 1.5875 | Val Loss: 1.3927


Training: 100%|██████████| 50/50 [00:10<00:00,  4.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.50it/s]


Epoch 8/10:
Train Loss: 1.4265 | Val Loss: 1.2691


Training: 100%|██████████| 50/50 [00:10<00:00,  4.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.41it/s]


Epoch 9/10:
Train Loss: 1.3062 | Val Loss: 1.1090


Training: 100%|██████████| 50/50 [00:10<00:00,  4.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 16.53it/s]


Epoch 10/10:
Train Loss: 1.1924 | Val Loss: 0.9924


[I 2025-05-09 12:37:01,934] Trial 0 finished with value: 0.992371930525853 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.19252646588087544, 'learning_rate': 0.00010672067372427402, 'batch_size': 32}. Best is trial 0 with value: 0.992371930525853.


New best model found! Val Loss: 0.9924
Config: {'d_model': 256, 'num_heads': 16, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.19252646588087544, 'learning_rate': 0.00010672067372427402, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:04<00:00, 11.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.27it/s]


Epoch 1/10:
Train Loss: 3.1799 | Val Loss: 2.4916


Training: 100%|██████████| 50/50 [00:04<00:00, 12.16it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.72it/s]


Epoch 2/10:
Train Loss: 2.3660 | Val Loss: 2.1764


Training: 100%|██████████| 50/50 [00:04<00:00, 11.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.33it/s]


Epoch 3/10:
Train Loss: 2.1503 | Val Loss: 2.0101


Training: 100%|██████████| 50/50 [00:04<00:00, 12.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.89it/s]


Epoch 4/10:
Train Loss: 1.9811 | Val Loss: 1.7846


Training: 100%|██████████| 50/50 [00:04<00:00, 12.02it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.41it/s]


Epoch 5/10:
Train Loss: 1.7854 | Val Loss: 1.5472


Training: 100%|██████████| 50/50 [00:04<00:00, 12.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.32it/s]


Epoch 6/10:
Train Loss: 1.6016 | Val Loss: 1.3015


Training: 100%|██████████| 50/50 [00:04<00:00, 11.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 39.40it/s]


Epoch 7/10:
Train Loss: 1.4420 | Val Loss: 1.1434


Training: 100%|██████████| 50/50 [00:04<00:00, 12.18it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.00it/s]


Epoch 8/10:
Train Loss: 1.3072 | Val Loss: 1.0047


Training: 100%|██████████| 50/50 [00:04<00:00, 11.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.92it/s]


Epoch 9/10:
Train Loss: 1.1786 | Val Loss: 0.8300


Training: 100%|██████████| 50/50 [00:04<00:00, 12.02it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.26it/s]
[I 2025-05-09 12:37:46,912] Trial 1 finished with value: 0.7493924819506131 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.20140947311797008, 'learning_rate': 0.0002546183452891027, 'batch_size': 32}. Best is trial 1 with value: 0.7493924819506131.


Epoch 10/10:
Train Loss: 1.0750 | Val Loss: 0.7494
New best model found! Val Loss: 0.7494
Config: {'d_model': 128, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.20140947311797008, 'learning_rate': 0.0002546183452891027, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:03<00:00, 14.80it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.56it/s]


Epoch 1/10:
Train Loss: 2.9318 | Val Loss: 2.3828


Training: 100%|██████████| 50/50 [00:03<00:00, 14.83it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.89it/s]


Epoch 2/10:
Train Loss: 2.2931 | Val Loss: 2.1576


Training: 100%|██████████| 50/50 [00:03<00:00, 14.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.38it/s]


Epoch 3/10:
Train Loss: 2.1031 | Val Loss: 1.9642


Training: 100%|██████████| 50/50 [00:03<00:00, 14.76it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.17it/s]


Epoch 4/10:
Train Loss: 1.8337 | Val Loss: 1.5191


Training: 100%|██████████| 50/50 [00:03<00:00, 14.49it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.94it/s]


Epoch 5/10:
Train Loss: 1.5035 | Val Loss: 1.2301


Training: 100%|██████████| 50/50 [00:03<00:00, 14.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.02it/s]


Epoch 6/10:
Train Loss: 1.2432 | Val Loss: 0.9291


Training: 100%|██████████| 50/50 [00:03<00:00, 14.90it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.79it/s]


Epoch 7/10:
Train Loss: 1.0557 | Val Loss: 0.6986


Training: 100%|██████████| 50/50 [00:03<00:00, 14.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.15it/s]


Epoch 8/10:
Train Loss: 0.9063 | Val Loss: 0.5917


Training: 100%|██████████| 50/50 [00:03<00:00, 14.82it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 49.30it/s]


Epoch 9/10:
Train Loss: 0.7943 | Val Loss: 0.5300


Training: 100%|██████████| 50/50 [00:03<00:00, 14.76it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 50.00it/s]
[I 2025-05-09 12:38:23,602] Trial 2 finished with value: 0.47525370808748096 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2199004951707324, 'learning_rate': 0.0007729262990210787, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 10/10:
Train Loss: 0.7103 | Val Loss: 0.4753
New best model found! Val Loss: 0.4753
Config: {'d_model': 128, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2199004951707324, 'learning_rate': 0.0007729262990210787, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.43it/s]


Epoch 1/10:
Train Loss: 3.2118 | Val Loss: 3.0061


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.46it/s]


Epoch 2/10:
Train Loss: 3.0048 | Val Loss: 2.9922


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.43it/s]


Epoch 3/10:
Train Loss: 2.9940 | Val Loss: 2.9911


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.43it/s]


Epoch 4/10:
Train Loss: 2.9881 | Val Loss: 2.9926


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.41it/s]


Epoch 5/10:
Train Loss: 2.9825 | Val Loss: 2.9847


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.44it/s]


Epoch 6/10:
Train Loss: 2.9789 | Val Loss: 2.9830


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.44it/s]


Epoch 7/10:
Train Loss: 2.9769 | Val Loss: 2.9838


Training: 100%|██████████| 50/50 [00:11<00:00,  4.44it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.43it/s]


Epoch 8/10:
Train Loss: 2.9765 | Val Loss: 2.9840


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.44it/s]


Epoch 9/10:
Train Loss: 2.9765 | Val Loss: 2.9817


Training: 100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.43it/s]
[I 2025-05-09 12:40:26,026] Trial 3 finished with value: 2.981670489678016 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.12034323071170186, 'learning_rate': 0.003460416943792857, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 10/10:
Train Loss: 2.9759 | Val Loss: 2.9822


Training: 100%|██████████| 50/50 [00:05<00:00,  9.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.85it/s]


Epoch 1/10:
Train Loss: 3.0856 | Val Loss: 3.0033


Training: 100%|██████████| 50/50 [00:05<00:00,  9.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.02it/s]


Epoch 2/10:
Train Loss: 2.9878 | Val Loss: 3.0008


Training: 100%|██████████| 50/50 [00:05<00:00,  8.99it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.86it/s]


Epoch 3/10:
Train Loss: 2.9818 | Val Loss: 2.9949


Training: 100%|██████████| 50/50 [00:05<00:00,  9.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.39it/s]


Epoch 4/10:
Train Loss: 2.9794 | Val Loss: 2.9933


Training: 100%|██████████| 50/50 [00:05<00:00,  9.21it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.84it/s]


Epoch 5/10:
Train Loss: 2.9792 | Val Loss: 2.9946


Training: 100%|██████████| 50/50 [00:05<00:00,  9.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 26.02it/s]


Epoch 6/10:
Train Loss: 2.9788 | Val Loss: 2.9849


Training: 100%|██████████| 50/50 [00:05<00:00,  9.20it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.81it/s]


Epoch 7/10:
Train Loss: 2.9785 | Val Loss: 2.9839


Training: 100%|██████████| 50/50 [00:05<00:00,  9.22it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.74it/s]


Epoch 8/10:
Train Loss: 2.9765 | Val Loss: 2.9836


Training: 100%|██████████| 50/50 [00:05<00:00,  9.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.69it/s]


Epoch 9/10:
Train Loss: 2.9769 | Val Loss: 2.9857


Training: 100%|██████████| 50/50 [00:05<00:00,  9.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 25.90it/s]
[I 2025-05-09 12:41:25,682] Trial 4 finished with value: 2.9836441370157094 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.2669365664281005, 'learning_rate': 0.009842588166991358, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 10/10:
Train Loss: 2.9762 | Val Loss: 2.9908


Training: 100%|██████████| 50/50 [00:07<00:00,  6.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.15it/s]


Epoch 1/10:
Train Loss: 3.1155 | Val Loss: 2.9967


Training: 100%|██████████| 50/50 [00:07<00:00,  6.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.04it/s]


Epoch 2/10:
Train Loss: 2.9986 | Val Loss: 2.9845


Training: 100%|██████████| 50/50 [00:07<00:00,  6.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.83it/s]


Epoch 3/10:
Train Loss: 2.9921 | Val Loss: 2.9831


Training: 100%|██████████| 50/50 [00:07<00:00,  6.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.09it/s]


Epoch 4/10:
Train Loss: 2.9898 | Val Loss: 2.9845


Training: 100%|██████████| 50/50 [00:07<00:00,  6.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.06it/s]


Epoch 5/10:
Train Loss: 2.9850 | Val Loss: 2.9867


Training: 100%|██████████| 50/50 [00:07<00:00,  6.43it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.04it/s]


Epoch 6/10:
Train Loss: 2.9842 | Val Loss: 2.9817


Training: 100%|██████████| 50/50 [00:07<00:00,  6.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.08it/s]


Epoch 7/10:
Train Loss: 2.9806 | Val Loss: 3.0087


Training: 100%|██████████| 50/50 [00:07<00:00,  6.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.12it/s]


Epoch 8/10:
Train Loss: 2.9679 | Val Loss: 3.1108


Training: 100%|██████████| 50/50 [00:07<00:00,  6.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.08it/s]
[I 2025-05-09 12:42:42,403] Trial 5 finished with value: 2.981669829441951 and parameters: {'d_model': 512, 'num_heads': 4, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.21281529558765225, 'learning_rate': 0.0007625665351575371, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 9/10:
Train Loss: 2.9521 | Val Loss: 3.3982
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:07<00:00,  6.90it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.99it/s]


Epoch 1/10:
Train Loss: 3.1924 | Val Loss: 2.6445


Training: 100%|██████████| 50/50 [00:07<00:00,  6.88it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.96it/s]


Epoch 2/10:
Train Loss: 2.4937 | Val Loss: 2.3269


Training: 100%|██████████| 50/50 [00:07<00:00,  6.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.94it/s]


Epoch 3/10:
Train Loss: 2.2596 | Val Loss: 2.1543


Training: 100%|██████████| 50/50 [00:07<00:00,  6.90it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.03it/s]


Epoch 4/10:
Train Loss: 2.1194 | Val Loss: 2.0370


Training: 100%|██████████| 50/50 [00:07<00:00,  6.88it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.01it/s]


Epoch 5/10:
Train Loss: 2.0213 | Val Loss: 1.9411


Training: 100%|██████████| 50/50 [00:07<00:00,  6.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.81it/s]


Epoch 6/10:
Train Loss: 1.9374 | Val Loss: 1.8557


Training: 100%|██████████| 50/50 [00:07<00:00,  6.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.02it/s]


Epoch 7/10:
Train Loss: 1.8511 | Val Loss: 1.7816


Training: 100%|██████████| 50/50 [00:07<00:00,  6.90it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.11it/s]


Epoch 8/10:
Train Loss: 1.7815 | Val Loss: 1.7116


Training: 100%|██████████| 50/50 [00:07<00:00,  6.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 19.97it/s]


Epoch 9/10:
Train Loss: 1.6967 | Val Loss: 1.5936


Training: 100%|██████████| 50/50 [00:07<00:00,  6.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.06it/s]
[I 2025-05-09 12:44:01,665] Trial 6 finished with value: 1.5124260095449595 and parameters: {'d_model': 128, 'num_heads': 16, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11268993017884771, 'learning_rate': 0.000124102770335263, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 10/10:
Train Loss: 1.6053 | Val Loss: 1.5124


Training: 100%|██████████| 50/50 [00:04<00:00, 10.09it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.97it/s]


Epoch 1/10:
Train Loss: 3.0993 | Val Loss: 2.9889


Training: 100%|██████████| 50/50 [00:04<00:00, 10.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 31.30it/s]


Epoch 2/10:
Train Loss: 2.9910 | Val Loss: 2.9839


Training: 100%|██████████| 50/50 [00:04<00:00, 10.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.47it/s]


Epoch 3/10:
Train Loss: 2.9859 | Val Loss: 2.9889


Training: 100%|██████████| 50/50 [00:04<00:00, 10.19it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 31.15it/s]


Epoch 4/10:
Train Loss: 2.9769 | Val Loss: 3.1017


Training: 100%|██████████| 50/50 [00:04<00:00, 10.23it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.46it/s]
[I 2025-05-09 12:44:28,507] Trial 7 finished with value: 2.9839000151707578 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.23263862615128364, 'learning_rate': 0.0016995014964641094, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 5/10:
Train Loss: 2.9614 | Val Loss: 3.1251
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:03<00:00, 14.53it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.33it/s]


Epoch 1/10:
Train Loss: 3.1126 | Val Loss: 2.9850


Training: 100%|██████████| 50/50 [00:03<00:00, 15.00it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.26it/s]


Epoch 2/10:
Train Loss: 2.8656 | Val Loss: 2.5034


Training: 100%|██████████| 50/50 [00:03<00:00, 14.99it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 47.66it/s]


Epoch 3/10:
Train Loss: 2.3784 | Val Loss: 2.3067


Training: 100%|██████████| 50/50 [00:03<00:00, 14.54it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.03it/s]


Epoch 4/10:
Train Loss: 2.2595 | Val Loss: 2.2134


Training: 100%|██████████| 50/50 [00:03<00:00, 14.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.21it/s]


Epoch 5/10:
Train Loss: 2.1745 | Val Loss: 2.1442


Training: 100%|██████████| 50/50 [00:03<00:00, 14.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.39it/s]


Epoch 6/10:
Train Loss: 2.1073 | Val Loss: 2.0803


Training: 100%|██████████| 50/50 [00:03<00:00, 14.52it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 47.74it/s]


Epoch 7/10:
Train Loss: 2.0525 | Val Loss: 2.0376


Training: 100%|██████████| 50/50 [00:03<00:00, 14.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.10it/s]


Epoch 8/10:
Train Loss: 1.9983 | Val Loss: 2.0044


Training: 100%|██████████| 50/50 [00:03<00:00, 14.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.15it/s]


Epoch 9/10:
Train Loss: 1.9543 | Val Loss: 1.9450


Training: 100%|██████████| 50/50 [00:03<00:00, 14.69it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 48.06it/s]
[I 2025-05-09 12:45:05,175] Trial 8 finished with value: 1.9172807014905489 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.18295918095240976, 'learning_rate': 0.0012567899375324643, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 10/10:
Train Loss: 1.9121 | Val Loss: 1.9173


Training: 100%|██████████| 50/50 [00:11<00:00,  4.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.57it/s]


Epoch 1/10:
Train Loss: 3.1190 | Val Loss: 3.0107


Training: 100%|██████████| 50/50 [00:11<00:00,  4.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.50it/s]


Epoch 2/10:
Train Loss: 2.9943 | Val Loss: 3.0110


Training: 100%|██████████| 50/50 [00:11<00:00,  4.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.64it/s]


Epoch 3/10:
Train Loss: 2.9846 | Val Loss: 3.0032


Training: 100%|██████████| 50/50 [00:11<00:00,  4.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.56it/s]


Epoch 4/10:
Train Loss: 2.9799 | Val Loss: 2.9939


Training: 100%|██████████| 50/50 [00:11<00:00,  4.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.64it/s]


Epoch 5/10:
Train Loss: 2.9789 | Val Loss: 2.9981


Training: 100%|██████████| 50/50 [00:11<00:00,  4.38it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.55it/s]


Epoch 6/10:
Train Loss: 2.9763 | Val Loss: 2.9948


Training: 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.59it/s]


Epoch 7/10:
Train Loss: 2.9767 | Val Loss: 2.9930


Training: 100%|██████████| 50/50 [00:11<00:00,  4.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.60it/s]


Epoch 8/10:
Train Loss: 2.9755 | Val Loss: 3.0035


Training: 100%|██████████| 50/50 [00:11<00:00,  4.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.63it/s]


Epoch 9/10:
Train Loss: 2.9757 | Val Loss: 2.9917


Training: 100%|██████████| 50/50 [00:11<00:00,  4.37it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.61it/s]
[I 2025-05-09 12:47:07,451] Trial 9 finished with value: 2.9851812032552867 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.30457691578381635, 'learning_rate': 0.00962973612470409, 'batch_size': 32}. Best is trial 2 with value: 0.47525370808748096.


Epoch 10/10:
Train Loss: 2.9745 | Val Loss: 2.9852


Training: 100%|██████████| 50/50 [00:04<00:00, 10.87it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.72it/s]


Epoch 1/10:
Train Loss: 2.7769 | Val Loss: 2.2074


Training: 100%|██████████| 50/50 [00:04<00:00, 10.90it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.48it/s]


Epoch 2/10:
Train Loss: 2.1358 | Val Loss: 1.9070


Training: 100%|██████████| 50/50 [00:04<00:00, 10.90it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.18it/s]


Epoch 3/10:
Train Loss: 1.8453 | Val Loss: 1.4452


Training: 100%|██████████| 50/50 [00:04<00:00, 10.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.88it/s]


Epoch 4/10:
Train Loss: 1.5167 | Val Loss: 1.0666


Training: 100%|██████████| 50/50 [00:04<00:00, 10.85it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 41.83it/s]


Epoch 5/10:
Train Loss: 1.2607 | Val Loss: 0.8783


Training: 100%|██████████| 50/50 [00:04<00:00, 10.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.87it/s]


Epoch 6/10:
Train Loss: 1.0735 | Val Loss: 0.7008


Training: 100%|██████████| 50/50 [00:04<00:00, 10.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.36it/s]


Epoch 7/10:
Train Loss: 0.9335 | Val Loss: 0.5809


Training: 100%|██████████| 50/50 [00:04<00:00, 10.90it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.60it/s]


Epoch 8/10:
Train Loss: 0.8190 | Val Loss: 0.5563


Training: 100%|██████████| 50/50 [00:04<00:00, 10.91it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.83it/s]


Epoch 9/10:
Train Loss: 0.7288 | Val Loss: 0.4942


Training: 100%|██████████| 50/50 [00:04<00:00, 10.89it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 42.69it/s]
[I 2025-05-09 12:47:56,750] Trial 10 finished with value: 0.4224349627128014 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.36881404613434304, 'learning_rate': 0.00046243567752343257, 'batch_size': 32}. Best is trial 10 with value: 0.4224349627128014.


Epoch 10/10:
Train Loss: 0.6695 | Val Loss: 0.4224
New best model found! Val Loss: 0.4224
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.36881404613434304, 'learning_rate': 0.00046243567752343257, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.59it/s]


Epoch 1/10:
Train Loss: 2.7550 | Val Loss: 2.2175


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.30it/s]


Epoch 2/10:
Train Loss: 2.1559 | Val Loss: 1.9738


Training: 100%|██████████| 50/50 [00:04<00:00, 10.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.89it/s]


Epoch 3/10:
Train Loss: 1.9106 | Val Loss: 1.6421


Training: 100%|██████████| 50/50 [00:04<00:00, 10.77it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.32it/s]


Epoch 4/10:
Train Loss: 1.6501 | Val Loss: 1.3001


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.44it/s]


Epoch 5/10:
Train Loss: 1.4140 | Val Loss: 1.0001


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.60it/s]


Epoch 6/10:
Train Loss: 1.2304 | Val Loss: 0.8516


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 39.94it/s]


Epoch 7/10:
Train Loss: 1.0805 | Val Loss: 0.8364


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.62it/s]


Epoch 8/10:
Train Loss: 0.9604 | Val Loss: 0.6277


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.31it/s]


Epoch 9/10:
Train Loss: 0.8607 | Val Loss: 0.5869


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.69it/s]
[I 2025-05-09 12:48:46,699] Trial 11 finished with value: 0.5024304940150335 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.39540374314296023, 'learning_rate': 0.00043819614152972985, 'batch_size': 32}. Best is trial 10 with value: 0.4224349627128014.


Epoch 10/10:
Train Loss: 0.7845 | Val Loss: 0.5024


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.44it/s]


Epoch 1/10:
Train Loss: 2.7361 | Val Loss: 2.1787


Training: 100%|██████████| 50/50 [00:04<00:00, 10.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 39.82it/s]


Epoch 2/10:
Train Loss: 2.1128 | Val Loss: 1.8825


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.75it/s]


Epoch 3/10:
Train Loss: 1.8320 | Val Loss: 1.4755


Training: 100%|██████████| 50/50 [00:04<00:00, 10.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.54it/s]


Epoch 4/10:
Train Loss: 1.5186 | Val Loss: 1.1045


Training: 100%|██████████| 50/50 [00:04<00:00, 10.76it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.93it/s]


Epoch 5/10:
Train Loss: 1.2764 | Val Loss: 0.8799


Training: 100%|██████████| 50/50 [00:04<00:00, 10.76it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.80it/s]


Epoch 6/10:
Train Loss: 1.0764 | Val Loss: 0.6737


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.65it/s]


Epoch 7/10:
Train Loss: 0.9300 | Val Loss: 0.6678


Training: 100%|██████████| 50/50 [00:04<00:00, 10.77it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.92it/s]


Epoch 8/10:
Train Loss: 0.8219 | Val Loss: 0.5711


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.29it/s]


Epoch 9/10:
Train Loss: 0.7279 | Val Loss: 0.4379


Training: 100%|██████████| 50/50 [00:04<00:00, 10.77it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.63it/s]
[I 2025-05-09 12:49:36,620] Trial 12 finished with value: 0.43794556993704575 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.36092934741432736, 'learning_rate': 0.0004569154787405004, 'batch_size': 32}. Best is trial 10 with value: 0.4224349627128014.


Epoch 10/10:
Train Loss: 0.6575 | Val Loss: 0.4582


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 39.81it/s]


Epoch 1/10:
Train Loss: 2.9023 | Val Loss: 2.2793


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.78it/s]


Epoch 2/10:
Train Loss: 2.2139 | Val Loss: 2.0184


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.36it/s]


Epoch 3/10:
Train Loss: 2.0160 | Val Loss: 1.8298


Training: 100%|██████████| 50/50 [00:04<00:00, 10.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 39.88it/s]


Epoch 4/10:
Train Loss: 1.8041 | Val Loss: 1.5117


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.63it/s]


Epoch 5/10:
Train Loss: 1.5833 | Val Loss: 1.2049


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.35it/s]


Epoch 6/10:
Train Loss: 1.3937 | Val Loss: 1.0185


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.75it/s]


Epoch 7/10:
Train Loss: 1.2368 | Val Loss: 0.8709


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.83it/s]


Epoch 8/10:
Train Loss: 1.1175 | Val Loss: 0.7371


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.91it/s]


Epoch 9/10:
Train Loss: 1.0112 | Val Loss: 0.6895


Training: 100%|██████████| 50/50 [00:04<00:00, 10.76it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.55it/s]
[I 2025-05-09 12:50:26,566] Trial 13 finished with value: 0.6070177417535049 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.376438951086013, 'learning_rate': 0.0002824417233368042, 'batch_size': 32}. Best is trial 10 with value: 0.4224349627128014.


Epoch 10/10:
Train Loss: 0.9088 | Val Loss: 0.6070


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.78it/s]


Epoch 1/10:
Train Loss: 2.7775 | Val Loss: 2.1941


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.88it/s]


Epoch 2/10:
Train Loss: 2.1060 | Val Loss: 1.8434


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.74it/s]


Epoch 3/10:
Train Loss: 1.7840 | Val Loss: 1.3860


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.56it/s]


Epoch 4/10:
Train Loss: 1.4722 | Val Loss: 1.0742


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.74it/s]


Epoch 5/10:
Train Loss: 1.2050 | Val Loss: 0.8470


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.61it/s]


Epoch 6/10:
Train Loss: 1.0200 | Val Loss: 0.6809


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.74it/s]


Epoch 7/10:
Train Loss: 0.8783 | Val Loss: 0.5663


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.38it/s]


Epoch 8/10:
Train Loss: 0.7651 | Val Loss: 0.5262


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.57it/s]


Epoch 9/10:
Train Loss: 0.6909 | Val Loss: 0.4251


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.62it/s]
[I 2025-05-09 12:51:16,647] Trial 14 finished with value: 0.3686918444358386 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.344013713726708, 'learning_rate': 0.0004291111007123159, 'batch_size': 32}. Best is trial 14 with value: 0.3686918444358386.


Epoch 10/10:
Train Loss: 0.6029 | Val Loss: 0.3687
New best model found! Val Loss: 0.3687
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.344013713726708, 'learning_rate': 0.0004291111007123159, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.46it/s]


Epoch 1/10:
Train Loss: 2.9535 | Val Loss: 2.3075


Training: 100%|██████████| 50/50 [00:04<00:00, 10.74it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.21it/s]


Epoch 2/10:
Train Loss: 2.2354 | Val Loss: 2.0587


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.65it/s]


Epoch 3/10:
Train Loss: 2.0456 | Val Loss: 1.8712


Training: 100%|██████████| 50/50 [00:04<00:00, 10.76it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.69it/s]


Epoch 4/10:
Train Loss: 1.8484 | Val Loss: 1.5958


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.17it/s]


Epoch 5/10:
Train Loss: 1.6468 | Val Loss: 1.3208


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.52it/s]


Epoch 6/10:
Train Loss: 1.4442 | Val Loss: 1.0712


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.26it/s]


Epoch 7/10:
Train Loss: 1.2769 | Val Loss: 0.9166


Training: 100%|██████████| 50/50 [00:04<00:00, 10.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.56it/s]


Epoch 8/10:
Train Loss: 1.1477 | Val Loss: 0.8009


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.28it/s]


Epoch 9/10:
Train Loss: 1.0188 | Val Loss: 0.6846


Training: 100%|██████████| 50/50 [00:04<00:00, 10.75it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.55it/s]
[I 2025-05-09 12:52:06,573] Trial 15 finished with value: 0.6247334778308868 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.3388244727305079, 'learning_rate': 0.00021766489138025122, 'batch_size': 32}. Best is trial 14 with value: 0.3686918444358386.


Epoch 10/10:
Train Loss: 0.9208 | Val Loss: 0.6247


Training: 100%|██████████| 50/50 [00:04<00:00, 11.00it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.45it/s]


Epoch 1/10:
Train Loss: 3.0962 | Val Loss: 2.9953


Training: 100%|██████████| 50/50 [00:04<00:00, 10.99it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.49it/s]


Epoch 2/10:
Train Loss: 2.9960 | Val Loss: 2.9902


Training: 100%|██████████| 50/50 [00:04<00:00, 10.99it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.60it/s]


Epoch 3/10:
Train Loss: 2.9864 | Val Loss: 2.9862


Training: 100%|██████████| 50/50 [00:04<00:00, 10.99it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.57it/s]


Epoch 4/10:
Train Loss: 2.9824 | Val Loss: 2.9881


Training: 100%|██████████| 50/50 [00:04<00:00, 10.99it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.37it/s]


Epoch 5/10:
Train Loss: 2.9806 | Val Loss: 2.9855


Training: 100%|██████████| 50/50 [00:04<00:00, 11.01it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.64it/s]


Epoch 6/10:
Train Loss: 2.9781 | Val Loss: 2.9784


Training: 100%|██████████| 50/50 [00:04<00:00, 10.98it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.26it/s]


Epoch 7/10:
Train Loss: 2.9726 | Val Loss: 3.0504


Training: 100%|██████████| 50/50 [00:04<00:00, 11.00it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.47it/s]


Epoch 8/10:
Train Loss: 2.9561 | Val Loss: 3.4239


Training: 100%|██████████| 50/50 [00:04<00:00, 10.98it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 38.38it/s]
[I 2025-05-09 12:52:50,725] Trial 16 finished with value: 2.9784118028787465 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.3136258816730022, 'learning_rate': 0.002286315709357293, 'batch_size': 32}. Best is trial 14 with value: 0.3686918444358386.


Epoch 9/10:
Train Loss: 2.9445 | Val Loss: 3.4084
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:04<00:00, 10.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 39.92it/s]


Epoch 1/10:
Train Loss: 2.6558 | Val Loss: 2.1254


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.16it/s]


Epoch 2/10:
Train Loss: 2.0117 | Val Loss: 1.7691


Training: 100%|██████████| 50/50 [00:04<00:00, 10.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 39.88it/s]


Epoch 3/10:
Train Loss: 1.6396 | Val Loss: 1.2246


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.39it/s]


Epoch 4/10:
Train Loss: 1.2376 | Val Loss: 0.8891


Training: 100%|██████████| 50/50 [00:04<00:00, 10.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.17it/s]


Epoch 5/10:
Train Loss: 0.9869 | Val Loss: 0.6847


Training: 100%|██████████| 50/50 [00:04<00:00, 10.68it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.11it/s]


Epoch 6/10:
Train Loss: 0.8129 | Val Loss: 0.5152


Training: 100%|██████████| 50/50 [00:04<00:00, 10.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.02it/s]


Epoch 7/10:
Train Loss: 0.6624 | Val Loss: 0.3971


Training: 100%|██████████| 50/50 [00:04<00:00, 10.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.25it/s]


Epoch 8/10:
Train Loss: 0.5707 | Val Loss: 0.3642


Training: 100%|██████████| 50/50 [00:04<00:00, 10.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.19it/s]


Epoch 9/10:
Train Loss: 0.4904 | Val Loss: 0.3642


Training: 100%|██████████| 50/50 [00:04<00:00, 10.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 40.28it/s]
[I 2025-05-09 12:53:40,987] Trial 17 finished with value: 0.29550619652638066 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2722300889779533, 'learning_rate': 0.0004354216405735617, 'batch_size': 32}. Best is trial 17 with value: 0.29550619652638066.


Epoch 10/10:
Train Loss: 0.4399 | Val Loss: 0.2955
New best model found! Val Loss: 0.2955
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2722300889779533, 'learning_rate': 0.0004354216405735617, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.80it/s]


Epoch 1/10:
Train Loss: 2.6260 | Val Loss: 2.0848


Training: 100%|██████████| 50/50 [00:05<00:00,  8.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.71it/s]


Epoch 2/10:
Train Loss: 1.9141 | Val Loss: 1.5863


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.83it/s]


Epoch 3/10:
Train Loss: 1.4692 | Val Loss: 1.0832


Training: 100%|██████████| 50/50 [00:05<00:00,  8.43it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.71it/s]


Epoch 4/10:
Train Loss: 1.0743 | Val Loss: 0.7237


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.75it/s]


Epoch 5/10:
Train Loss: 0.8253 | Val Loss: 0.6261


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.64it/s]


Epoch 6/10:
Train Loss: 0.6445 | Val Loss: 0.4960


Training: 100%|██████████| 50/50 [00:05<00:00,  8.41it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.78it/s]


Epoch 7/10:
Train Loss: 0.5189 | Val Loss: 0.3991


Training: 100%|██████████| 50/50 [00:05<00:00,  8.42it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.86it/s]


Epoch 8/10:
Train Loss: 0.4323 | Val Loss: 0.3511


Training: 100%|██████████| 50/50 [00:05<00:00,  8.44it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.88it/s]


Epoch 9/10:
Train Loss: 0.3687 | Val Loss: 0.2913


Training: 100%|██████████| 50/50 [00:05<00:00,  8.40it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 24.67it/s]


Epoch 10/10:
Train Loss: 0.3238 | Val Loss: 0.2308


[I 2025-05-09 12:54:46,199] Trial 18 finished with value: 0.23081068350718573 and parameters: {'d_model': 512, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2809928536387791, 'learning_rate': 0.0001889553801861348, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


New best model found! Val Loss: 0.2308
Config: {'d_model': 512, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2809928536387791, 'learning_rate': 0.0001889553801861348, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.60it/s]


Epoch 1/10:
Train Loss: 2.6192 | Val Loss: 2.1231


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.60it/s]


Epoch 2/10:
Train Loss: 2.0053 | Val Loss: 1.7981


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.61it/s]


Epoch 3/10:
Train Loss: 1.6744 | Val Loss: 1.2962


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.63it/s]


Epoch 4/10:
Train Loss: 1.2996 | Val Loss: 0.9456


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 5/10:
Train Loss: 1.0068 | Val Loss: 0.7397


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 6/10:
Train Loss: 0.8158 | Val Loss: 0.5595


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.62it/s]


Epoch 7/10:
Train Loss: 0.6603 | Val Loss: 0.4775


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.61it/s]


Epoch 8/10:
Train Loss: 0.5569 | Val Loss: 0.4455


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.48it/s]


Epoch 9/10:
Train Loss: 0.4896 | Val Loss: 0.3684


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.55it/s]
[I 2025-05-09 12:56:04,827] Trial 19 finished with value: 0.2819112115181409 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.27287210483799806, 'learning_rate': 0.0001449331101362119, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.4216 | Val Loss: 0.2819


Training: 100%|██████████| 50/50 [00:14<00:00,  3.56it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.83it/s]


Epoch 1/10:
Train Loss: 3.0647 | Val Loss: 2.4474


Training: 100%|██████████| 50/50 [00:14<00:00,  3.56it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.84it/s]


Epoch 2/10:
Train Loss: 2.1972 | Val Loss: 2.0291


Training: 100%|██████████| 50/50 [00:14<00:00,  3.56it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.86it/s]


Epoch 3/10:
Train Loss: 1.9221 | Val Loss: 1.8000


Training: 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.85it/s]


Epoch 4/10:
Train Loss: 1.6012 | Val Loss: 1.3451


Training: 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.83it/s]


Epoch 5/10:
Train Loss: 1.1702 | Val Loss: 0.8339


Training: 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.82it/s]


Epoch 6/10:
Train Loss: 0.8474 | Val Loss: 0.6789


Training: 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.84it/s]


Epoch 7/10:
Train Loss: 0.6660 | Val Loss: 0.5339


Training: 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.85it/s]


Epoch 8/10:
Train Loss: 0.5513 | Val Loss: 0.4418


Training: 100%|██████████| 50/50 [00:14<00:00,  3.57it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.85it/s]


Epoch 9/10:
Train Loss: 0.4801 | Val Loss: 0.3921


Training: 100%|██████████| 50/50 [00:14<00:00,  3.56it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 10.85it/s]
[I 2025-05-09 12:58:37,652] Trial 20 finished with value: 0.3172044146519441 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.16033535841645047, 'learning_rate': 0.00015826921106148334, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.3875 | Val Loss: 0.3172


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.67it/s]


Epoch 1/10:
Train Loss: 2.6380 | Val Loss: 2.0722


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.64it/s]


Epoch 2/10:
Train Loss: 1.9492 | Val Loss: 1.6765


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.63it/s]


Epoch 3/10:
Train Loss: 1.5301 | Val Loss: 1.1444


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.58it/s]


Epoch 4/10:
Train Loss: 1.1228 | Val Loss: 0.8099


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.53it/s]


Epoch 5/10:
Train Loss: 0.8687 | Val Loss: 0.6213


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 6/10:
Train Loss: 0.6858 | Val Loss: 0.5627


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.49it/s]


Epoch 7/10:
Train Loss: 0.5684 | Val Loss: 0.4196


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.65it/s]


Epoch 8/10:
Train Loss: 0.4772 | Val Loss: 0.3746


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 9/10:
Train Loss: 0.4217 | Val Loss: 0.3827


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.53it/s]
[I 2025-05-09 12:59:56,222] Trial 21 finished with value: 0.2520057793993216 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.2725797596397792, 'learning_rate': 0.00017577802490062686, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.3644 | Val Loss: 0.2520


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 1/10:
Train Loss: 2.6338 | Val Loss: 2.0801


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.48it/s]


Epoch 2/10:
Train Loss: 1.9811 | Val Loss: 1.7838


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.48it/s]


Epoch 3/10:
Train Loss: 1.6318 | Val Loss: 1.2800


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.40it/s]


Epoch 4/10:
Train Loss: 1.2358 | Val Loss: 0.9143


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.61it/s]


Epoch 5/10:
Train Loss: 0.9657 | Val Loss: 0.6962


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.61it/s]


Epoch 6/10:
Train Loss: 0.7728 | Val Loss: 0.5523


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.68it/s]


Epoch 7/10:
Train Loss: 0.6358 | Val Loss: 0.4657


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.53it/s]


Epoch 8/10:
Train Loss: 0.5367 | Val Loss: 0.3790


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.65it/s]


Epoch 9/10:
Train Loss: 0.4578 | Val Loss: 0.3330


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.73it/s]
[I 2025-05-09 13:01:14,845] Trial 22 finished with value: 0.3110747371728604 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.2909354220270307, 'learning_rate': 0.0001665307215199937, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.3925 | Val Loss: 0.3111


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.47it/s]


Epoch 1/10:
Train Loss: 2.5765 | Val Loss: 2.0358


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.58it/s]


Epoch 2/10:
Train Loss: 1.9120 | Val Loss: 1.5905


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.49it/s]


Epoch 3/10:
Train Loss: 1.4343 | Val Loss: 1.0615


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.66it/s]


Epoch 4/10:
Train Loss: 1.0531 | Val Loss: 0.7637


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.57it/s]


Epoch 5/10:
Train Loss: 0.7992 | Val Loss: 0.5719


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.67it/s]


Epoch 6/10:
Train Loss: 0.6063 | Val Loss: 0.4343


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.56it/s]


Epoch 7/10:
Train Loss: 0.4951 | Val Loss: 0.3925


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 8/10:
Train Loss: 0.4204 | Val Loss: 0.3204


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.62it/s]


Epoch 9/10:
Train Loss: 0.3676 | Val Loss: 0.2425


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.34it/s]
[I 2025-05-09 13:02:33,463] Trial 23 finished with value: 0.2424612962282621 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.25014478913059135, 'learning_rate': 0.00019194865200249675, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.3146 | Val Loss: 0.2489


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.90it/s]


Epoch 1/10:
Train Loss: 3.1232 | Val Loss: 2.9891


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.85it/s]


Epoch 2/10:
Train Loss: 2.9910 | Val Loss: 2.9654


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.88it/s]


Epoch 3/10:
Train Loss: 2.6858 | Val Loss: 2.3328


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.86it/s]


Epoch 4/10:
Train Loss: 2.1899 | Val Loss: 2.1009


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.86it/s]


Epoch 5/10:
Train Loss: 1.9787 | Val Loss: 1.8899


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.87it/s]


Epoch 6/10:
Train Loss: 1.7925 | Val Loss: 1.6020


Training: 100%|██████████| 50/50 [00:11<00:00,  4.25it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.87it/s]


Epoch 7/10:
Train Loss: 1.4218 | Val Loss: 1.1488


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.91it/s]


Epoch 8/10:
Train Loss: 1.0800 | Val Loss: 0.8068


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.89it/s]


Epoch 9/10:
Train Loss: 0.8413 | Val Loss: 0.6614


Training: 100%|██████████| 50/50 [00:11<00:00,  4.26it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 12.91it/s]
[I 2025-05-09 13:04:41,536] Trial 24 finished with value: 0.5862870239294492 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.254993453117583, 'learning_rate': 0.0002517665702142728, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.6839 | Val Loss: 0.5863


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.55it/s]


Epoch 1/10:
Train Loss: 2.6587 | Val Loss: 2.1292


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 2/10:
Train Loss: 2.0363 | Val Loss: 1.8713


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 3/10:
Train Loss: 1.7892 | Val Loss: 1.5209


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.63it/s]


Epoch 4/10:
Train Loss: 1.4631 | Val Loss: 1.1192


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.52it/s]


Epoch 5/10:
Train Loss: 1.1656 | Val Loss: 0.8376


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.61it/s]


Epoch 6/10:
Train Loss: 0.9484 | Val Loss: 0.6562


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.65it/s]


Epoch 7/10:
Train Loss: 0.7941 | Val Loss: 0.6821


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.61it/s]


Epoch 8/10:
Train Loss: 0.6762 | Val Loss: 0.4943


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.72it/s]


Epoch 9/10:
Train Loss: 0.5932 | Val Loss: 0.4502


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.46it/s]
[I 2025-05-09 13:06:00,140] Trial 25 finished with value: 0.38724776070851546 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.23830973571725467, 'learning_rate': 0.00010082896885469244, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.5141 | Val Loss: 0.3872


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.50it/s]


Epoch 1/10:
Train Loss: 2.6834 | Val Loss: 2.1104


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.66it/s]


Epoch 2/10:
Train Loss: 2.0056 | Val Loss: 1.7507


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.61it/s]


Epoch 3/10:
Train Loss: 1.6395 | Val Loss: 1.3327


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.46it/s]


Epoch 4/10:
Train Loss: 1.2393 | Val Loss: 0.8734


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.72it/s]


Epoch 5/10:
Train Loss: 0.9473 | Val Loss: 0.6417


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 6/10:
Train Loss: 0.7754 | Val Loss: 0.5356


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.66it/s]


Epoch 7/10:
Train Loss: 0.6463 | Val Loss: 0.4830


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.56it/s]


Epoch 8/10:
Train Loss: 0.5489 | Val Loss: 0.4268


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 9/10:
Train Loss: 0.4931 | Val Loss: 0.4287


Training: 100%|██████████| 50/50 [00:07<00:00,  6.92it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.60it/s]
[I 2025-05-09 13:07:18,787] Trial 26 finished with value: 0.31171252635809094 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.32106844667524154, 'learning_rate': 0.00019561788217916238, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.4163 | Val Loss: 0.3117


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.64it/s]


Epoch 1/10:
Train Loss: 2.7873 | Val Loss: 2.1880


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.25it/s]


Epoch 2/10:
Train Loss: 1.9883 | Val Loss: 1.5924


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 3/10:
Train Loss: 1.3971 | Val Loss: 0.9714


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.70it/s]


Epoch 4/10:
Train Loss: 0.9782 | Val Loss: 0.6699


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 5/10:
Train Loss: 0.7322 | Val Loss: 0.4953


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.69it/s]


Epoch 6/10:
Train Loss: 0.5872 | Val Loss: 0.5122


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.69it/s]


Epoch 7/10:
Train Loss: 0.4802 | Val Loss: 0.3851


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.48it/s]


Epoch 8/10:
Train Loss: 0.4307 | Val Loss: 0.3278


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.39it/s]


Epoch 9/10:
Train Loss: 0.3710 | Val Loss: 0.2998


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.17it/s]
[I 2025-05-09 13:08:37,606] Trial 27 finished with value: 0.24743523849890783 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.28723658177856626, 'learning_rate': 0.00031996108500632756, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.3348 | Val Loss: 0.2474


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.21it/s]


Epoch 1/10:
Train Loss: 2.8444 | Val Loss: 2.2623


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.43it/s]


Epoch 2/10:
Train Loss: 2.0803 | Val Loss: 1.7358


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 3/10:
Train Loss: 1.4857 | Val Loss: 1.1382


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.22it/s]


Epoch 4/10:
Train Loss: 0.9748 | Val Loss: 0.7038


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.53it/s]


Epoch 5/10:
Train Loss: 0.7066 | Val Loss: 0.5257


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.58it/s]


Epoch 6/10:
Train Loss: 0.5606 | Val Loss: 0.4137


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.53it/s]


Epoch 7/10:
Train Loss: 0.4418 | Val Loss: 0.3734


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.73it/s]


Epoch 8/10:
Train Loss: 0.3820 | Val Loss: 0.3111


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.63it/s]


Epoch 9/10:
Train Loss: 0.3310 | Val Loss: 0.3358


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.46it/s]
[I 2025-05-09 13:09:56,276] Trial 28 finished with value: 0.27027979493141174 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.24474597421860195, 'learning_rate': 0.0003319605811782799, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.2887 | Val Loss: 0.2703


Training: 100%|██████████| 50/50 [00:16<00:00,  3.08it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.34it/s]


Epoch 1/10:
Train Loss: 3.1172 | Val Loss: 3.0015


Training: 100%|██████████| 50/50 [00:16<00:00,  3.08it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.28it/s]


Epoch 2/10:
Train Loss: 3.0026 | Val Loss: 2.9893


Training: 100%|██████████| 50/50 [00:16<00:00,  3.08it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.37it/s]


Epoch 3/10:
Train Loss: 2.9958 | Val Loss: 2.9953


Training: 100%|██████████| 50/50 [00:16<00:00,  3.08it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.24it/s]


Epoch 4/10:
Train Loss: 2.9897 | Val Loss: 2.9842


Training: 100%|██████████| 50/50 [00:16<00:00,  3.08it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.34it/s]


Epoch 5/10:
Train Loss: 2.9867 | Val Loss: 2.9867


Training: 100%|██████████| 50/50 [00:16<00:00,  3.08it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.28it/s]


Epoch 6/10:
Train Loss: 2.9817 | Val Loss: 2.9993


Training: 100%|██████████| 50/50 [00:16<00:00,  3.08it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  9.29it/s]
[I 2025-05-09 13:12:00,406] Trial 29 finished with value: 2.984180285380437 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.2877696576899604, 'learning_rate': 0.0006201148504505569, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 7/10:
Train Loss: 2.9680 | Val Loss: 3.0746
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.65it/s]


Epoch 1/10:
Train Loss: 2.7957 | Val Loss: 2.1626


Training: 100%|██████████| 50/50 [00:17<00:00,  2.84it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.71it/s]


Epoch 2/10:
Train Loss: 2.0437 | Val Loss: 1.8856


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.65it/s]


Epoch 3/10:
Train Loss: 1.8580 | Val Loss: 1.7500


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.67it/s]


Epoch 4/10:
Train Loss: 1.6619 | Val Loss: 1.4505


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.68it/s]


Epoch 5/10:
Train Loss: 1.3677 | Val Loss: 1.1456


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.70it/s]


Epoch 6/10:
Train Loss: 1.1018 | Val Loss: 0.8611


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.69it/s]


Epoch 7/10:
Train Loss: 0.8764 | Val Loss: 0.7037


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.67it/s]


Epoch 8/10:
Train Loss: 0.7231 | Val Loss: 0.5828


Training: 100%|██████████| 50/50 [00:17<00:00,  2.84it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.67it/s]


Epoch 9/10:
Train Loss: 0.6064 | Val Loss: 0.5068


Training: 100%|██████████| 50/50 [00:17<00:00,  2.85it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00,  8.62it/s]
[I 2025-05-09 13:15:11,717] Trial 30 finished with value: 0.43426405351895553 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.1729275180828992, 'learning_rate': 0.00010909050850543081, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.5280 | Val Loss: 0.4343


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 1/10:
Train Loss: 2.6437 | Val Loss: 2.0838


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.44it/s]


Epoch 2/10:
Train Loss: 1.9761 | Val Loss: 1.7482


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 3/10:
Train Loss: 1.5814 | Val Loss: 1.2060


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.56it/s]


Epoch 4/10:
Train Loss: 1.1728 | Val Loss: 0.7722


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.53it/s]


Epoch 5/10:
Train Loss: 0.8998 | Val Loss: 0.6353


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.49it/s]


Epoch 6/10:
Train Loss: 0.7155 | Val Loss: 0.5334


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.50it/s]


Epoch 7/10:
Train Loss: 0.5816 | Val Loss: 0.4048


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.71it/s]


Epoch 8/10:
Train Loss: 0.4996 | Val Loss: 0.3661


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.62it/s]


Epoch 9/10:
Train Loss: 0.4342 | Val Loss: 0.4632


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.56it/s]
[I 2025-05-09 13:16:30,371] Trial 31 finished with value: 0.3160318972972723 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.2914214437170369, 'learning_rate': 0.00019070450722494597, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.3866 | Val Loss: 0.3160


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.62it/s]


Epoch 1/10:
Train Loss: 2.7477 | Val Loss: 2.1757


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.41it/s]


Epoch 2/10:
Train Loss: 1.9844 | Val Loss: 1.6590


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.68it/s]


Epoch 3/10:
Train Loss: 1.3898 | Val Loss: 0.9416


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.68it/s]


Epoch 4/10:
Train Loss: 0.9523 | Val Loss: 0.8917


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.52it/s]


Epoch 5/10:
Train Loss: 0.7235 | Val Loss: 0.5360


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.62it/s]


Epoch 6/10:
Train Loss: 0.5608 | Val Loss: 0.4106


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 7/10:
Train Loss: 0.4518 | Val Loss: 0.4101


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.67it/s]


Epoch 8/10:
Train Loss: 0.3906 | Val Loss: 0.3590


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 9/10:
Train Loss: 0.3498 | Val Loss: 0.2786


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.50it/s]
[I 2025-05-09 13:17:49,003] Trial 32 finished with value: 0.25569619696873885 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.2624760371456977, 'learning_rate': 0.0003011065279815257, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.3071 | Val Loss: 0.2557


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 1/10:
Train Loss: 2.6310 | Val Loss: 2.1193


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 2/10:
Train Loss: 2.0421 | Val Loss: 1.8958


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.46it/s]


Epoch 3/10:
Train Loss: 1.7863 | Val Loss: 1.4702


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.55it/s]


Epoch 4/10:
Train Loss: 1.4694 | Val Loss: 1.0830


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.63it/s]


Epoch 5/10:
Train Loss: 1.1756 | Val Loss: 0.8080


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.25it/s]


Epoch 6/10:
Train Loss: 0.9491 | Val Loss: 0.6563


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.56it/s]


Epoch 7/10:
Train Loss: 0.8028 | Val Loss: 0.5230


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.20it/s]


Epoch 8/10:
Train Loss: 0.6848 | Val Loss: 0.4653


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.42it/s]


Epoch 9/10:
Train Loss: 0.6143 | Val Loss: 0.4751


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]
[I 2025-05-09 13:19:07,726] Trial 33 finished with value: 0.40406561814821684 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.32963948898939077, 'learning_rate': 0.00014638606911782692, 'batch_size': 32}. Best is trial 18 with value: 0.23081068350718573.


Epoch 10/10:
Train Loss: 0.5408 | Val Loss: 0.4041


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.57it/s]


Epoch 1/10:
Train Loss: 2.6241 | Val Loss: 2.0690


Training: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 2/10:
Train Loss: 1.8191 | Val Loss: 1.3931


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.68it/s]


Epoch 3/10:
Train Loss: 1.2227 | Val Loss: 0.8492


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 4/10:
Train Loss: 0.8111 | Val Loss: 0.6027


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.52it/s]


Epoch 5/10:
Train Loss: 0.6048 | Val Loss: 0.4578


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.64it/s]


Epoch 6/10:
Train Loss: 0.4719 | Val Loss: 0.3790


Training: 100%|██████████| 50/50 [00:07<00:00,  6.92it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 7/10:
Train Loss: 0.3782 | Val Loss: 0.3189


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


Epoch 8/10:
Train Loss: 0.3258 | Val Loss: 0.2942


Training: 100%|██████████| 50/50 [00:07<00:00,  6.91it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.27it/s]


Epoch 9/10:
Train Loss: 0.2856 | Val Loss: 0.2817


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.52it/s]


Epoch 10/10:
Train Loss: 0.2348 | Val Loss: 0.2122


[I 2025-05-09 13:20:26,738] Trial 34 finished with value: 0.21216037754829115 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.20169659103222026, 'learning_rate': 0.00023384807089141854, 'batch_size': 32}. Best is trial 34 with value: 0.21216037754829115.


New best model found! Val Loss: 0.2122
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.20169659103222026, 'learning_rate': 0.00023384807089141854, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:09<00:00,  5.02it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.17it/s]


Epoch 1/10:
Train Loss: 3.0982 | Val Loss: 2.9826


Training: 100%|██████████| 50/50 [00:09<00:00,  5.02it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.29it/s]


Epoch 2/10:
Train Loss: 2.9899 | Val Loss: 2.9833


Training: 100%|██████████| 50/50 [00:09<00:00,  5.01it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.15it/s]


Epoch 3/10:
Train Loss: 2.9870 | Val Loss: 2.9858


Training: 100%|██████████| 50/50 [00:09<00:00,  5.01it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.20it/s]


Epoch 4/10:
Train Loss: 2.9846 | Val Loss: 2.9782


Training: 100%|██████████| 50/50 [00:09<00:00,  5.02it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.23it/s]


Epoch 5/10:
Train Loss: 2.9822 | Val Loss: 2.9879


Training: 100%|██████████| 50/50 [00:09<00:00,  5.01it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.24it/s]


Epoch 6/10:
Train Loss: 2.9837 | Val Loss: 2.9865


Training: 100%|██████████| 50/50 [00:09<00:00,  5.01it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 15.24it/s]
[I 2025-05-09 13:21:43,033] Trial 35 finished with value: 2.9781792163848877 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.14727567843964387, 'learning_rate': 0.0003391744396867105, 'batch_size': 32}. Best is trial 34 with value: 0.21216037754829115.


Epoch 7/10:
Train Loss: 2.9767 | Val Loss: 3.0352
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:07<00:00,  6.85it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.43it/s]


Epoch 1/10:
Train Loss: 3.1151 | Val Loss: 3.0133


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.66it/s]


Epoch 2/10:
Train Loss: 2.9971 | Val Loss: 2.9898


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.59it/s]


Epoch 3/10:
Train Loss: 2.9907 | Val Loss: 2.9909


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.58it/s]


Epoch 4/10:
Train Loss: 2.9907 | Val Loss: 2.9844


Training: 100%|██████████| 50/50 [00:07<00:00,  6.95it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.54it/s]


Epoch 5/10:
Train Loss: 2.9856 | Val Loss: 2.9864


Training: 100%|██████████| 50/50 [00:07<00:00,  6.91it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.26it/s]


Epoch 6/10:
Train Loss: 2.9862 | Val Loss: 2.9882


Training: 100%|██████████| 50/50 [00:07<00:00,  6.92it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.35it/s]


Epoch 7/10:
Train Loss: 2.9830 | Val Loss: 2.9810


Training: 100%|██████████| 50/50 [00:07<00:00,  6.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.56it/s]


Epoch 8/10:
Train Loss: 2.9751 | Val Loss: 3.0808


Training: 100%|██████████| 50/50 [00:07<00:00,  6.92it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.40it/s]


Epoch 9/10:
Train Loss: 2.9574 | Val Loss: 3.2235


Training: 100%|██████████| 50/50 [00:07<00:00,  6.93it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 20.45it/s]
[I 2025-05-09 13:23:01,943] Trial 36 finished with value: 2.9809853847210226 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.21643625734869976, 'learning_rate': 0.0006264877591470762, 'batch_size': 32}. Best is trial 34 with value: 0.21216037754829115.


Epoch 10/10:
Train Loss: 2.9488 | Val Loss: 3.2350
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:08<00:00,  6.13it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.30it/s]


Epoch 1/10:
Train Loss: 2.7046 | Val Loss: 2.1993


Training: 100%|██████████| 50/50 [00:08<00:00,  6.12it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.14it/s]


Epoch 2/10:
Train Loss: 2.0311 | Val Loss: 1.7484


Training: 100%|██████████| 50/50 [00:08<00:00,  6.12it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.26it/s]


Epoch 3/10:
Train Loss: 1.4221 | Val Loss: 0.9034


Training: 100%|██████████| 50/50 [00:08<00:00,  6.12it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.11it/s]


Epoch 4/10:
Train Loss: 0.9251 | Val Loss: 0.6225


Training: 100%|██████████| 50/50 [00:08<00:00,  6.12it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.28it/s]


Epoch 5/10:
Train Loss: 0.6630 | Val Loss: 0.4635


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.38it/s]


Epoch 6/10:
Train Loss: 0.5143 | Val Loss: 0.4030


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.28it/s]


Epoch 7/10:
Train Loss: 0.4297 | Val Loss: 0.2900


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.29it/s]


Epoch 8/10:
Train Loss: 0.3624 | Val Loss: 0.3610


Training: 100%|██████████| 50/50 [00:08<00:00,  6.13it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.29it/s]


Epoch 9/10:
Train Loss: 0.3078 | Val Loss: 0.2473


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.35it/s]
[I 2025-05-09 13:24:30,992] Trial 37 finished with value: 0.2473479566665796 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.20054900043594653, 'learning_rate': 0.0002400129798287666, 'batch_size': 32}. Best is trial 34 with value: 0.21216037754829115.


Epoch 10/10:
Train Loss: 0.2729 | Val Loss: 0.2542


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.60it/s]


Epoch 1/10:
Train Loss: 2.8558 | Val Loss: 2.3593


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.56it/s]


Epoch 2/10:
Train Loss: 2.1963 | Val Loss: 2.0206


Training: 100%|██████████| 50/50 [00:13<00:00,  3.81it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.59it/s]


Epoch 3/10:
Train Loss: 1.9359 | Val Loss: 1.8391


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.60it/s]


Epoch 4/10:
Train Loss: 1.5646 | Val Loss: 1.2215


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.58it/s]


Epoch 5/10:
Train Loss: 1.0481 | Val Loss: 0.7889


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.64it/s]


Epoch 6/10:
Train Loss: 0.7776 | Val Loss: 0.6220


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.61it/s]


Epoch 7/10:
Train Loss: 0.6171 | Val Loss: 0.4862


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.62it/s]


Epoch 8/10:
Train Loss: 0.4854 | Val Loss: 0.3606


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.59it/s]


Epoch 9/10:
Train Loss: 0.4150 | Val Loss: 0.3136


Training: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.63it/s]
[I 2025-05-09 13:26:53,591] Trial 38 finished with value: 0.31360696256160736 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.20038513179144704, 'learning_rate': 0.0002432034384330036, 'batch_size': 32}. Best is trial 34 with value: 0.21216037754829115.


Epoch 10/10:
Train Loss: 0.3552 | Val Loss: 0.3884


Training: 100%|██████████| 50/50 [00:04<00:00, 11.97it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 35.86it/s]


Epoch 1/10:
Train Loss: 3.1272 | Val Loss: 2.9861


Training: 100%|██████████| 50/50 [00:04<00:00, 11.84it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 34.72it/s]


Epoch 2/10:
Train Loss: 2.9900 | Val Loss: 2.9863


Training: 100%|██████████| 50/50 [00:04<00:00, 12.12it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.42it/s]


Epoch 3/10:
Train Loss: 2.9840 | Val Loss: 2.9858


Training: 100%|██████████| 50/50 [00:04<00:00, 12.13it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 35.49it/s]


Epoch 4/10:
Train Loss: 2.9838 | Val Loss: 2.9812


Training: 100%|██████████| 50/50 [00:04<00:00, 11.94it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 35.94it/s]


Epoch 5/10:
Train Loss: 2.9819 | Val Loss: 2.9829


Training: 100%|██████████| 50/50 [00:04<00:00, 12.10it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.29it/s]


Epoch 6/10:
Train Loss: 2.9721 | Val Loss: 3.0868


Training: 100%|██████████| 50/50 [00:04<00:00, 11.99it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 36.08it/s]
[I 2025-05-09 13:27:25,443] Trial 39 finished with value: 2.981225472230178 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.1877534137157263, 'learning_rate': 0.0010894777880488234, 'batch_size': 32}. Best is trial 34 with value: 0.21216037754829115.


Epoch 7/10:
Train Loss: 2.9665 | Val Loss: 3.1241
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.07it/s]


Epoch 1/10:
Train Loss: 3.1597 | Val Loss: 3.0105


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.04it/s]


Epoch 2/10:
Train Loss: 3.0130 | Val Loss: 3.0161


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.05it/s]


Epoch 3/10:
Train Loss: 2.9963 | Val Loss: 2.9943


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 13.99it/s]


Epoch 4/10:
Train Loss: 2.9871 | Val Loss: 2.9973


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.03it/s]


Epoch 5/10:
Train Loss: 2.9792 | Val Loss: 2.9837


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.09it/s]


Epoch 6/10:
Train Loss: 2.9780 | Val Loss: 2.9929


Training: 100%|██████████| 50/50 [00:10<00:00,  4.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.04it/s]


Epoch 7/10:
Train Loss: 2.9765 | Val Loss: 2.9855


Training: 100%|██████████| 50/50 [00:10<00:00,  4.66it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 14.02it/s]
[I 2025-05-09 13:28:59,138] Trial 40 finished with value: 2.9837103256812463 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.22602115259340075, 'learning_rate': 0.004990508190538404, 'batch_size': 32}. Best is trial 34 with value: 0.21216037754829115.


Epoch 8/10:
Train Loss: 2.9751 | Val Loss: 2.9867
Early stopping triggered!


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.37it/s]


Epoch 1/10:
Train Loss: 2.5822 | Val Loss: 2.0814


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.27it/s]


Epoch 2/10:
Train Loss: 1.8738 | Val Loss: 1.5641


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.25it/s]


Epoch 3/10:
Train Loss: 1.3045 | Val Loss: 0.9265


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.27it/s]


Epoch 4/10:
Train Loss: 0.8561 | Val Loss: 0.5495


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.35it/s]


Epoch 5/10:
Train Loss: 0.6153 | Val Loss: 0.4804


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.32it/s]


Epoch 6/10:
Train Loss: 0.4840 | Val Loss: 0.3389


Training: 100%|██████████| 50/50 [00:08<00:00,  6.16it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.24it/s]


Epoch 7/10:
Train Loss: 0.3824 | Val Loss: 0.3118


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.26it/s]


Epoch 8/10:
Train Loss: 0.3402 | Val Loss: 0.2603


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.30it/s]


Epoch 9/10:
Train Loss: 0.2923 | Val Loss: 0.2032


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.32it/s]


Epoch 10/10:
Train Loss: 0.2478 | Val Loss: 0.2123


[I 2025-05-09 13:30:28,367] Trial 41 finished with value: 0.20317339438658494 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.12866753791718177, 'learning_rate': 0.00013900773134505292, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


New best model found! Val Loss: 0.2032
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.12866753791718177, 'learning_rate': 0.00013900773134505292, 'batch_size': 32}


Training: 100%|██████████| 50/50 [00:08<00:00,  6.16it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.37it/s]


Epoch 1/10:
Train Loss: 2.5669 | Val Loss: 2.0536


Training: 100%|██████████| 50/50 [00:08<00:00,  6.13it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.33it/s]


Epoch 2/10:
Train Loss: 1.8811 | Val Loss: 1.5891


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.41it/s]


Epoch 3/10:
Train Loss: 1.3807 | Val Loss: 1.0420


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.32it/s]


Epoch 4/10:
Train Loss: 0.9938 | Val Loss: 0.7209


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.31it/s]


Epoch 5/10:
Train Loss: 0.7281 | Val Loss: 0.5277


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.34it/s]


Epoch 6/10:
Train Loss: 0.5741 | Val Loss: 0.4076


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.39it/s]


Epoch 7/10:
Train Loss: 0.4774 | Val Loss: 0.3683


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.25it/s]


Epoch 8/10:
Train Loss: 0.3989 | Val Loss: 0.3679


Training: 100%|██████████| 50/50 [00:08<00:00,  6.16it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.25it/s]


Epoch 9/10:
Train Loss: 0.3412 | Val Loss: 0.2697


Training: 100%|██████████| 50/50 [00:08<00:00,  6.16it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.39it/s]
[I 2025-05-09 13:31:57,176] Trial 42 finished with value: 0.2326850209098596 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.14098179110017833, 'learning_rate': 0.00012077928034182723, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 0.3014 | Val Loss: 0.2327


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.29it/s]


Epoch 1/10:
Train Loss: 2.5398 | Val Loss: 2.0365


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.26it/s]


Epoch 2/10:
Train Loss: 1.8784 | Val Loss: 1.5790


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.33it/s]


Epoch 3/10:
Train Loss: 1.3420 | Val Loss: 0.9517


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.33it/s]


Epoch 4/10:
Train Loss: 0.9309 | Val Loss: 0.6603


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.45it/s]


Epoch 5/10:
Train Loss: 0.6829 | Val Loss: 0.4938


Training: 100%|██████████| 50/50 [00:08<00:00,  6.16it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.32it/s]


Epoch 6/10:
Train Loss: 0.5402 | Val Loss: 0.3964


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.29it/s]


Epoch 7/10:
Train Loss: 0.4280 | Val Loss: 0.3570


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.41it/s]


Epoch 8/10:
Train Loss: 0.3592 | Val Loss: 0.2612


Training: 100%|██████████| 50/50 [00:08<00:00,  6.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.38it/s]


Epoch 9/10:
Train Loss: 0.3105 | Val Loss: 0.2578


Training: 100%|██████████| 50/50 [00:08<00:00,  6.14it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.37it/s]
[I 2025-05-09 13:33:25,994] Trial 43 finished with value: 0.23372478439257696 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.1324914188960561, 'learning_rate': 0.00012583910720035825, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 0.2596 | Val Loss: 0.2337


Training: 100%|██████████| 50/50 [00:03<00:00, 13.68it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.66it/s]


Epoch 1/10:
Train Loss: 3.1599 | Val Loss: 2.6481


Training: 100%|██████████| 50/50 [00:03<00:00, 13.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.75it/s]


Epoch 2/10:
Train Loss: 2.4988 | Val Loss: 2.3203


Training: 100%|██████████| 50/50 [00:03<00:00, 13.73it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.30it/s]


Epoch 3/10:
Train Loss: 2.2673 | Val Loss: 2.1685


Training: 100%|██████████| 50/50 [00:03<00:00, 13.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.18it/s]


Epoch 4/10:
Train Loss: 2.1358 | Val Loss: 2.0658


Training: 100%|██████████| 50/50 [00:03<00:00, 13.69it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.52it/s]


Epoch 5/10:
Train Loss: 2.0359 | Val Loss: 1.9485


Training: 100%|██████████| 50/50 [00:03<00:00, 13.69it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.62it/s]


Epoch 6/10:
Train Loss: 1.9473 | Val Loss: 1.8339


Training: 100%|██████████| 50/50 [00:03<00:00, 13.67it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.76it/s]


Epoch 7/10:
Train Loss: 1.8238 | Val Loss: 1.6668


Training: 100%|██████████| 50/50 [00:03<00:00, 13.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.61it/s]


Epoch 8/10:
Train Loss: 1.6830 | Val Loss: 1.5079


Training: 100%|██████████| 50/50 [00:03<00:00, 13.71it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.66it/s]


Epoch 9/10:
Train Loss: 1.5615 | Val Loss: 1.3773


Training: 100%|██████████| 50/50 [00:03<00:00, 13.70it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 37.73it/s]
[I 2025-05-09 13:34:06,096] Trial 44 finished with value: 1.270866559101985 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.13195475645312726, 'learning_rate': 0.00012852948673462822, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 1.4557 | Val Loss: 1.2709


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.23it/s]


Epoch 1/10:
Train Loss: 2.8746 | Val Loss: 2.2610


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 2/10:
Train Loss: 2.1176 | Val Loss: 1.9315


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.14it/s]


Epoch 3/10:
Train Loss: 1.7109 | Val Loss: 1.4092


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 4/10:
Train Loss: 1.1512 | Val Loss: 0.8388


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 5/10:
Train Loss: 0.8112 | Val Loss: 0.6062


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.18it/s]


Epoch 6/10:
Train Loss: 0.6031 | Val Loss: 0.5992


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 7/10:
Train Loss: 0.4643 | Val Loss: 0.3352


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 8/10:
Train Loss: 0.3849 | Val Loss: 0.3158


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 9/10:
Train Loss: 0.3183 | Val Loss: 0.2815


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.20it/s]
[I 2025-05-09 13:36:35,839] Trial 45 finished with value: 0.21736416965723038 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.10359883718228224, 'learning_rate': 0.00011505183274086612, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 0.2700 | Val Loss: 0.2174


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.18it/s]


Epoch 1/10:
Train Loss: 2.7295 | Val Loss: 2.1913


Training: 100%|██████████| 50/50 [00:13<00:00,  3.62it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.18it/s]


Epoch 2/10:
Train Loss: 2.0311 | Val Loss: 1.8218


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 3/10:
Train Loss: 1.6001 | Val Loss: 1.2169


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 4/10:
Train Loss: 1.0799 | Val Loss: 0.7887


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.18it/s]


Epoch 5/10:
Train Loss: 0.7648 | Val Loss: 0.5690


Training: 100%|██████████| 50/50 [00:13<00:00,  3.63it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.15it/s]


Epoch 6/10:
Train Loss: 0.5794 | Val Loss: 0.4928


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.17it/s]


Epoch 7/10:
Train Loss: 0.4735 | Val Loss: 0.3935


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 8/10:
Train Loss: 0.3860 | Val Loss: 0.2949


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 9/10:
Train Loss: 0.3199 | Val Loss: 0.2671


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.18it/s]
[I 2025-05-09 13:39:05,705] Trial 46 finished with value: 0.24729947975048652 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.10458521880899309, 'learning_rate': 0.00010024173417325842, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 0.2729 | Val Loss: 0.2473


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.22it/s]


Epoch 1/10:
Train Loss: 2.8332 | Val Loss: 2.3119


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.17it/s]


Epoch 2/10:
Train Loss: 2.1406 | Val Loss: 1.9901


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 3/10:
Train Loss: 1.8221 | Val Loss: 1.4835


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 4/10:
Train Loss: 1.2310 | Val Loss: 0.8961


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.18it/s]


Epoch 5/10:
Train Loss: 0.8357 | Val Loss: 0.6438


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 6/10:
Train Loss: 0.6242 | Val Loss: 0.4200


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.15it/s]


Epoch 7/10:
Train Loss: 0.5530 | Val Loss: 0.6039


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 8/10:
Train Loss: 0.4220 | Val Loss: 0.3585


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 9/10:
Train Loss: 0.3534 | Val Loss: 0.2915


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.20it/s]
[I 2025-05-09 13:41:35,480] Trial 47 finished with value: 0.2644098951266362 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.1242015692671329, 'learning_rate': 0.00013162510926539403, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 0.2988 | Val Loss: 0.2644


Training: 100%|██████████| 50/50 [00:05<00:00,  9.78it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.58it/s]


Epoch 1/10:
Train Loss: 3.1010 | Val Loss: 2.5983


Training: 100%|██████████| 50/50 [00:04<00:00, 10.04it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 31.01it/s]


Epoch 2/10:
Train Loss: 2.4522 | Val Loss: 2.3344


Training: 100%|██████████| 50/50 [00:04<00:00, 10.17it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.25it/s]


Epoch 3/10:
Train Loss: 2.2387 | Val Loss: 2.1460


Training: 100%|██████████| 50/50 [00:05<00:00,  9.85it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 31.03it/s]


Epoch 4/10:
Train Loss: 2.0848 | Val Loss: 2.0297


Training: 100%|██████████| 50/50 [00:04<00:00, 10.08it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.71it/s]


Epoch 5/10:
Train Loss: 1.9727 | Val Loss: 1.8742


Training: 100%|██████████| 50/50 [00:05<00:00,  9.92it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.96it/s]


Epoch 6/10:
Train Loss: 1.8054 | Val Loss: 1.6621


Training: 100%|██████████| 50/50 [00:04<00:00, 10.15it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 31.12it/s]


Epoch 7/10:
Train Loss: 1.6294 | Val Loss: 1.4451


Training: 100%|██████████| 50/50 [00:05<00:00,  9.83it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.59it/s]


Epoch 8/10:
Train Loss: 1.4477 | Val Loss: 1.2532


Training: 100%|██████████| 50/50 [00:05<00:00, 10.00it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 31.01it/s]


Epoch 9/10:
Train Loss: 1.2772 | Val Loss: 1.0644


Training: 100%|██████████| 50/50 [00:05<00:00,  9.91it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 30.64it/s]
[I 2025-05-09 13:42:30,072] Trial 48 finished with value: 0.8957875325129583 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.10164551851479525, 'learning_rate': 0.00016663918544712513, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 1.1326 | Val Loss: 0.8958


Training: 100%|██████████| 50/50 [00:13<00:00,  3.62it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 1/10:
Train Loss: 2.8814 | Val Loss: 2.3068


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.20it/s]


Epoch 2/10:
Train Loss: 2.1452 | Val Loss: 2.0144


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 3/10:
Train Loss: 1.8865 | Val Loss: 1.6981


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.15it/s]


Epoch 4/10:
Train Loss: 1.4440 | Val Loss: 1.0598


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.17it/s]


Epoch 5/10:
Train Loss: 1.0002 | Val Loss: 0.7250


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.21it/s]


Epoch 6/10:
Train Loss: 0.7243 | Val Loss: 0.5867


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.22it/s]


Epoch 7/10:
Train Loss: 0.5900 | Val Loss: 0.4101


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]


Epoch 8/10:
Train Loss: 0.4634 | Val Loss: 0.4536


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.18it/s]


Epoch 9/10:
Train Loss: 0.3812 | Val Loss: 0.2642


Training: 100%|██████████| 50/50 [00:13<00:00,  3.64it/s]
Evaluating: 100%|██████████| 13/13 [00:01<00:00, 11.19it/s]
[I 2025-05-09 13:44:59,913] Trial 49 finished with value: 0.2641711773780676 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.15425304090465908, 'learning_rate': 0.00011711466020470574, 'batch_size': 32}. Best is trial 41 with value: 0.20317339438658494.


Epoch 10/10:
Train Loss: 0.3442 | Val Loss: 0.2644

Best trial:
  Validation Loss: 0.2032
  Params: 
    d_model: 512
    num_heads: 8
    num_layers: 6
    d_ff: 1024
    dropout: 0.12866753791718177
    learning_rate: 0.00013900773134505292
    batch_size: 32


Evaluating: 100%|██████████| 50/50 [00:02<00:00, 17.72it/s]
Evaluating: 100%|██████████| 13/13 [00:00<00:00, 18.39it/s]



Final Evaluation:
Train Loss: 0.1424 | Val Loss: 0.2123
Train Accuracy: 0.9509 | Val Accuracy: 0.9322

Test Decryptions:
Input: 'Please decrypt the following using Caesar cipher: gfbs' | Output: 'UZIRK JURKED BL MA SCKNIMP JUSIN JUG JOM.' | Expected: 'fear' | ✗
Input: 'Please decrypt the following using Caesar cipher: dpnqvufs' | Output: 'UZIRK JUSKED BL MA SCKNIMP JUST JOM JUSKIZ.' | Expected: 'computer' | ✗
Input: 'Please decrypt the following using Caesar cipher:xibu' | Output: 'UZIRK JUSKED BL MA SCKNIMP JUSING JUZIZ.' | Expected: 'what' | ✗
