In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Set random seeds for reproducibility: PyTorch's RNG and Python's built-in
# `random` module are both seeded with the same fixed value.
torch.manual_seed(42)
random.seed(42)

# Device configuration: pick the GPU when CUDA is available, else the CPU.
# `device` is a module-level global that later code in this file reads.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Vocabulary Class
class Vocabulary:
    """Character-level vocabulary: four special tokens plus ``string.printable``.

    Ids 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``;
    every character of ``string.printable`` follows in order starting at id 4.
    """

    def __init__(self):
        # Ids of the special tokens; they match the positions used in
        # _build_vocab below.
        self.pad_token = 0
        self.sos_token = 1
        self.eos_token = 2
        self.unk_token = 3
        self.char2idx = {}
        self.idx2char = {}
        self._build_vocab()

    def _build_vocab(self):
        """Fill ``char2idx`` and its inverse ``idx2char``."""
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + list(string.printable)
        self.char2idx = {symbol: position for position, symbol in enumerate(symbols)}
        self.idx2char = dict(enumerate(symbols))

    def __len__(self):
        """Return the number of entries (special tokens included)."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of ``text`` to its id; unknown characters map to ``unk_token``."""
        lookup = self.char2idx
        fallback = self.unk_token
        return [lookup.get(character, fallback) for character in text]

    def decode(self, indices):
        """Join the symbols for ``indices``, skipping PAD/SOS/EOS ids.

        Ids that are not in the vocabulary are rendered as ``'<UNK>'``.
        """
        skipped = {self.pad_token, self.sos_token, self.eos_token}
        pieces = []
        for index in indices:
            if index in skipped:
                continue
            pieces.append(self.idx2char.get(index, '<UNK>'))
        return ''.join(pieces)

# Data Preparation
def load_data(file_path, max_samples=1000):
    """Read (input, output) text pairs from a CSV file.

    The CSV must have ``Input`` and ``Output`` columns. Rows whose ``Output``
    is longer than 200 characters are discarded. If more than ``max_samples``
    rows remain, a random subset of exactly ``max_samples`` rows is drawn with
    a fixed ``random_state`` of 42.

    Returns:
        A pair ``(inputs, outputs)`` of equally long lists.
    """
    frame = pd.read_csv(file_path)

    # Keep only the rows whose target text is at most 200 characters long.
    short_enough = frame['Output'].str.len() <= 200
    frame = frame[short_enough]

    # Down-sample only when there is more data than requested.
    if len(frame) > max_samples:
        frame = frame.sample(n=max_samples, random_state=42)

    return frame['Input'].tolist(), frame['Output'].tolist()

# Dataset Class
class CipherDataset(data.Dataset):
    """Dataset of (input, output) text pairs encoded as fixed-length id tensors.

    Every text is converted with ``str()``, encoded with ``vocab.encode``,
    framed as ``<SOS> ... <EOS>`` and right-padded with ``vocab.pad_token`` to
    exactly ``max_length`` ids.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        """Store the parallel sequences, the vocabulary and the target length."""
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.inputs)

    def _encode_fixed(self, text):
        """Encode one text into a ``max_length``-long LongTensor."""
        # Truncate the text itself (not the framed sequence) so that an
        # over-long example still ends with <EOS> instead of losing it.
        room_for_text = max(self.max_length - 2, 0)
        token_ids = self.vocab.encode(str(text))[:room_for_text]

        framed = [self.vocab.sos_token] + token_ids + [self.vocab.eos_token]
        framed += [self.vocab.pad_token] * (self.max_length - len(framed))

        # The final slice only matters for max_length < 2.
        return torch.tensor(framed[:self.max_length], dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(input_tensor, output_tensor)`` for the pair at ``idx``."""
        return self._encode_fixed(self.inputs[idx]), self._encode_fixed(self.outputs[idx])

# Improved Model Architecture for Caesar Cipher
class CaesarTransformer(nn.Module):
    """Encoder-decoder transformer with learned positional embeddings.

    Args:
        src_vocab_size: Number of rows in the encoder token embedding.
        tgt_vocab_size: Number of rows in the decoder token embedding and the
            size of the output projection.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Feed-forward width inside each layer.
        max_seq_length: Number of positions in the positional embeddings.
        dropout: Dropout probability for the layers and the embeddings.
        pad_idx: Token id treated as padding in ``src`` (default 0, the value
            that was previously hard-coded).
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, pad_idx=0):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Simplified positional encoding (learned instead of fixed)
        self.encoder_pos = nn.Embedding(max_seq_length, d_model)
        self.decoder_pos = nn.Embedding(max_seq_length, d_model)

        # Encoder layers
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model, num_heads, d_ff, dropout, batch_first=True)
            for _ in range(num_layers)
        ])

        # Decoder layers
        self.decoder_layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model, num_heads, d_ff, dropout, batch_first=True)
            for _ in range(num_layers)
        ])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.max_seq_length = max_seq_length
        self.pad_idx = pad_idx

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``.

        ``src`` and ``tgt`` are batch-first tensors of token ids. Masks and
        position indices are created on the devices of the inputs, so the
        module does not rely on any global ``device`` variable.
        """
        # True where the source holds padding; used by the encoder and by the
        # decoder's cross-attention.
        src_mask = (src == self.pad_idx)
        # Causal mask: each target position may only attend to earlier ones.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        # Embed and add positional encoding
        src_pos = torch.arange(0, src.size(1), device=src.device).unsqueeze(0)
        tgt_pos = torch.arange(0, tgt.size(1), device=tgt.device).unsqueeze(0)

        src_embedded = self.dropout(self.encoder_embedding(src) + self.encoder_pos(src_pos))
        tgt_embedded = self.dropout(self.decoder_embedding(tgt) + self.decoder_pos(tgt_pos))

        # Encoder
        memory = src_embedded
        for layer in self.encoder_layers:
            memory = layer(memory, src_key_padding_mask=src_mask)

        # Decoder
        output = tgt_embedded
        for layer in self.decoder_layers:
            output = layer(output, memory, tgt_mask=tgt_mask, memory_key_padding_mask=src_mask)

        return self.fc(output)

# Training and Evaluation Functions (similar to your original)
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one teacher-forced training pass and return the mean batch loss.

    For every ``(src, tgt)`` batch the model is fed ``tgt`` without its last
    position and scored against ``tgt`` without its first position. Gradients
    are clipped to a norm of 1.0 before each optimizer step.
    """
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc="Training")
    for batch_src, batch_tgt in progress:
        batch_src = batch_src.to(device)
        batch_tgt = batch_tgt.to(device)
        # Shifted views: the decoder sees tokens [0, T-1) and predicts [1, T).
        decoder_input = batch_tgt[:, :-1]
        labels = batch_tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(batch_src, decoder_input)
        vocab_dim = logits.size(-1)
        batch_loss = criterion(
            logits.contiguous().view(-1, vocab_dim),
            labels.contiguous().view(-1),
        )
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Return the mean teacher-forced loss of ``model`` over ``val_loader``.

    The model is put in eval mode and no gradients are tracked. Each batch is
    scored exactly as in training: input ``tgt[:, :-1]``, labels ``tgt[:, 1:]``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        progress = tqdm(val_loader, desc="Evaluating")
        for batch_src, batch_tgt in progress:
            batch_src = batch_src.to(device)
            batch_tgt = batch_tgt.to(device)
            logits = model(batch_src, batch_tgt[:, :-1])
            labels = batch_tgt[:, 1:]
            vocab_dim = logits.size(-1)
            batch_loss = criterion(
                logits.contiguous().view(-1, vocab_dim),
                labels.contiguous().view(-1),
            )
            running_loss += batch_loss.item()
    return running_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return teacher-forced token accuracy over the non-padding positions.

    Predictions are the argmax of the model's logits for ``tgt[:, :-1]`` and
    are compared with ``tgt[:, 1:]``; positions whose label equals
    ``vocab.pad_token`` are ignored. Returns 0 when nothing was counted.
    """
    model.eval()
    hits, counted = 0, 0
    with torch.no_grad():
        for batch_src, batch_tgt in data_loader:
            batch_src = batch_src.to(device)
            batch_tgt = batch_tgt.to(device)
            labels = batch_tgt[:, 1:]
            guesses = model(batch_src, batch_tgt[:, :-1]).argmax(dim=-1)
            # Only positions holding a real (non-pad) label are scored.
            real_positions = labels != vocab.pad_token
            hits += ((guesses == labels) & real_positions).sum().item()
            counted += real_positions.sum().item()
    if counted > 0:
        return hits / counted
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train with early stopping and return the lowest validation loss seen.

    After each epoch the scheduler is stepped with the validation loss. A
    snapshot of the weights is kept whenever the validation loss improves;
    training stops once ``patience`` consecutive epochs fail to improve. The
    best snapshot (if any) is loaded back into ``model`` before returning.
    """
    lowest_val_loss = float('inf')
    stale_epochs = 0
    best_weights = None

    for epoch_idx in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        epoch_number = epoch_idx + 1
        print(f"Epoch {epoch_number}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < lowest_val_loss:
            # Improvement: remember the weights and reset the stall counter.
            lowest_val_loss = val_loss
            stale_epochs = 0
            best_weights = copy.deepcopy(model.state_dict())
            continue

        stale_epochs += 1
        if stale_epochs == patience:
            print("Early stopping triggered!")
            break

    if best_weights is not None:
        model.load_state_dict(best_weights)
    return lowest_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for ``text`` and return it as a string.

    ``text`` is converted with ``str()``, encoded, framed as ``<SOS> ... <EOS>``
    and padded to ``max_length``. Decoding starts from ``<SOS>`` and appends
    the argmax token of the last position until ``<EOS>`` is produced or
    ``max_length - 1`` steps have run. The generated ids are turned back into
    text with ``vocab.decode``.
    """
    model.eval()
    with torch.no_grad():
        # Truncate the text before framing so an over-long input still ends
        # with <EOS> rather than having it sliced off.
        body = vocab.encode(str(text))[:max(max_length - 2, 0)]
        source_ids = [vocab.sos_token] + body + [vocab.eos_token]
        source_ids += [vocab.pad_token] * (max_length - len(source_ids))
        # Batch of one, created directly on the target device.
        encoded = torch.tensor([source_ids[:max_length]], device=device)

        target = torch.tensor([[vocab.sos_token]], device=device)

        for _ in range(max_length - 1):
            output = model(encoded, target)
            # Greedy choice for the most recent position only.
            next_token = output.argmax(2)[:, -1].item()
            if next_token == vocab.eos_token:
                break
            step = torch.tensor([[next_token]], device=device)
            target = torch.cat([target, step], dim=1)

        # Plain Python ints are handed to the vocabulary for lookup.
        return vocab.decode(target[0].tolist())

# Global variables for Optuna: the best state dict, its validation loss and
# its hyperparameters across all trials run so far.
best_overall_model = None
best_overall_loss = float('inf')
best_config = None

# Optuna Objective Function
def objective(trial):
    """Train one model with trial-suggested hyperparameters; return its best val loss.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``vocab``,
    ``max_length`` and ``device``. When the trial beats the best loss so far,
    the module-level ``best_overall_*`` / ``best_config`` globals are updated
    and the weights are written to ``best_caesar_model.pth``.
    """
    global best_overall_model, best_overall_loss, best_config

    # Hyperparameters are suggested in a fixed order.
    config = {}
    categorical_space = (
        ("d_model", [64, 128, 256]),
        ("num_heads", [4, 8]),
        ("num_layers", [2, 4, 6]),
        ("d_ff", [128, 256, 512]),
    )
    for param_name, choices in categorical_space:
        config[param_name] = trial.suggest_categorical(param_name, choices)
    config["dropout"] = trial.suggest_float("dropout", 0.1, 0.3)
    config["learning_rate"] = trial.suggest_float("learning_rate", 1e-4, 1e-3, log=True)
    config["batch_size"] = trial.suggest_categorical("batch_size", [32, 64])

    # Create data loaders with current batch size
    batch_size = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model
    vocab_size = len(vocab)
    model = CaesarTransformer(
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        d_ff=config["d_ff"],
        max_seq_length=max_length,
        dropout=config["dropout"],
    )
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    # Train and get best validation loss
    current_val_loss = train_model(
        model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10
    )

    # Nothing more to record unless this trial is the best one so far.
    if not (current_val_loss < best_overall_loss):
        return current_val_loss

    best_overall_loss = current_val_loss
    best_overall_model = copy.deepcopy(model.state_dict())
    best_config = config
    torch.save(best_overall_model, 'best_caesar_model.pth')
    print(f"New best model found! Val Loss: {current_val_loss:.4f}")
    print(f"Config: {config}")

    return current_val_loss

# Main Execution
if __name__ == "__main__":
    # Load and prepare data. `vocab`, `max_length`, `train_dataset` and
    # `val_dataset` are module-level names that `objective` reads.
    inputs, outputs = load_data('training_newshift_1.csv')
    vocab = Vocabulary()
    max_length = 256

    # Split data (80% train / 20% validation, fixed seed)
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization. The sampler is seeded so that the
    # sequence of suggested hyperparameters is repeatable, in line with the
    # fixed seeds used elsewhere in this file.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20)  # 20 trials

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # `best_config` is only set by `objective` when a trial improves on the
    # initial best loss; fail with a clear message instead of a TypeError.
    if best_config is None:
        raise RuntimeError(
            "No trial improved on the initial best loss; no checkpoint was saved."
        )

    # Load the best model found during the search
    final_model = CaesarTransformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    # map_location lets the checkpoint load on the current device even if it
    # was written from a different one.
    final_model.load_state_dict(torch.load('best_caesar_model.pth', map_location=device))

    # Evaluate
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption: (prompt, expected plaintext) pairs
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher: xibu", "what"),
        ("Please decrypt the following using Caesar cipher: ifmmp", "hello"),
        ("Please decrypt the following using Caesar cipher: uijt", "this")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cpu


[I 2025-05-06 17:58:04,179] A new study created in memory with name: no-name-8b3dc0b9-ada0-47d7-aa11-860d46536572
Training: 100%|██████████| 13/13 [01:25<00:00,  6.54s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


Epoch 1/10:
Train Loss: 4.0666 | Val Loss: 3.7088


Training: 100%|██████████| 13/13 [01:23<00:00,  6.41s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.00s/it]


Epoch 2/10:
Train Loss: 3.6629 | Val Loss: 3.5250


Training: 100%|██████████| 13/13 [01:22<00:00,  6.31s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


Epoch 3/10:
Train Loss: 3.4948 | Val Loss: 3.3822


Training: 100%|██████████| 13/13 [01:22<00:00,  6.34s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]


Epoch 4/10:
Train Loss: 3.3620 | Val Loss: 3.2595


Training: 100%|██████████| 13/13 [01:23<00:00,  6.41s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.01s/it]


Epoch 5/10:
Train Loss: 3.2398 | Val Loss: 3.1162


Training: 100%|██████████| 13/13 [01:22<00:00,  6.36s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]


Epoch 6/10:
Train Loss: 3.1152 | Val Loss: 2.9750


Training: 100%|██████████| 13/13 [01:22<00:00,  6.37s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


Epoch 7/10:
Train Loss: 2.9843 | Val Loss: 2.8526


Training: 100%|██████████| 13/13 [01:22<00:00,  6.36s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


Epoch 8/10:
Train Loss: 2.8787 | Val Loss: 2.7515


Training: 100%|██████████| 13/13 [01:21<00:00,  6.30s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.00s/it]


Epoch 9/10:
Train Loss: 2.7898 | Val Loss: 2.6693


Training: 100%|██████████| 13/13 [01:22<00:00,  6.31s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.05s/it]
[I 2025-05-06 18:12:36,398] Trial 0 finished with value: 2.610978841781616 and parameters: {'d_model': 64, 'num_heads': 8, 'num_layers': 6, 'd_ff': 128, 'dropout': 0.1806497578025868, 'learning_rate': 0.00028782326078834576, 'batch_size': 64}. Best is trial 0 with value: 2.610978841781616.


Epoch 10/10:
Train Loss: 2.7172 | Val Loss: 2.6110
New best model found! Val Loss: 2.6110
Config: {'d_model': 64, 'num_heads': 8, 'num_layers': 6, 'd_ff': 128, 'dropout': 0.1806497578025868, 'learning_rate': 0.00028782326078834576, 'batch_size': 64}


Training: 100%|██████████| 13/13 [01:18<00:00,  6.00s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.15s/it]


Epoch 1/10:
Train Loss: 3.3778 | Val Loss: 2.8527


Training: 100%|██████████| 13/13 [01:16<00:00,  5.85s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


Epoch 2/10:
Train Loss: 2.6641 | Val Loss: 2.4772


Training: 100%|██████████| 13/13 [01:12<00:00,  5.57s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.15s/it]


Epoch 3/10:
Train Loss: 2.4185 | Val Loss: 2.3955


Training: 100%|██████████| 13/13 [01:13<00:00,  5.67s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


Epoch 4/10:
Train Loss: 2.3199 | Val Loss: 2.3017


Training: 100%|██████████| 13/13 [01:14<00:00,  5.73s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]


Epoch 5/10:
Train Loss: 2.2534 | Val Loss: 2.2649


Training: 100%|██████████| 13/13 [01:13<00:00,  5.68s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]


Epoch 6/10:
Train Loss: 2.1826 | Val Loss: 2.2248


Training: 100%|██████████| 13/13 [01:14<00:00,  5.75s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.18s/it]


Epoch 7/10:
Train Loss: 2.1642 | Val Loss: 2.2127


Training: 100%|██████████| 13/13 [01:14<00:00,  5.71s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


Epoch 8/10:
Train Loss: 2.1006 | Val Loss: 2.1800


Training: 100%|██████████| 13/13 [01:12<00:00,  5.58s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.15s/it]


Epoch 9/10:
Train Loss: 2.0433 | Val Loss: 2.2589


Training: 100%|██████████| 13/13 [01:13<00:00,  5.63s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]
[I 2025-05-06 18:25:46,031] Trial 1 finished with value: 2.1572062373161316 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 4, 'd_ff': 512, 'dropout': 0.18760098545752418, 'learning_rate': 0.0008626906207424592, 'batch_size': 64}. Best is trial 1 with value: 2.1572062373161316.


Epoch 10/10:
Train Loss: 2.0275 | Val Loss: 2.1572
New best model found! Val Loss: 2.1572
Config: {'d_model': 256, 'num_heads': 8, 'num_layers': 4, 'd_ff': 512, 'dropout': 0.18760098545752418, 'learning_rate': 0.0008626906207424592, 'batch_size': 64}


Training: 100%|██████████| 13/13 [01:13<00:00,  5.69s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.25s/it]


Epoch 1/10:
Train Loss: 3.5182 | Val Loss: 3.1300


Training: 100%|██████████| 13/13 [01:12<00:00,  5.57s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


Epoch 2/10:
Train Loss: 3.0051 | Val Loss: 2.7336


Training: 100%|██████████| 13/13 [01:12<00:00,  5.56s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it]


Epoch 3/10:
Train Loss: 2.6574 | Val Loss: 2.5339


Training: 100%|██████████| 13/13 [01:12<00:00,  5.55s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it]


Epoch 4/10:
Train Loss: 2.4741 | Val Loss: 2.4180


Training: 100%|██████████| 13/13 [01:12<00:00,  5.55s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.25s/it]


Epoch 5/10:
Train Loss: 2.3886 | Val Loss: 2.3752


Training: 100%|██████████| 13/13 [01:12<00:00,  5.56s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


Epoch 6/10:
Train Loss: 2.3191 | Val Loss: 2.3206


Training: 100%|██████████| 13/13 [01:11<00:00,  5.53s/it]
Evaluating: 100%|██████████| 4/4 [00:05<00:00,  1.25s/it]


Epoch 7/10:
Train Loss: 2.2722 | Val Loss: 2.2745


Training: 100%|██████████| 13/13 [01:12<00:00,  5.60s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.25s/it]


Epoch 8/10:
Train Loss: 2.2292 | Val Loss: 2.2600


Training: 100%|██████████| 13/13 [01:13<00:00,  5.66s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


Epoch 9/10:
Train Loss: 2.2010 | Val Loss: 2.2771


Training: 100%|██████████| 13/13 [01:12<00:00,  5.59s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.20s/it]
[I 2025-05-06 18:38:41,876] Trial 2 finished with value: 2.21791672706604 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 128, 'dropout': 0.21344136549506787, 'learning_rate': 0.0003625225035613965, 'batch_size': 64}. Best is trial 1 with value: 2.1572062373161316.


Epoch 10/10:
Train Loss: 2.1599 | Val Loss: 2.2179


Training: 100%|██████████| 13/13 [00:58<00:00,  4.49s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


Epoch 1/10:
Train Loss: 3.5754 | Val Loss: 3.1491


Training: 100%|██████████| 13/13 [01:02<00:00,  4.84s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


Epoch 2/10:
Train Loss: 3.1161 | Val Loss: 2.8570


Training: 100%|██████████| 13/13 [01:03<00:00,  4.89s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.08it/s]


Epoch 3/10:
Train Loss: 2.8482 | Val Loss: 2.6066


Training: 100%|██████████| 13/13 [01:05<00:00,  5.03s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.09it/s]


Epoch 4/10:
Train Loss: 2.6254 | Val Loss: 2.4757


Training: 100%|██████████| 13/13 [01:04<00:00,  4.96s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.09it/s]


Epoch 5/10:
Train Loss: 2.4976 | Val Loss: 2.4131


Training: 100%|██████████| 13/13 [01:05<00:00,  5.04s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


Epoch 6/10:
Train Loss: 2.4347 | Val Loss: 2.3917


Training: 100%|██████████| 13/13 [01:04<00:00,  4.95s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]


Epoch 7/10:
Train Loss: 2.3923 | Val Loss: 2.3431


Training: 100%|██████████| 13/13 [01:04<00:00,  4.96s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


Epoch 8/10:
Train Loss: 2.3527 | Val Loss: 2.3344


Training: 100%|██████████| 13/13 [01:04<00:00,  4.97s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.08it/s]


Epoch 9/10:
Train Loss: 2.3174 | Val Loss: 2.3277


Training: 100%|██████████| 13/13 [01:05<00:00,  5.01s/it]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.09it/s]
[I 2025-05-06 18:49:58,545] Trial 3 finished with value: 2.2958542704582214 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 4, 'd_ff': 128, 'dropout': 0.29708054022979347, 'learning_rate': 0.00030321158775512035, 'batch_size': 64}. Best is trial 1 with value: 2.1572062373161316.


Epoch 10/10:
Train Loss: 2.2954 | Val Loss: 2.2959


Training: 100%|██████████| 13/13 [01:13<00:00,  5.65s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]


Epoch 1/10:
Train Loss: 3.6990 | Val Loss: 3.2109


Training: 100%|██████████| 13/13 [01:12<00:00,  5.56s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.12s/it]


Epoch 2/10:
Train Loss: 3.1839 | Val Loss: 3.0339


Training: 100%|██████████| 13/13 [01:10<00:00,  5.46s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]


Epoch 3/10:
Train Loss: 2.9740 | Val Loss: 2.7579


Training: 100%|██████████| 13/13 [01:12<00:00,  5.58s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.14s/it]


Epoch 4/10:
Train Loss: 2.7457 | Val Loss: 2.5862


Training: 100%|██████████| 13/13 [01:12<00:00,  5.59s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]


Epoch 5/10:
Train Loss: 2.5852 | Val Loss: 2.4980


Training: 100%|██████████| 13/13 [01:13<00:00,  5.66s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


Epoch 6/10:
Train Loss: 2.4977 | Val Loss: 2.4474


Training: 100%|██████████| 13/13 [01:13<00:00,  5.64s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


Epoch 7/10:
Train Loss: 2.4402 | Val Loss: 2.3931


Training: 100%|██████████| 13/13 [01:12<00:00,  5.54s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.12s/it]


Epoch 8/10:
Train Loss: 2.3935 | Val Loss: 2.3597


Training: 100%|██████████| 13/13 [01:14<00:00,  5.71s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]


Epoch 9/10:
Train Loss: 2.3543 | Val Loss: 2.3441


Training: 100%|██████████| 13/13 [01:13<00:00,  5.64s/it]
Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.14s/it]
[I 2025-05-06 19:02:52,011] Trial 4 finished with value: 2.321357250213623 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 4, 'd_ff': 512, 'dropout': 0.15980520658068967, 'learning_rate': 0.00011240521884227886, 'batch_size': 64}. Best is trial 1 with value: 2.1572062373161316.


Epoch 10/10:
Train Loss: 2.3219 | Val Loss: 2.3214


Training: 100%|██████████| 25/25 [00:15<00:00,  1.59it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.44it/s]


Epoch 1/10:
Train Loss: 4.1090 | Val Loss: 3.5357


Training: 100%|██████████| 25/25 [00:15<00:00,  1.61it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.52it/s]


Epoch 2/10:
Train Loss: 3.4337 | Val Loss: 3.2259


Training: 100%|██████████| 25/25 [00:15<00:00,  1.59it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.58it/s]


Epoch 3/10:
Train Loss: 3.2003 | Val Loss: 3.0216


Training: 100%|██████████| 25/25 [00:15<00:00,  1.59it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.63it/s]


Epoch 4/10:
Train Loss: 3.0121 | Val Loss: 2.8260


Training: 100%|██████████| 25/25 [00:15<00:00,  1.60it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.79it/s]


Epoch 5/10:
Train Loss: 2.8379 | Val Loss: 2.6820


Training: 100%|██████████| 25/25 [00:15<00:00,  1.63it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.48it/s]


Epoch 6/10:
Train Loss: 2.7205 | Val Loss: 2.5812


Training: 100%|██████████| 25/25 [00:15<00:00,  1.62it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.76it/s]


Epoch 7/10:
Train Loss: 2.6327 | Val Loss: 2.5162


Training: 100%|██████████| 25/25 [00:15<00:00,  1.61it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.48it/s]


Epoch 8/10:
Train Loss: 2.5755 | Val Loss: 2.4737


Training: 100%|██████████| 25/25 [00:15<00:00,  1.59it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.69it/s]


Epoch 9/10:
Train Loss: 2.5333 | Val Loss: 2.4355


Training: 100%|██████████| 25/25 [00:15<00:00,  1.62it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.53it/s]
[I 2025-05-06 19:05:38,507] Trial 5 finished with value: 2.4178689547947476 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 2, 'd_ff': 512, 'dropout': 0.2356549572334513, 'learning_rate': 0.00014066199960109234, 'batch_size': 32}. Best is trial 1 with value: 2.1572062373161316.


Epoch 10/10:
Train Loss: 2.4980 | Val Loss: 2.4179


Training: 100%|██████████| 13/13 [00:16<00:00,  1.30s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.99it/s]


Epoch 1/10:
Train Loss: 4.4247 | Val Loss: 4.0177


Training: 100%|██████████| 13/13 [00:16<00:00,  1.27s/it]
Evaluating: 100%|██████████| 4/4 [00:00<00:00,  4.07it/s]


Epoch 2/10:
Train Loss: 3.8385 | Val Loss: 3.5487


Training: 100%|██████████| 13/13 [00:16<00:00,  1.28s/it]
Evaluating: 100%|██████████| 4/4 [00:00<00:00,  4.03it/s]


Epoch 3/10:
Train Loss: 3.4997 | Val Loss: 3.3253


Training: 100%|██████████| 13/13 [00:16<00:00,  1.27s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.35it/s]


Epoch 4/10:
Train Loss: 3.3326 | Val Loss: 3.2204


Training: 100%|██████████| 13/13 [00:16<00:00,  1.28s/it]
Evaluating: 100%|██████████| 4/4 [00:00<00:00,  4.02it/s]


Epoch 5/10:
Train Loss: 3.2422 | Val Loss: 3.1519


Training: 100%|██████████| 13/13 [00:16<00:00,  1.27s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.97it/s]


Epoch 6/10:
Train Loss: 3.1781 | Val Loss: 3.0945


Training: 100%|██████████| 13/13 [00:16<00:00,  1.29s/it]
Evaluating: 100%|██████████| 4/4 [00:00<00:00,  4.01it/s]


Epoch 7/10:
Train Loss: 3.1200 | Val Loss: 3.0311


Training: 100%|██████████| 13/13 [00:16<00:00,  1.26s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.91it/s]


Epoch 8/10:
Train Loss: 3.0557 | Val Loss: 2.9557


Training: 100%|██████████| 13/13 [00:16<00:00,  1.28s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.28it/s]


Epoch 9/10:
Train Loss: 2.9894 | Val Loss: 2.8765


Training: 100%|██████████| 13/13 [00:16<00:00,  1.27s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.95it/s]
[I 2025-05-06 19:08:35,116] Trial 6 finished with value: 2.8025622367858887 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 2, 'd_ff': 128, 'dropout': 0.19812697203852775, 'learning_rate': 0.00012462631450262343, 'batch_size': 64}. Best is trial 1 with value: 2.1572062373161316.


Epoch 10/10:
Train Loss: 2.9152 | Val Loss: 2.8026


Training: 100%|██████████| 25/25 [01:02<00:00,  2.48s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


Epoch 1/10:
Train Loss: 3.7169 | Val Loss: 3.3825


Training: 100%|██████████| 25/25 [01:01<00:00,  2.47s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.94it/s]


Epoch 2/10:
Train Loss: 3.3153 | Val Loss: 3.0800


Training: 100%|██████████| 25/25 [01:01<00:00,  2.46s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.78it/s]


Epoch 3/10:
Train Loss: 3.0187 | Val Loss: 2.7873


Training: 100%|██████████| 25/25 [01:01<00:00,  2.47s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.84it/s]


Epoch 4/10:
Train Loss: 2.7853 | Val Loss: 2.6342


Training: 100%|██████████| 25/25 [01:01<00:00,  2.47s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.86it/s]


Epoch 5/10:
Train Loss: 2.6542 | Val Loss: 2.5426


Training: 100%|██████████| 25/25 [01:01<00:00,  2.45s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.92it/s]


Epoch 6/10:
Train Loss: 2.5696 | Val Loss: 2.4890


Training: 100%|██████████| 25/25 [01:01<00:00,  2.46s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.90it/s]


Epoch 7/10:
Train Loss: 2.5046 | Val Loss: 2.4419


Training: 100%|██████████| 25/25 [01:02<00:00,  2.49s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


Epoch 8/10:
Train Loss: 2.4586 | Val Loss: 2.4065


Training: 100%|██████████| 25/25 [01:01<00:00,  2.46s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.92it/s]


Epoch 9/10:
Train Loss: 2.4208 | Val Loss: 2.3941


Training: 100%|██████████| 25/25 [01:02<00:00,  2.50s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.90it/s]
[I 2025-05-06 19:19:30,063] Trial 7 finished with value: 2.3475963388170515 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1757424483450994, 'learning_rate': 0.00012190292804054613, 'batch_size': 32}. Best is trial 1 with value: 2.1572062373161316.


Epoch 10/10:
Train Loss: 2.3887 | Val Loss: 2.3476


Training: 100%|██████████| 25/25 [00:57<00:00,  2.31s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.68it/s]


Epoch 1/10:
Train Loss: 3.2617 | Val Loss: 2.7030


Training: 100%|██████████| 25/25 [00:58<00:00,  2.32s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.71it/s]


Epoch 2/10:
Train Loss: 2.5864 | Val Loss: 2.4244


Training: 100%|██████████| 25/25 [00:57<00:00,  2.29s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.66it/s]


Epoch 3/10:
Train Loss: 2.4004 | Val Loss: 2.3202


Training: 100%|██████████| 25/25 [00:57<00:00,  2.30s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.70it/s]


Epoch 4/10:
Train Loss: 2.3135 | Val Loss: 2.3292


Training: 100%|██████████| 25/25 [00:57<00:00,  2.29s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.70it/s]


Epoch 5/10:
Train Loss: 2.2460 | Val Loss: 2.2701


Training: 100%|██████████| 25/25 [00:56<00:00,  2.28s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.73it/s]


Epoch 6/10:
Train Loss: 2.1865 | Val Loss: 2.1938


Training: 100%|██████████| 25/25 [00:57<00:00,  2.30s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.73it/s]


Epoch 7/10:
Train Loss: 2.1100 | Val Loss: 2.1121


Training: 100%|██████████| 25/25 [00:56<00:00,  2.28s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.69it/s]


Epoch 8/10:
Train Loss: 2.0171 | Val Loss: 1.9939


Training: 100%|██████████| 25/25 [00:56<00:00,  2.28s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.72it/s]


Epoch 9/10:
Train Loss: 1.9351 | Val Loss: 1.9637


Training: 100%|██████████| 25/25 [00:56<00:00,  2.28s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.64it/s]
[I 2025-05-06 19:29:44,922] Trial 8 finished with value: 1.9082178899220057 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 512, 'dropout': 0.22436399000666726, 'learning_rate': 0.0003465846228006628, 'batch_size': 32}. Best is trial 8 with value: 1.9082178899220057.


Epoch 10/10:
Train Loss: 1.8681 | Val Loss: 1.9082
New best model found! Val Loss: 1.9082
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 512, 'dropout': 0.22436399000666726, 'learning_rate': 0.0003465846228006628, 'batch_size': 32}


Training: 100%|██████████| 13/13 [00:15<00:00,  1.20s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.82it/s]


Epoch 1/10:
Train Loss: 4.0016 | Val Loss: 3.4443


Training: 100%|██████████| 13/13 [00:16<00:00,  1.24s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.70it/s]


Epoch 2/10:
Train Loss: 3.3790 | Val Loss: 3.1780


Training: 100%|██████████| 13/13 [00:15<00:00,  1.20s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.72it/s]


Epoch 3/10:
Train Loss: 3.1455 | Val Loss: 2.9383


Training: 100%|██████████| 13/13 [00:15<00:00,  1.21s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.77it/s]


Epoch 4/10:
Train Loss: 2.9340 | Val Loss: 2.7492


Training: 100%|██████████| 13/13 [00:15<00:00,  1.20s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.72it/s]


Epoch 5/10:
Train Loss: 2.7785 | Val Loss: 2.6201


Training: 100%|██████████| 13/13 [00:15<00:00,  1.22s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.81it/s]


Epoch 6/10:
Train Loss: 2.6656 | Val Loss: 2.5300


Training: 100%|██████████| 13/13 [00:15<00:00,  1.21s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.84it/s]


Epoch 7/10:
Train Loss: 2.5976 | Val Loss: 2.4834


Training: 100%|██████████| 13/13 [00:16<00:00,  1.24s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.79it/s]


Epoch 8/10:
Train Loss: 2.5429 | Val Loss: 2.4403


Training: 100%|██████████| 13/13 [00:15<00:00,  1.20s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.93it/s]


Epoch 9/10:
Train Loss: 2.4980 | Val Loss: 2.4164


Training: 100%|██████████| 13/13 [00:15<00:00,  1.21s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  3.78it/s]
[I 2025-05-06 19:32:33,048] Trial 9 finished with value: 2.3862498998641968 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 2, 'd_ff': 512, 'dropout': 0.2774962021202698, 'learning_rate': 0.00032985226791552294, 'batch_size': 64}. Best is trial 8 with value: 1.9082178899220057.


Epoch 10/10:
Train Loss: 2.4717 | Val Loss: 2.3862


Training: 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  5.45it/s]


Epoch 1/10:
Train Loss: 3.7044 | Val Loss: 3.2937


Training: 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.44it/s]


Epoch 2/10:
Train Loss: 3.0922 | Val Loss: 2.8002


Training: 100%|██████████| 25/25 [00:18<00:00,  1.34it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.29it/s]


Epoch 3/10:
Train Loss: 2.6983 | Val Loss: 2.5563


Training: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.24it/s]


Epoch 4/10:
Train Loss: 2.5279 | Val Loss: 2.4455


Training: 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.11it/s]


Epoch 5/10:
Train Loss: 2.4325 | Val Loss: 2.3911


Training: 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.35it/s]


Epoch 6/10:
Train Loss: 2.3769 | Val Loss: 2.3468


Training: 100%|██████████| 25/25 [00:18<00:00,  1.34it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.22it/s]


Epoch 7/10:
Train Loss: 2.3364 | Val Loss: 2.3208


Training: 100%|██████████| 25/25 [00:18<00:00,  1.33it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.20it/s]


Epoch 8/10:
Train Loss: 2.2964 | Val Loss: 2.2859


Training: 100%|██████████| 25/25 [00:18<00:00,  1.33it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.33it/s]


Epoch 9/10:
Train Loss: 2.2590 | Val Loss: 2.2645


Training: 100%|██████████| 25/25 [00:18<00:00,  1.36it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.22it/s]
[I 2025-05-06 19:35:51,004] Trial 10 finished with value: 2.2499005453927174 and parameters: {'d_model': 64, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.11190072239609894, 'learning_rate': 0.0006867120310921098, 'batch_size': 32}. Best is trial 8 with value: 1.9082178899220057.


Epoch 10/10:
Train Loss: 2.2313 | Val Loss: 2.2499


Training: 100%|██████████| 25/25 [00:57<00:00,  2.29s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.72it/s]


Epoch 1/10:
Train Loss: 3.2662 | Val Loss: 2.7433


Training: 100%|██████████| 25/25 [00:56<00:00,  2.26s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.59it/s]


Epoch 2/10:
Train Loss: 2.6080 | Val Loss: 2.4127


Training: 100%|██████████| 25/25 [00:57<00:00,  2.30s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.70it/s]


Epoch 3/10:
Train Loss: 2.3922 | Val Loss: 2.3223


Training: 100%|██████████| 25/25 [00:57<00:00,  2.29s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.62it/s]


Epoch 4/10:
Train Loss: 2.3130 | Val Loss: 2.2940


Training: 100%|██████████| 25/25 [00:57<00:00,  2.29s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.65it/s]


Epoch 5/10:
Train Loss: 2.2495 | Val Loss: 2.2431


Training: 100%|██████████| 25/25 [00:58<00:00,  2.36s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


Epoch 6/10:
Train Loss: 2.2086 | Val Loss: 2.2184


Training: 100%|██████████| 25/25 [00:58<00:00,  2.34s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.62it/s]


Epoch 7/10:
Train Loss: 2.1522 | Val Loss: 2.2228


Training: 100%|██████████| 25/25 [00:58<00:00,  2.34s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.68it/s]


Epoch 8/10:
Train Loss: 2.1109 | Val Loss: 2.1761


Training: 100%|██████████| 25/25 [00:58<00:00,  2.33s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.67it/s]


Epoch 9/10:
Train Loss: 2.0780 | Val Loss: 2.1339


Training: 100%|██████████| 25/25 [00:59<00:00,  2.37s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.50it/s]
[I 2025-05-06 19:46:13,565] Trial 11 finished with value: 2.11050740310124 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 512, 'dropout': 0.2508413240308535, 'learning_rate': 0.0009656945229595863, 'batch_size': 32}. Best is trial 8 with value: 1.9082178899220057.


Epoch 10/10:
Train Loss: 2.0549 | Val Loss: 2.1105


Training: 100%|██████████| 25/25 [00:53<00:00,  2.15s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.78it/s]


Epoch 1/10:
Train Loss: 3.1998 | Val Loss: 2.6334


Training: 100%|██████████| 25/25 [00:54<00:00,  2.19s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.92it/s]


Epoch 2/10:
Train Loss: 2.5569 | Val Loss: 2.4046


Training: 100%|██████████| 25/25 [00:55<00:00,  2.21s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.76it/s]


Epoch 3/10:
Train Loss: 2.3832 | Val Loss: 2.3046


Training: 100%|██████████| 25/25 [00:54<00:00,  2.17s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.68it/s]


Epoch 4/10:
Train Loss: 2.3041 | Val Loss: 2.2468


Training: 100%|██████████| 25/25 [00:55<00:00,  2.21s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.87it/s]


Epoch 5/10:
Train Loss: 2.2436 | Val Loss: 2.2095


Training: 100%|██████████| 25/25 [00:53<00:00,  2.14s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.91it/s]


Epoch 6/10:
Train Loss: 2.1848 | Val Loss: 2.1513


Training: 100%|██████████| 25/25 [00:54<00:00,  2.17s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.84it/s]


Epoch 7/10:
Train Loss: 2.1071 | Val Loss: 2.0772


Training: 100%|██████████| 25/25 [00:51<00:00,  2.06s/it]
Evaluating: 100%|██████████| 7/7 [00:04<00:00,  1.69it/s]


Epoch 8/10:
Train Loss: 2.0190 | Val Loss: 2.0023


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.00it/s]


Epoch 9/10:
Train Loss: 1.9533 | Val Loss: 1.9927


Training: 100%|██████████| 25/25 [00:50<00:00,  2.02s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.92it/s]
[I 2025-05-06 19:55:44,656] Trial 12 finished with value: 1.9332362072808402 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.24782982950217544, 'learning_rate': 0.0005465133856200783, 'batch_size': 32}. Best is trial 8 with value: 1.9082178899220057.


Epoch 10/10:
Train Loss: 1.8803 | Val Loss: 1.9332


Training: 100%|██████████| 25/25 [00:50<00:00,  2.03s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.94it/s]


Epoch 1/10:
Train Loss: 3.2301 | Val Loss: 2.6521


Training: 100%|██████████| 25/25 [00:50<00:00,  2.00s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.98it/s]


Epoch 2/10:
Train Loss: 2.5532 | Val Loss: 2.4501


Training: 100%|██████████| 25/25 [00:50<00:00,  2.00s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


Epoch 3/10:
Train Loss: 2.3877 | Val Loss: 2.3489


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.86it/s]


Epoch 4/10:
Train Loss: 2.3033 | Val Loss: 2.2774


Training: 100%|██████████| 25/25 [00:50<00:00,  2.00s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.84it/s]


Epoch 5/10:
Train Loss: 2.2307 | Val Loss: 2.2547


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.97it/s]


Epoch 6/10:
Train Loss: 2.1841 | Val Loss: 2.2293


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.97it/s]


Epoch 7/10:
Train Loss: 2.1304 | Val Loss: 2.1438


Training: 100%|██████████| 25/25 [00:50<00:00,  2.00s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.97it/s]


Epoch 8/10:
Train Loss: 2.0501 | Val Loss: 2.0701


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.95it/s]


Epoch 9/10:
Train Loss: 1.9516 | Val Loss: 2.0085


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.92it/s]
[I 2025-05-06 20:04:40,193] Trial 13 finished with value: 1.9657188653945923 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.23644088178894326, 'learning_rate': 0.00048772112826803203, 'batch_size': 32}. Best is trial 8 with value: 1.9082178899220057.


Epoch 10/10:
Train Loss: 1.8688 | Val Loss: 1.9657


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.90it/s]


Epoch 1/10:
Train Loss: 3.4442 | Val Loss: 3.0729


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.86it/s]


Epoch 2/10:
Train Loss: 2.9304 | Val Loss: 2.6351


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.97it/s]


Epoch 3/10:
Train Loss: 2.6082 | Val Loss: 2.4654


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.99it/s]


Epoch 4/10:
Train Loss: 2.4736 | Val Loss: 2.3803


Training: 100%|██████████| 25/25 [00:49<00:00,  1.96s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


Epoch 5/10:
Train Loss: 2.4006 | Val Loss: 2.3665


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.87it/s]


Epoch 6/10:
Train Loss: 2.3469 | Val Loss: 2.3201


Training: 100%|██████████| 25/25 [00:49<00:00,  1.97s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.00it/s]


Epoch 7/10:
Train Loss: 2.3122 | Val Loss: 2.3082


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.98it/s]


Epoch 8/10:
Train Loss: 2.2865 | Val Loss: 2.2860


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.99it/s]


Epoch 9/10:
Train Loss: 2.2486 | Val Loss: 2.2754


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.95it/s]
[I 2025-05-06 20:13:31,618] Trial 14 finished with value: 2.2207350049700056 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.26373853401974967, 'learning_rate': 0.00020246306351207473, 'batch_size': 32}. Best is trial 8 with value: 1.9082178899220057.


Epoch 10/10:
Train Loss: 2.2240 | Val Loss: 2.2207


Training: 100%|██████████| 25/25 [00:49<00:00,  1.97s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.89it/s]


Epoch 1/10:
Train Loss: 3.1559 | Val Loss: 2.6349


Training: 100%|██████████| 25/25 [00:49<00:00,  1.97s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.96it/s]


Epoch 2/10:
Train Loss: 2.5186 | Val Loss: 2.4048


Training: 100%|██████████| 25/25 [00:49<00:00,  1.98s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.94it/s]


Epoch 3/10:
Train Loss: 2.3596 | Val Loss: 2.3058


Training: 100%|██████████| 25/25 [00:50<00:00,  2.00s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.90it/s]


Epoch 4/10:
Train Loss: 2.2739 | Val Loss: 2.2449


Training: 100%|██████████| 25/25 [00:50<00:00,  2.00s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


Epoch 5/10:
Train Loss: 2.2097 | Val Loss: 2.2328


Training: 100%|██████████| 25/25 [00:49<00:00,  2.00s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.88it/s]


Epoch 6/10:
Train Loss: 2.1211 | Val Loss: 2.2113


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.86it/s]


Epoch 7/10:
Train Loss: 2.0240 | Val Loss: 2.0196


Training: 100%|██████████| 25/25 [00:50<00:00,  2.01s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.96it/s]


Epoch 8/10:
Train Loss: 1.9321 | Val Loss: 1.9266


Training: 100%|██████████| 25/25 [00:50<00:00,  2.03s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.94it/s]


Epoch 9/10:
Train Loss: 1.8481 | Val Loss: 1.9102


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.96it/s]
[I 2025-05-06 20:22:26,910] Trial 15 finished with value: 1.8922669717243739 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.2207862787473907, 'learning_rate': 0.0004987914424108684, 'batch_size': 32}. Best is trial 15 with value: 1.8922669717243739.


Epoch 10/10:
Train Loss: 1.7858 | Val Loss: 1.8923
New best model found! Val Loss: 1.8923
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.2207862787473907, 'learning_rate': 0.0004987914424108684, 'batch_size': 32}


Training: 100%|██████████| 25/25 [00:18<00:00,  1.37it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.43it/s]


Epoch 1/10:
Train Loss: 3.8567 | Val Loss: 3.4380


Training: 100%|██████████| 25/25 [00:18<00:00,  1.36it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  5.97it/s]


Epoch 2/10:
Train Loss: 3.2917 | Val Loss: 3.0335


Training: 100%|██████████| 25/25 [00:18<00:00,  1.38it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.32it/s]


Epoch 3/10:
Train Loss: 2.9627 | Val Loss: 2.7540


Training: 100%|██████████| 25/25 [00:18<00:00,  1.35it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.28it/s]


Epoch 4/10:
Train Loss: 2.7494 | Val Loss: 2.6061


Training: 100%|██████████| 25/25 [00:18<00:00,  1.36it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.39it/s]


Epoch 5/10:
Train Loss: 2.6353 | Val Loss: 2.5202


Training: 100%|██████████| 25/25 [00:18<00:00,  1.37it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  5.40it/s]


Epoch 6/10:
Train Loss: 2.5553 | Val Loss: 2.4675


Training: 100%|██████████| 25/25 [00:18<00:00,  1.36it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.45it/s]


Epoch 7/10:
Train Loss: 2.5041 | Val Loss: 2.4252


Training: 100%|██████████| 25/25 [00:18<00:00,  1.37it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.29it/s]


Epoch 8/10:
Train Loss: 2.4645 | Val Loss: 2.3920


Training: 100%|██████████| 25/25 [00:18<00:00,  1.34it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  6.40it/s]


Epoch 9/10:
Train Loss: 2.4294 | Val Loss: 2.3828


Training: 100%|██████████| 25/25 [00:18<00:00,  1.37it/s]
Evaluating: 100%|██████████| 7/7 [00:01<00:00,  5.58it/s]
[I 2025-05-06 20:25:41,856] Trial 16 finished with value: 2.3567378520965576 and parameters: {'d_model': 64, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.2174667231342672, 'learning_rate': 0.0004531981723602948, 'batch_size': 32}. Best is trial 15 with value: 1.8922669717243739.


Epoch 10/10:
Train Loss: 2.4069 | Val Loss: 2.3567


Training: 100%|██████████| 25/25 [00:49<00:00,  1.97s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.95it/s]


Epoch 1/10:
Train Loss: 3.3086 | Val Loss: 2.8255


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.97it/s]


Epoch 2/10:
Train Loss: 2.6542 | Val Loss: 2.4686


Training: 100%|██████████| 25/25 [00:49<00:00,  1.97s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.95it/s]


Epoch 3/10:
Train Loss: 2.4091 | Val Loss: 2.3569


Training: 100%|██████████| 25/25 [00:48<00:00,  1.96s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.89it/s]


Epoch 4/10:
Train Loss: 2.3281 | Val Loss: 2.3190


Training: 100%|██████████| 25/25 [00:49<00:00,  1.97s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


Epoch 5/10:
Train Loss: 2.2563 | Val Loss: 2.2561


Training: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.98it/s]


Epoch 6/10:
Train Loss: 2.1943 | Val Loss: 2.2242


Training: 100%|██████████| 25/25 [00:49<00:00,  1.96s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.98it/s]


Epoch 7/10:
Train Loss: 2.1279 | Val Loss: 2.1644


Training: 100%|██████████| 25/25 [00:49<00:00,  1.96s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.99it/s]


Epoch 8/10:
Train Loss: 2.0352 | Val Loss: 2.0350


Training: 100%|██████████| 25/25 [00:48<00:00,  1.96s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.89it/s]


Epoch 9/10:
Train Loss: 1.9551 | Val Loss: 2.0064


Training: 100%|██████████| 25/25 [00:49<00:00,  1.96s/it]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.01it/s]
[I 2025-05-06 20:34:30,217] Trial 17 finished with value: 1.9547168867928642 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.15666245482944202, 'learning_rate': 0.00023959789092595416, 'batch_size': 32}. Best is trial 15 with value: 1.8922669717243739.


Epoch 10/10:
Train Loss: 1.8897 | Val Loss: 1.9547


Training: 100%|██████████| 25/25 [01:23<00:00,  3.35s/it]
Evaluating: 100%|██████████| 7/7 [00:05<00:00,  1.18it/s]


Epoch 1/10:
Train Loss: 3.3241 | Val Loss: 3.0963


Training: 100%|██████████| 25/25 [01:23<00:00,  3.33s/it]
Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.10it/s]


Epoch 2/10:
Train Loss: 3.0840 | Val Loss: 2.8132


Training: 100%|██████████| 25/25 [01:22<00:00,  3.31s/it]
Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.16it/s]


Epoch 3/10:
Train Loss: 2.5540 | Val Loss: 2.4195


Training: 100%|██████████| 25/25 [01:23<00:00,  3.33s/it]
Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.15it/s]


Epoch 4/10:
Train Loss: 2.3505 | Val Loss: 2.3139


Training: 100%|██████████| 25/25 [01:23<00:00,  3.32s/it]
Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.14it/s]


Epoch 5/10:
Train Loss: 2.2498 | Val Loss: 2.2682


Training: 100%|██████████| 25/25 [01:22<00:00,  3.30s/it]
Evaluating: 100%|██████████| 7/7 [00:05<00:00,  1.17it/s]


Epoch 6/10:
Train Loss: 2.1658 | Val Loss: 2.2225


Training: 100%|██████████| 25/25 [01:22<00:00,  3.29s/it]
Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.16it/s]


Epoch 7/10:
Train Loss: 2.0997 | Val Loss: 2.2169


Training: 100%|██████████| 25/25 [01:23<00:00,  3.33s/it]
Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.14it/s]


Epoch 8/10:
Train Loss: 2.0265 | Val Loss: 2.1805


Training: 100%|██████████| 25/25 [01:22<00:00,  3.31s/it]
Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.16it/s]


Epoch 9/10:
Train Loss: 1.9587 | Val Loss: 2.1680


Training: 100%|██████████| 25/25 [01:23<00:00,  3.33s/it]
Evaluating: 100%|██████████| 7/7 [00:05<00:00,  1.18it/s]
[I 2025-05-06 20:49:21,209] Trial 18 finished with value: 2.132347890308925 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12694279083614296, 'learning_rate': 0.0006396338561965474, 'batch_size': 32}. Best is trial 15 with value: 1.8922669717243739.


Epoch 10/10:
Train Loss: 1.9028 | Val Loss: 2.1323


Training: 100%|██████████| 25/25 [00:09<00:00,  2.69it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.20it/s]


Epoch 1/10:
Train Loss: 4.3191 | Val Loss: 3.8408


Training: 100%|██████████| 25/25 [00:09<00:00,  2.71it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.09it/s]


Epoch 2/10:
Train Loss: 3.7204 | Val Loss: 3.4997


Training: 100%|██████████| 25/25 [00:09<00:00,  2.74it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.13it/s]


Epoch 3/10:
Train Loss: 3.4775 | Val Loss: 3.3260


Training: 100%|██████████| 25/25 [00:09<00:00,  2.71it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.35it/s]


Epoch 4/10:
Train Loss: 3.3288 | Val Loss: 3.2130


Training: 100%|██████████| 25/25 [00:09<00:00,  2.76it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 11.60it/s]


Epoch 5/10:
Train Loss: 3.2279 | Val Loss: 3.1159


Training: 100%|██████████| 25/25 [00:09<00:00,  2.72it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.67it/s]


Epoch 6/10:
Train Loss: 3.1411 | Val Loss: 3.0199


Training: 100%|██████████| 25/25 [00:09<00:00,  2.65it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.45it/s]


Epoch 7/10:
Train Loss: 3.0494 | Val Loss: 2.9211


Training: 100%|██████████| 25/25 [00:09<00:00,  2.70it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.30it/s]


Epoch 8/10:
Train Loss: 2.9604 | Val Loss: 2.8303


Training: 100%|██████████| 25/25 [00:09<00:00,  2.73it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.21it/s]


Epoch 9/10:
Train Loss: 2.8860 | Val Loss: 2.7550


Training: 100%|██████████| 25/25 [00:09<00:00,  2.70it/s]
Evaluating: 100%|██████████| 7/7 [00:00<00:00, 12.02it/s]
[I 2025-05-06 20:50:59,315] Trial 19 finished with value: 2.6956261226109097 and parameters: {'d_model': 64, 'num_heads': 4, 'num_layers': 2, 'd_ff': 256, 'dropout': 0.21728103203375954, 'learning_rate': 0.00017309529211692837, 'batch_size': 32}. Best is trial 15 with value: 1.8922669717243739.


Epoch 10/10:
Train Loss: 2.8150 | Val Loss: 2.6956

Best trial:
  Validation Loss: 1.8923
  Params: 
    d_model: 256
    num_heads: 4
    num_layers: 4
    d_ff: 256
    dropout: 0.2207862787473907
    learning_rate: 0.0004987914424108684
    batch_size: 32


Evaluating: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  1.98it/s]



Final Evaluation:
Train Loss: 1.5589 | Val Loss: 1.8923
Train Accuracy: 0.5228 | Val Accuracy: 0.4348

Test Decryptions:
Input: 'Please decrypt the following using Caesar cipher: gfbs' | Output: 'Thered tomedyo the thathe th thero there the the the the there thathe the there thine there the the the there the the the there the the the the thathe the the the the thisthero t the the the thin there the the there t the the thero the the thare the t thi' | Expected: 'fear' | ✗
Input: 'Please decrypt the following using Caesar cipher: dpnqvufs' | Output: 'Thered tomedyo the thathe th thero there the the the thererere thathe the there thine there the the the there the the the there the the the the thathe the the the the thisthero the the the the the t the the the the the thine the the the thero the the the ' | Expected: 'computer' | ✗
Input: 'Please decrypt the following using Caesar cipher: xibu' | Output: 'Thered tomedyo the thathe th thero there the the the the there thathe the there thine