In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Set random seeds for reproducibility.
# These seed PyTorch's RNG and Python's `random` module only; the pandas
# sampling and the train/validation split pass their own `random_state=42`.
# NOTE(review): no sampler seed is passed to `optuna.create_study` further
# down, so the hyperparameter search itself is presumably not reproducible
# from run to run -- confirm if that matters.
torch.manual_seed(42)
random.seed(42)

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
# `device` is a module-level name that later code reads directly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data Preparation

def load_data(file_path, max_output_len=200, max_samples=100, random_state=42):
    """Load (input, output) text pairs from a CSV file.

    The CSV must contain the columns ``'Input'`` and ``'Output'``. Rows whose
    ``'Output'`` string is longer than ``max_output_len`` characters are
    dropped, and the remainder is randomly down-sampled to at most
    ``max_samples`` rows.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        max_output_len: Maximum allowed length of the ``'Output'`` string.
        max_samples: Upper bound on the number of rows returned. ``None``
            disables down-sampling.
        random_state: Seed used for the down-sampling so it is repeatable.

    Returns:
        A tuple ``(inputs, outputs)`` of two equally long lists.

    Raises:
        KeyError: If the ``'Input'`` or ``'Output'`` column is missing.
    """
    df = pd.read_csv(file_path)

    # Missing outputs (NaN) have a NaN length, which compares False here,
    # so those rows are filtered out together with the over-long ones.
    df = df[df['Output'].str.len() <= max_output_len]

    if max_samples is not None and len(df) > max_samples:
        df = df.sample(n=max_samples, random_state=random_state)

    return df['Input'].tolist(), df['Output'].tolist()

# Tokenization and Vocabulary
class Vocabulary:
    """Character-level vocabulary over ``string.printable``.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>`` (in that order); every printable ASCII character
    follows, in the order given by ``string.printable``.
    """

    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        # Reserved indices; they match the order used in _build_vocab.
        self.pad_token = 0
        self.sos_token = 1
        self.eos_token = 2
        self.unk_token = 3
        self._build_vocab()

    def _build_vocab(self):
        """Populate the symbol->index table and its inverse."""
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + list(string.printable)

        forward = {}
        for position, symbol in enumerate(symbols):
            forward[symbol] = position

        backward = {}
        for symbol, position in forward.items():
            backward[position] = symbol

        self.char2idx = forward
        self.idx2char = backward

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of ``text`` to its index (unknown -> ``<UNK>``)."""
        table = self.char2idx
        fallback = self.unk_token
        ids = []
        for symbol in text:
            ids.append(table.get(symbol, fallback))
        return ids

    def decode(self, indices):
        """Turn indices back into text, skipping PAD/SOS/EOS.

        Indices that are not in the table are rendered as ``'<UNK>'``.
        """
        skipped = {self.pad_token, self.sos_token, self.eos_token}
        pieces = []
        for idx in indices:
            if idx in skipped:
                continue
            pieces.append(self.idx2char.get(idx, '<UNK>'))
        return ''.join(pieces)

# Dataset Class
class CipherDataset(data.Dataset):
    """Dataset of (input, output) text pairs encoded to fixed-length tensors.

    Every sample is encoded as ``<SOS> chars... <EOS>`` and right-padded with
    the pad token to exactly ``max_length`` positions.

    Args:
        inputs: Sequence of source texts.
        outputs: Sequence of target texts, aligned with ``inputs``.
        vocab: Object providing ``encode(text)`` plus the ``sos_token``,
            ``eos_token`` and ``pad_token`` indices.
        max_length: Length of every returned tensor.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def _to_fixed_length(self, text):
        """Encode ``text`` as SOS + chars + EOS, padded to ``max_length``."""
        # Truncate the characters *before* adding the markers. Cutting the
        # finished sequence instead would slice off the EOS token of any
        # over-long text, leaving a target that never terminates.
        body = self.vocab.encode(text)[:max(self.max_length - 2, 0)]
        sequence = [self.vocab.sos_token] + body + [self.vocab.eos_token]
        sequence += [self.vocab.pad_token] * (self.max_length - len(sequence))
        # Only has an effect when max_length < 2 (markers alone do not fit).
        return torch.tensor(sequence[:self.max_length])

    def __getitem__(self, idx):
        """Return the ``(input_tensor, output_tensor)`` pair at ``idx``."""
        # str() guards against non-string cells (e.g. numbers parsed from CSV).
        input_text = str(self.inputs[idx])
        output_text = str(self.outputs[idx])
        return self._to_fixed_length(input_text), self._to_fixed_length(output_text)

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The model dimension ``d_model`` is split evenly over ``num_heads`` heads
    of size ``d_k = d_model // num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        leftover = d_model % num_heads
        assert leftover == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query / key / value projections followed by the output projection.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend from ``Q`` to ``K``/``V``; positions where ``mask == 0`` are suppressed."""
        scale = math.sqrt(self.d_k)
        logits = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            # A large negative score drives the softmax weight to ~0.
            logits = logits.masked_fill(mask == 0, -1e9)
        weights = torch.softmax(logits, dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, d_k)``."""
        n_batch, n_steps, _ = x.size()
        per_head = x.view(n_batch, n_steps, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """Inverse of :meth:`split_heads`: merge the heads back into ``d_model``."""
        n_batch, _, n_steps, _ = x.size()
        merged = x.permute(0, 2, 1, 3).contiguous()
        return merged.view(n_batch, n_steps, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head and project the merged result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        merged = self.combine_heads(context)
        return self.W_o(merged)

class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network: ``d_model -> d_ff -> d_model`` with ReLU."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expansion
        self.fc2 = nn.Linear(d_ff, d_model)   # projection back to d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply expansion, non-linearity and projection to the last dimension of ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is precomputed for ``max_seq_length``
    positions and stored as the (non-trainable) buffer ``pe`` with shape
    ``(1, max_seq_length, d_model)``.

    Args:
        d_model: Feature dimension of the embeddings (even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns;
        # trimming keeps the assignment shape-compatible. For even d_model the
        # slice is the full tensor, so the table is unchanged.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block on ``x``; ``mask`` is forwarded to the self-attention."""
        # Sub-layer 1: self-attention with residual connection.
        attended = self.self_attn(x, x, x, mask)
        residual = x + self.dropout(attended)
        x = self.norm1(residual)

        # Sub-layer 2: position-wise feed-forward with residual connection.
        transformed = self.feed_forward(x)
        residual = x + self.dropout(transformed)
        return self.norm2(residual)

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the block on ``x``, attending to ``enc_output`` in the middle step."""
        # Sub-layer 1: self-attention over the decoder input (uses tgt_mask).
        self_context = self.self_attn(x, x, x, tgt_mask)
        residual = x + self.dropout(self_context)
        x = self.norm1(residual)

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        cross_context = self.cross_attn(x, enc_output, enc_output, src_mask)
        residual = x + self.dropout(cross_context)
        x = self.norm2(residual)

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.feed_forward(x)
        residual = x + self.dropout(transformed)
        return self.norm3(residual)

class Transformer(nn.Module):
    """Encoder-decoder Transformer for token-to-token sequence mapping.

    Args:
        src_vocab_size: Size of the source embedding table.
        tgt_vocab_size: Size of the target embedding table and output layer.
        d_model: Embedding / hidden dimension.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden size of the feed-forward sub-layers.
        max_seq_length: Number of positions in the positional-encoding table.
        dropout: Dropout probability used throughout.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the attention masks for a batch of token-id tensors.

        Token id 0 is treated as padding. For 2-D ``(batch, seq)`` inputs the
        result is a ``(batch, 1, 1, src_len)`` source mask and a
        ``(batch, 1, tgt_len, tgt_len)`` target mask that combines padding
        with a lower-triangular (no look-ahead) mask.

        Returns:
            ``(src_mask, tgt_mask)`` as boolean tensors; ``True`` means
            "may be attended to".
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        # Create the causal mask on the same device as the input rather than
        # on a module-level global, so the model works wherever its inputs
        # live and does not depend on a global `device` being defined.
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return per-position logits over the target vocabulary.

        Args:
            src: Source token ids.
            tgt: Decoder input token ids.

        Returns:
            The output of the final linear layer applied to the decoder
            states (last dimension ``tgt_vocab_size``).
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one teacher-forced training pass and return the mean batch loss.

    The decoder is fed ``tgt[:, :-1]`` and scored against ``tgt[:, 1:]``
    (the same sequence shifted by one position). Gradients are clipped to a
    total norm of 1.0 before each optimizer step.
    """
    model.train()
    running_loss = 0
    for src, tgt in tqdm(train_loader, desc="Training"):
        src = src.to(device)
        tgt = tgt.to(device)
        decoder_input = tgt[:, :-1]
        labels = tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        vocab_dim = logits.size(-1)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the criterion.
        flat_logits = logits.contiguous().view(-1, vocab_dim)
        flat_labels = labels.contiguous().view(-1)
        loss = criterion(flat_logits, flat_labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Return the mean teacher-forced batch loss over ``val_loader``.

    The model is switched to eval mode and no gradients are recorded.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc="Evaluating"):
            src = src.to(device)
            tgt = tgt.to(device)
            # Decoder sees everything but the last token and must predict
            # everything but the first.
            logits = model(src, tgt[:, :-1])
            labels = tgt[:, 1:]
            flat_logits = logits.contiguous().view(-1, logits.size(-1))
            flat_labels = labels.contiguous().view(-1)
            running_loss += criterion(flat_logits, flat_labels).item()
    return running_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return the teacher-forced token accuracy, ignoring padding positions.

    For every batch the model is fed ``tgt[:, :-1]`` and its arg-max
    predictions are compared with ``tgt[:, 1:]``. Positions whose label is
    ``vocab.pad_token`` are excluded. Returns 0 when nothing was counted.
    """
    model.eval()
    hits = 0
    counted = 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            labels = tgt[:, 1:]
            is_real = labels != vocab.pad_token
            guesses = model(src, tgt[:, :-1]).argmax(dim=-1)
            hits += ((guesses == labels) & is_real).sum().item()
            counted += is_real.sum().item()
    if counted > 0:
        return hits / counted
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train with early stopping and return the best validation loss.

    After every epoch the validation loss is passed to ``scheduler.step``.
    Training stops early once the validation loss has failed to improve for
    ``patience`` consecutive epochs. On return, ``model`` holds the weights
    of the epoch that achieved the returned loss, so a checkpoint saved by
    the caller matches the reported value.

    Args:
        model: Module to train (updated in place).
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        optimizer: Optimizer over ``model.parameters()``.
        criterion: Loss function.
        scheduler: LR scheduler whose ``step`` accepts the validation loss.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Epochs without improvement tolerated before stopping.

    Returns:
        The lowest validation loss observed (``inf`` if no epoch improved
        on it, e.g. when ``epochs`` is 0).
    """
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live tensors, so a deep
            # copy is required to freeze this epoch's weights.
            best_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            # >= so that patience <= 0 still stops instead of never matching.
            if epochs_no_improve >= patience:
                print("Early stopping triggered!")
                break

    # Without this the model would keep the last epoch's weights, which can
    # be worse than the ones that produced best_val_loss.
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss  # Return the best validation loss from this training run

# Global variables to track best model across all trials.
# objective() rebinds all three via `global` whenever a trial beats the
# best validation loss seen so far.
best_overall_model = None  # deep-copied state_dict of the best trial's model (not the module itself)
best_overall_loss = float('inf')  # lowest validation loss returned by any trial so far
best_config = None  # hyperparameter dict of the best trial; read again when rebuilding the final model

# Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one sampled configuration, return its validation loss.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``vocab``,
    ``max_length`` and ``device``. When the trial beats the best loss seen so
    far, the module-level best-model globals are updated and the model's
    state_dict is written to disk.
    """
    global best_overall_model, best_overall_loss, best_config

    # Sample the hyperparameters one by one (order is kept stable on purpose).
    config = {}
    config["d_model"] = trial.suggest_categorical("d_model", [128, 256, 512, 1024])
    config["num_heads"] = trial.suggest_categorical("num_heads", [4, 8, 16, 32, 64])
    config["num_layers"] = trial.suggest_categorical("num_layers", [8, 10, 12, 24, 48])
    config["d_ff"] = trial.suggest_categorical("d_ff", [256, 512, 1024])
    config["dropout"] = trial.suggest_float("dropout", 0.05, 0.4)
    config["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    config["batch_size"] = trial.suggest_categorical("batch_size", [16])

    # Data loaders for this trial's batch size; only training data is shuffled.
    batch_size = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

    # Source and target share one vocabulary.
    vocab_size = len(vocab)
    model = Transformer(
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        d_ff=config["d_ff"],
        max_seq_length=max_length,
        dropout=config["dropout"],
    )
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    current_val_loss = train_model(
        model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=5
    )

    # Nothing more to do unless this trial is the new overall best.
    if not (current_val_loss < best_overall_loss):
        return current_val_loss

    best_overall_loss = current_val_loss
    best_overall_model = copy.deepcopy(model.state_dict())
    best_config = config
    torch.save(best_overall_model, '/content/drive/MyDrive/best_caesar_5.pth')
    print(f"New best model found! Val Loss: {current_val_loss:.4f}")
    print(f"Config: {config}")

    return current_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for a single input text.

    The text is encoded as ``<SOS> chars... <EOS>`` padded to ``max_length``.
    Decoding starts from ``<SOS>`` and appends the arg-max token of the last
    position until ``<EOS>`` is predicted or ``max_length - 1`` tokens have
    been generated.

    Args:
        model: Callable ``model(src, tgt)`` returning ``(1, tgt_len, vocab)``
            logits; must provide ``eval()``.
        text: Input to decode (converted with ``str``).
        vocab: Provides ``encode``, ``decode`` and the special-token indices.
        max_length: Fixed source length and upper bound on the output length.
        device: Device on which the tensors are created.

    Returns:
        The decoded string as produced by ``vocab.decode``.
    """
    model.eval()
    with torch.no_grad():
        # Truncate the characters before adding the markers so that an
        # over-long input still ends in EOS instead of being cut mid-text.
        body = vocab.encode(str(text))[:max(max_length - 2, 0)]
        encoded = [vocab.sos_token] + body + [vocab.eos_token]
        encoded = encoded + [vocab.pad_token] * (max_length - len(encoded))
        encoded = torch.tensor(encoded[:max_length], device=device).unsqueeze(0)

        target = torch.tensor([[vocab.sos_token]], device=device)

        for _ in range(max_length - 1):
            output = model(encoded, target)
            # Greedy choice: most likely token at the last generated position.
            next_token = output.argmax(2)[:, -1].item()
            if next_token == vocab.eos_token:
                break
            next_step = torch.tensor([[next_token]], device=device)
            target = torch.cat([target, next_step], dim=1)

        # Plain Python ints are enough for decoding.
        return vocab.decode(target[0].tolist())

# Main Execution
# Main Execution
# End-to-end script: load data, run the Optuna search, reload the best
# checkpoint, report loss/accuracy and try a few sample decryptions.
if __name__ == "__main__":
    # Load and prepare data
    inputs, outputs = load_data('/content/training_newshift_1.csv')
    # `vocab` and `max_length` become module-level names that objective() reads.
    vocab = Vocabulary()
    # Fixed padded sequence length; also passed to the model as max_seq_length.
    max_length = 256

    # Split data (80% train / 20% validation, fixed seed)
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets (also read as module-level names by objective())
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)  # 20 trials; no timeout is passed, so there is no time limit

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Load the best model found during the search.
    # The architecture is rebuilt from `best_config` (set by objective()), and
    # the weights come from the checkpoint objective() wrote for that trial.
    # NOTE(review): `best_config` stays None if no trial ever improved on
    # the initial loss, in which case the lookups below would fail.
    final_model = Transformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    final_model.load_state_dict(torch.load('/content/drive/MyDrive/best_caesar_5.pth'))

    # Evaluate on full datasets (unshuffled)
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    # Same loss as in training: padding positions are ignored.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption: (prompt, expected plaintext) pairs.
    # NOTE(review): the third prompt has no space after the colon, unlike the
    # other two -- confirm whether that is intentional.
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher:xibu", "what")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cpu


[I 2025-05-07 08:48:56,340] A new study created in memory with name: no-name-5957f2a6-3cce-4b38-acca-f9be97f15ea3
Training: 100%|██████████| 5/5 [01:16<00:00, 15.27s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.69s/it]


Epoch 1/5:
Train Loss: 3.7517 | Val Loss: 3.1860


Training: 100%|██████████| 5/5 [01:03<00:00, 12.67s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.79s/it]


Epoch 2/5:
Train Loss: 3.1816 | Val Loss: 3.1286


Training: 100%|██████████| 5/5 [01:13<00:00, 14.62s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.73s/it]


Epoch 3/5:
Train Loss: 3.1379 | Val Loss: 3.1401


Training: 100%|██████████| 5/5 [01:00<00:00, 12.15s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.76s/it]


Epoch 4/5:
Train Loss: 3.1458 | Val Loss: 3.1405


Training: 100%|██████████| 5/5 [01:01<00:00, 12.26s/it]
Evaluating: 100%|██████████| 2/2 [00:04<00:00,  2.05s/it]


Epoch 5/5:
Train Loss: 3.1323 | Val Loss: 3.1354
Early stopping triggered!


[I 2025-05-07 08:55:00,714] Trial 0 finished with value: 3.128631591796875 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 24, 'd_ff': 256, 'dropout': 0.29785288366674223, 'learning_rate': 0.0026370692885369224, 'batch_size': 16}. Best is trial 0 with value: 3.128631591796875.


New best model found! Val Loss: 3.1286
Config: {'d_model': 256, 'num_heads': 16, 'num_layers': 24, 'd_ff': 256, 'dropout': 0.29785288366674223, 'learning_rate': 0.0026370692885369224, 'batch_size': 16}


Training: 100%|██████████| 5/5 [00:09<00:00,  1.91s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  2.39it/s]


Epoch 1/5:
Train Loss: 3.9285 | Val Loss: 3.3248


Training: 100%|██████████| 5/5 [00:10<00:00,  2.02s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s]


Epoch 2/5:
Train Loss: 3.1972 | Val Loss: 3.1480


Training: 100%|██████████| 5/5 [00:09<00:00,  1.91s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  2.39it/s]


Epoch 3/5:
Train Loss: 3.1445 | Val Loss: 3.1273


Training: 100%|██████████| 5/5 [00:09<00:00,  1.93s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  2.39it/s]


Epoch 4/5:
Train Loss: 3.1142 | Val Loss: 3.1335


Training: 100%|██████████| 5/5 [00:09<00:00,  1.93s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s]
[I 2025-05-07 08:55:53,559] Trial 1 finished with value: 3.1165547370910645 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.07477403407988911, 'learning_rate': 0.0017429029205439238, 'batch_size': 16}. Best is trial 1 with value: 3.1165547370910645.


Epoch 5/5:
Train Loss: 3.1206 | Val Loss: 3.1166
New best model found! Val Loss: 3.1166
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.07477403407988911, 'learning_rate': 0.0017429029205439238, 'batch_size': 16}


Training: 100%|██████████| 5/5 [01:30<00:00, 18.11s/it]
Evaluating: 100%|██████████| 2/2 [00:09<00:00,  4.81s/it]


Epoch 1/5:
Train Loss: 3.9050 | Val Loss: 3.5373


Training: 100%|██████████| 5/5 [01:27<00:00, 17.48s/it]
Evaluating: 100%|██████████| 2/2 [00:09<00:00,  4.55s/it]


Epoch 2/5:
Train Loss: 3.4796 | Val Loss: 3.4229


Training: 100%|██████████| 5/5 [01:28<00:00, 17.62s/it]
Evaluating: 100%|██████████| 2/2 [00:09<00:00,  4.75s/it]


Epoch 3/5:
Train Loss: 3.3608 | Val Loss: 3.2937


Training: 100%|██████████| 5/5 [01:28<00:00, 17.62s/it]
Evaluating: 100%|██████████| 2/2 [00:09<00:00,  4.66s/it]


Epoch 4/5:
Train Loss: 3.2017 | Val Loss: 3.1256


Training: 100%|██████████| 5/5 [01:27<00:00, 17.42s/it]
Evaluating: 100%|██████████| 2/2 [00:09<00:00,  4.69s/it]
[I 2025-05-07 09:04:01,920] Trial 2 finished with value: 3.0405783653259277 and parameters: {'d_model': 128, 'num_heads': 64, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.07716354365620323, 'learning_rate': 0.0003781438079385575, 'batch_size': 16}. Best is trial 2 with value: 3.0405783653259277.


Epoch 5/5:
Train Loss: 2.9665 | Val Loss: 3.0406
New best model found! Val Loss: 3.0406
Config: {'d_model': 128, 'num_heads': 64, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.07716354365620323, 'learning_rate': 0.0003781438079385575, 'batch_size': 16}


Training: 100%|██████████| 5/5 [01:15<00:00, 15.07s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.07s/it]


Epoch 1/5:
Train Loss: 24.2899 | Val Loss: 59.2690


Training: 100%|██████████| 5/5 [01:13<00:00, 14.79s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.04s/it]


Epoch 2/5:
Train Loss: 59.1122 | Val Loss: 54.8076


Training: 100%|██████████| 5/5 [01:13<00:00, 14.72s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.11s/it]


Epoch 3/5:
Train Loss: 46.6533 | Val Loss: 50.7513


Training: 100%|██████████| 5/5 [01:14<00:00, 14.98s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.07s/it]


Epoch 4/5:
Train Loss: 38.8452 | Val Loss: 51.7273


Training: 100%|██████████| 5/5 [01:14<00:00, 14.84s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.12s/it]
[I 2025-05-07 09:10:45,455] Trial 3 finished with value: 13.262210845947266 and parameters: {'d_model': 1024, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2883942069728467, 'learning_rate': 0.05136395368071393, 'batch_size': 16}. Best is trial 2 with value: 3.0405783653259277.


Epoch 5/5:
Train Loss: 24.9412 | Val Loss: 13.2622


Training: 100%|██████████| 5/5 [00:19<00:00,  3.92s/it]
Evaluating: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


Epoch 1/5:
Train Loss: 3.9276 | Val Loss: 3.3066


Training: 100%|██████████| 5/5 [00:20<00:00,  4.01s/it]
Evaluating: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]


Epoch 2/5:
Train Loss: 3.2293 | Val Loss: 3.1465


Training: 100%|██████████| 5/5 [00:19<00:00,  3.93s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


Epoch 3/5:
Train Loss: 3.1291 | Val Loss: 3.1257


Training: 100%|██████████| 5/5 [00:20<00:00,  4.16s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]


Epoch 4/5:
Train Loss: 3.1212 | Val Loss: 3.1209


Training: 100%|██████████| 5/5 [00:19<00:00,  3.94s/it]
Evaluating: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
[I 2025-05-07 09:12:35,586] Trial 4 finished with value: 3.1200751066207886 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 24, 'd_ff': 256, 'dropout': 0.24524513052117597, 'learning_rate': 0.002112101483378704, 'batch_size': 16}. Best is trial 2 with value: 3.0405783653259277.


Epoch 5/5:
Train Loss: 3.1085 | Val Loss: 3.1201


Training: 100%|██████████| 5/5 [00:38<00:00,  7.72s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.82s/it]


Epoch 1/5:
Train Loss: 3.8553 | Val Loss: 3.3687


Training: 100%|██████████| 5/5 [00:48<00:00,  9.60s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.96s/it]


Epoch 2/5:
Train Loss: 3.3213 | Val Loss: 3.2535


Training: 100%|██████████| 5/5 [01:04<00:00, 12.89s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.63s/it]


Epoch 3/5:
Train Loss: 3.1053 | Val Loss: 3.0334


Training: 100%|██████████| 5/5 [00:38<00:00,  7.76s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.66s/it]


Epoch 4/5:
Train Loss: 2.8195 | Val Loss: 2.8939


Training: 100%|██████████| 5/5 [00:38<00:00,  7.66s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]


Epoch 5/5:
Train Loss: 2.6327 | Val Loss: 2.7840


[I 2025-05-07 09:16:43,101] Trial 5 finished with value: 2.784000277519226 and parameters: {'d_model': 256, 'num_heads': 32, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.1323467788673861, 'learning_rate': 0.00023489726805644886, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


New best model found! Val Loss: 2.7840
Config: {'d_model': 256, 'num_heads': 32, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.1323467788673861, 'learning_rate': 0.00023489726805644886, 'batch_size': 16}


Training: 100%|██████████| 5/5 [01:35<00:00, 19.10s/it]
Evaluating: 100%|██████████| 2/2 [00:07<00:00,  3.52s/it]


Epoch 1/5:
Train Loss: 8.9425 | Val Loss: 16.4531


Training: 100%|██████████| 5/5 [01:30<00:00, 18.07s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.46s/it]


Epoch 2/5:
Train Loss: 11.5588 | Val Loss: 12.7127


Training: 100%|██████████| 5/5 [01:31<00:00, 18.25s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.46s/it]


Epoch 3/5:
Train Loss: 10.8212 | Val Loss: 12.3122


Training: 100%|██████████| 5/5 [01:31<00:00, 18.28s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.47s/it]


Epoch 4/5:
Train Loss: 7.9857 | Val Loss: 7.7860


Training: 100%|██████████| 5/5 [01:33<00:00, 18.67s/it]
Evaluating: 100%|██████████| 2/2 [00:12<00:00,  6.06s/it]
[I 2025-05-07 09:25:05,471] Trial 6 finished with value: 4.8192222118377686 and parameters: {'d_model': 1024, 'num_heads': 32, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.33022175067096715, 'learning_rate': 0.01800062664829373, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 5.7901 | Val Loss: 4.8192


Training: 100%|██████████| 5/5 [00:31<00:00,  6.21s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]


Epoch 1/5:
Train Loss: 4.4396 | Val Loss: 4.0612


Training: 100%|██████████| 5/5 [00:28<00:00,  5.64s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]


Epoch 2/5:
Train Loss: 3.9230 | Val Loss: 3.7727


Training: 100%|██████████| 5/5 [00:27<00:00,  5.59s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]


Epoch 3/5:
Train Loss: 3.6901 | Val Loss: 3.6184


Training: 100%|██████████| 5/5 [00:27<00:00,  5.55s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]


Epoch 4/5:
Train Loss: 3.5623 | Val Loss: 3.5220


Training: 100%|██████████| 5/5 [00:27<00:00,  5.55s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
[I 2025-05-07 09:27:39,398] Trial 7 finished with value: 3.4576538801193237 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.08576350453652332, 'learning_rate': 1.0440106671227166e-05, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.4723 | Val Loss: 3.4577


Training: 100%|██████████| 5/5 [04:47<00:00, 57.51s/it]
Evaluating: 100%|██████████| 2/2 [00:37<00:00, 18.90s/it]


Epoch 1/5:
Train Loss: 22.8031 | Val Loss: 61.4602


Training: 100%|██████████| 5/5 [04:26<00:00, 53.37s/it]
Evaluating: 100%|██████████| 2/2 [00:22<00:00, 11.04s/it]


Epoch 2/5:
Train Loss: 54.1001 | Val Loss: 35.4148


Training: 100%|██████████| 5/5 [04:25<00:00, 53.12s/it]
Evaluating: 100%|██████████| 2/2 [00:22<00:00, 11.10s/it]


Epoch 3/5:
Train Loss: 40.6571 | Val Loss: 45.8305


Training: 100%|██████████| 5/5 [04:23<00:00, 52.65s/it]
Evaluating: 100%|██████████| 2/2 [00:22<00:00, 11.08s/it]


Epoch 4/5:
Train Loss: 42.5359 | Val Loss: 41.8926


Training: 100%|██████████| 5/5 [04:23<00:00, 52.75s/it]
Evaluating: 100%|██████████| 2/2 [00:22<00:00, 11.13s/it]
[I 2025-05-07 09:52:16,491] Trial 8 finished with value: 20.31082820892334 and parameters: {'d_model': 1024, 'num_heads': 16, 'num_layers': 24, 'd_ff': 1024, 'dropout': 0.14245868486854812, 'learning_rate': 0.0506689470459807, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 29.8232 | Val Loss: 20.3108


Training: 100%|██████████| 5/5 [00:27<00:00,  5.56s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]


Epoch 1/5:
Train Loss: 3.6665 | Val Loss: 3.2551


Training: 100%|██████████| 5/5 [00:27<00:00,  5.46s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]


Epoch 2/5:
Train Loss: 3.1838 | Val Loss: 3.1833


Training: 100%|██████████| 5/5 [00:27<00:00,  5.51s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]


Epoch 3/5:
Train Loss: 3.1555 | Val Loss: 3.1381


Training: 100%|██████████| 5/5 [00:31<00:00,  6.39s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]


Epoch 4/5:
Train Loss: 3.1434 | Val Loss: 3.1589


Training: 100%|██████████| 5/5 [00:27<00:00,  5.58s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]


Epoch 5/5:
Train Loss: 3.1165 | Val Loss: 3.1400


[I 2025-05-07 09:54:53,090] Trial 9 finished with value: 3.1381282806396484 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 24, 'd_ff': 1024, 'dropout': 0.3096051439138539, 'learning_rate': 0.007355223630510876, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.
Training: 100%|██████████| 5/5 [05:06<00:00, 61.32s/it]
Evaluating: 100%|██████████| 2/2 [00:26<00:00, 13.08s/it]


Epoch 1/5:
Train Loss: 3.9438 | Val Loss: 3.3875


Training: 100%|██████████| 5/5 [04:46<00:00, 57.24s/it]
Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.70s/it]


Epoch 2/5:
Train Loss: 3.2682 | Val Loss: 3.2215


Training: 100%|██████████| 5/5 [04:41<00:00, 56.22s/it]
Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.62s/it]


Epoch 3/5:
Train Loss: 3.1916 | Val Loss: 3.1491


Training: 100%|██████████| 5/5 [04:43<00:00, 56.68s/it]
Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.70s/it]


Epoch 4/5:
Train Loss: 3.1725 | Val Loss: 3.1712


Training: 100%|██████████| 5/5 [04:50<00:00, 58.04s/it]
Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.64s/it]
[I 2025-05-07 10:20:53,388] Trial 10 finished with value: 3.1366809606552124 and parameters: {'d_model': 512, 'num_heads': 32, 'num_layers': 48, 'd_ff': 512, 'dropout': 0.17387476284353573, 'learning_rate': 0.00013098820652767115, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.1482 | Val Loss: 3.1367


Training: 100%|██████████| 5/5 [00:48<00:00,  9.78s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.75s/it]


Epoch 1/5:
Train Loss: 3.9881 | Val Loss: 3.6175


Training: 100%|██████████| 5/5 [00:48<00:00,  9.68s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.59s/it]


Epoch 2/5:
Train Loss: 3.5750 | Val Loss: 3.4903


Training: 100%|██████████| 5/5 [00:47<00:00,  9.58s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.81s/it]


Epoch 3/5:
Train Loss: 3.4655 | Val Loss: 3.4067


Training: 100%|██████████| 5/5 [00:47<00:00,  9.54s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.63s/it]


Epoch 4/5:
Train Loss: 3.3738 | Val Loss: 3.3255


Training: 100%|██████████| 5/5 [00:47<00:00,  9.44s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.61s/it]
[I 2025-05-07 10:25:20,451] Trial 11 finished with value: 3.2394295930862427 and parameters: {'d_model': 128, 'num_heads': 64, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.14604254399345204, 'learning_rate': 0.0002843364177049067, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.2933 | Val Loss: 3.2394


Training: 100%|██████████| 5/5 [01:33<00:00, 18.62s/it]
Evaluating: 100%|██████████| 2/2 [00:07<00:00,  3.85s/it]


Epoch 1/5:
Train Loss: 3.9864 | Val Loss: 3.3347


Training: 100%|██████████| 5/5 [01:17<00:00, 15.60s/it]
Evaluating: 100%|██████████| 2/2 [00:07<00:00,  3.87s/it]


Epoch 2/5:
Train Loss: 3.2617 | Val Loss: 3.2289


Training: 100%|██████████| 5/5 [01:17<00:00, 15.48s/it]
Evaluating: 100%|██████████| 2/2 [00:07<00:00,  3.67s/it]


Epoch 3/5:
Train Loss: 3.1817 | Val Loss: 3.1551


Training: 100%|██████████| 5/5 [01:27<00:00, 17.58s/it]
Evaluating: 100%|██████████| 2/2 [00:07<00:00,  3.74s/it]


Epoch 4/5:
Train Loss: 3.1358 | Val Loss: 3.1662


Training: 100%|██████████| 5/5 [01:18<00:00, 15.67s/it]
Evaluating: 100%|██████████| 2/2 [00:07<00:00,  3.89s/it]
[I 2025-05-07 10:32:53,542] Trial 12 finished with value: 3.1335554122924805 and parameters: {'d_model': 512, 'num_heads': 64, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.05247997610514199, 'learning_rate': 0.00015824032268813015, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.1328 | Val Loss: 3.1336


Training: 100%|██████████| 5/5 [01:02<00:00, 12.49s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.21s/it]


Epoch 1/5:
Train Loss: 4.1221 | Val Loss: 3.5186


Training: 100%|██████████| 5/5 [00:59<00:00, 11.89s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.14s/it]


Epoch 2/5:
Train Loss: 3.5104 | Val Loss: 3.3676


Training: 100%|██████████| 5/5 [00:59<00:00, 11.80s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.15s/it]


Epoch 3/5:
Train Loss: 3.3991 | Val Loss: 3.3098


Training: 100%|██████████| 5/5 [00:58<00:00, 11.67s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.98s/it]


Epoch 4/5:
Train Loss: 3.3514 | Val Loss: 3.2757


Training: 100%|██████████| 5/5 [00:58<00:00, 11.61s/it]
Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.12s/it]
[I 2025-05-07 10:38:22,272] Trial 13 finished with value: 3.2346073389053345 and parameters: {'d_model': 256, 'num_heads': 64, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.1989375857863354, 'learning_rate': 3.597719751062505e-05, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.2975 | Val Loss: 3.2346


Training: 100%|██████████| 5/5 [00:22<00:00,  4.51s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Epoch 1/5:
Train Loss: 4.0489 | Val Loss: 3.5945


Training: 100%|██████████| 5/5 [00:26<00:00,  5.26s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


Epoch 2/5:
Train Loss: 3.5354 | Val Loss: 3.4152


Training: 100%|██████████| 5/5 [00:22<00:00,  4.57s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Epoch 3/5:
Train Loss: 3.2963 | Val Loss: 3.1742


Training: 100%|██████████| 5/5 [00:22<00:00,  4.58s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]


Epoch 4/5:
Train Loss: 3.0559 | Val Loss: 3.0068


Training: 100%|██████████| 5/5 [00:26<00:00,  5.39s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.77s/it]
[I 2025-05-07 10:40:37,244] Trial 14 finished with value: 2.8626458644866943 and parameters: {'d_model': 128, 'num_heads': 32, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.1027490625168909, 'learning_rate': 0.00040449135367057486, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 2.8466 | Val Loss: 2.8626


Training: 100%|██████████| 5/5 [00:26<00:00,  5.31s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Epoch 1/5:
Train Loss: 3.9852 | Val Loss: 3.5351


Training: 100%|██████████| 5/5 [00:22<00:00,  4.51s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


Epoch 2/5:
Train Loss: 3.4886 | Val Loss: 3.3494


Training: 100%|██████████| 5/5 [00:22<00:00,  4.55s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


Epoch 3/5:
Train Loss: 3.3207 | Val Loss: 3.2393


Training: 100%|██████████| 5/5 [00:22<00:00,  4.52s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]


Epoch 4/5:
Train Loss: 3.2212 | Val Loss: 3.1692


Training: 100%|██████████| 5/5 [00:22<00:00,  4.44s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]
[I 2025-05-07 10:42:46,692] Trial 15 finished with value: 3.145572781562805 and parameters: {'d_model': 128, 'num_heads': 32, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.3763254324852493, 'learning_rate': 0.0006492165720201931, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.1771 | Val Loss: 3.1456


Training: 100%|██████████| 5/5 [00:39<00:00,  7.83s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.56s/it]


Epoch 1/5:
Train Loss: 4.0555 | Val Loss: 3.5083


Training: 100%|██████████| 5/5 [00:32<00:00,  6.46s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.56s/it]


Epoch 2/5:
Train Loss: 3.4473 | Val Loss: 3.3467


Training: 100%|██████████| 5/5 [00:32<00:00,  6.43s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.56s/it]


Epoch 3/5:
Train Loss: 3.3255 | Val Loss: 3.2700


Training: 100%|██████████| 5/5 [00:32<00:00,  6.44s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.59s/it]


Epoch 4/5:
Train Loss: 3.2555 | Val Loss: 3.1976


Training: 100%|██████████| 5/5 [00:32<00:00,  6.41s/it]
Evaluating: 100%|██████████| 2/2 [00:03<00:00,  1.55s/it]
[I 2025-05-07 10:45:50,387] Trial 16 finished with value: 3.0885809659957886 and parameters: {'d_model': 256, 'num_heads': 32, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.12546131819846199, 'learning_rate': 5.0675551279102695e-05, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.1515 | Val Loss: 3.0886


Training: 100%|██████████| 5/5 [04:34<00:00, 54.88s/it]
Evaluating: 100%|██████████| 2/2 [00:20<00:00, 10.36s/it]


Epoch 1/5:
Train Loss: 3.9578 | Val Loss: 3.3969


Training: 100%|██████████| 5/5 [04:24<00:00, 52.93s/it]
Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.52s/it]


Epoch 2/5:
Train Loss: 3.3173 | Val Loss: 3.2668


Training: 100%|██████████| 5/5 [04:22<00:00, 52.53s/it]
Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.50s/it]


Epoch 3/5:
Train Loss: 3.2394 | Val Loss: 3.1897


Training: 100%|██████████| 5/5 [04:15<00:00, 51.20s/it]
Evaluating: 100%|██████████| 2/2 [00:20<00:00, 10.36s/it]


Epoch 4/5:
Train Loss: 3.1974 | Val Loss: 3.1896


Training: 100%|██████████| 5/5 [04:27<00:00, 53.43s/it]
Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.55s/it]
[I 2025-05-07 11:09:40,997] Trial 17 finished with value: 3.1657798290252686 and parameters: {'d_model': 512, 'num_heads': 32, 'num_layers': 48, 'd_ff': 512, 'dropout': 0.23666788716478213, 'learning_rate': 7.31970107215787e-05, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.1790 | Val Loss: 3.1658


Training: 100%|██████████| 5/5 [00:07<00:00,  1.53s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  3.18it/s]


Epoch 1/5:
Train Loss: 4.7000 | Val Loss: 4.4264


Training: 100%|██████████| 5/5 [00:07<00:00,  1.45s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  2.68it/s]


Epoch 2/5:
Train Loss: 4.3099 | Val Loss: 4.1316


Training: 100%|██████████| 5/5 [00:07<00:00,  1.51s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  2.98it/s]


Epoch 3/5:
Train Loss: 4.0780 | Val Loss: 3.9634


Training: 100%|██████████| 5/5 [00:07<00:00,  1.53s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  3.13it/s]


Epoch 4/5:
Train Loss: 3.9301 | Val Loss: 3.8492


Training: 100%|██████████| 5/5 [00:07<00:00,  1.50s/it]
Evaluating: 100%|██████████| 2/2 [00:00<00:00,  3.12it/s]
[I 2025-05-07 11:10:21,986] Trial 18 finished with value: 3.7663902044296265 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.11165614594826444, 'learning_rate': 1.4463366864297413e-05, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.8328 | Val Loss: 3.7664


Training: 100%|██████████| 5/5 [00:28<00:00,  5.76s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]


Epoch 1/5:
Train Loss: 3.8924 | Val Loss: 3.3864


Training: 100%|██████████| 5/5 [00:29<00:00,  5.97s/it]
Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.56s/it]


Epoch 2/5:
Train Loss: 3.2570 | Val Loss: 3.1693


Training: 100%|██████████| 5/5 [00:36<00:00,  7.27s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]


Epoch 3/5:
Train Loss: 3.1536 | Val Loss: 3.1337


Training: 100%|██████████| 5/5 [00:29<00:00,  5.80s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]


Epoch 4/5:
Train Loss: 3.1365 | Val Loss: 3.1390


Training: 100%|██████████| 5/5 [00:29<00:00,  5.92s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
[I 2025-05-07 11:13:12,344] Trial 19 finished with value: 3.1151251792907715 and parameters: {'d_model': 256, 'num_heads': 32, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.19053928707810275, 'learning_rate': 0.0008111895286008717, 'batch_size': 16}. Best is trial 5 with value: 2.784000277519226.


Epoch 5/5:
Train Loss: 3.1216 | Val Loss: 3.1151

Best trial:
  Validation Loss: 2.7840
  Params: 
    d_model: 256
    num_heads: 32
    num_layers: 8
    d_ff: 1024
    dropout: 0.1323467788673861
    learning_rate: 0.00023489726805644886
    batch_size: 16


Evaluating: 100%|██████████| 5/5 [00:10<00:00,  2.07s/it]
Evaluating: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]



Final Evaluation:
Train Loss: 2.5061 | Val Loss: 2.7840
Train Accuracy: 0.2925 | Val Accuracy: 0.2179

Test Decryptions:
Input: 'Please decrypt the following using Caesar cipher: gfbs' | Output: 'Tom an the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the ' | Expected: 'fear' | ✗
Input: 'Please decrypt the following using Caesar cipher: dpnqvufs' | Output: 'Tom an the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the ' | Expected: 'computer' | ✗
Input: 'Please decrypt the following using Caesar cipher:xibu' | Output: 'Tom an the the the the the the the the the the the the the the the the the the the the