In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Seed PyTorch's and Python's random number generators so that repeated runs
# start from the same RNG state.
torch.manual_seed(42)
random.seed(42)

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
# The module-level `device` is read by the code below whenever models and
# tensors are moved with `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data Preparation

def load_data(file_path):
    """Read input/output text pairs from an Excel workbook.

    Rows whose ``output`` string is longer than 200 characters are dropped.
    When more than 10,000 rows remain, a reproducible random sample of
    10,000 rows (``random_state=42``) is kept.

    Args:
        file_path: Path of the workbook; it must provide ``input`` and
            ``output`` columns.

    Returns:
        A ``(inputs, outputs)`` tuple of two parallel lists.
    """
    max_output_chars = 200
    max_rows = 10000

    frame = pd.read_excel(file_path)
    short_enough = frame['output'].str.len() <= max_output_chars
    frame = frame[short_enough]

    # Cap the dataset size with a seeded sample so runs stay comparable.
    if len(frame) > max_rows:
        frame = frame.sample(n=max_rows, random_state=42)

    return frame['input'].tolist(), frame['output'].tolist()

# Tokenization and Vocabulary
class Vocabulary:
    """Character-level vocabulary over ``string.printable``.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; every printable character follows in the order
    given by ``string.printable``.
    """

    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        # Reserved indices of the special tokens.
        self.pad_token = 0
        self.sos_token = 1
        self.eos_token = 2
        self.unk_token = 3
        self._build_vocab()

    def _build_vocab(self):
        """Fill ``char2idx`` and its inverse ``idx2char``."""
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + list(string.printable)
        self.char2idx = {symbol: index for index, symbol in enumerate(symbols)}
        self.idx2char = {index: symbol for symbol, index in self.char2idx.items()}

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of ``text`` to its index (``<UNK>`` if unknown)."""
        lookup = self.char2idx.get
        return [lookup(ch, self.unk_token) for ch in text]

    def decode(self, indices):
        """Turn indices back into text, skipping PAD, SOS and EOS.

        Indices missing from the vocabulary are rendered as ``'<UNK>'``.
        """
        skipped = {self.pad_token, self.sos_token, self.eos_token}
        pieces = []
        for index in indices:
            if index in skipped:
                continue
            pieces.append(self.idx2char.get(index, '<UNK>'))
        return ''.join(pieces)

# Dataset Class
class CipherDataset(data.Dataset):
    """Dataset of (input, output) text pairs encoded as fixed-length id tensors.

    Every sample is framed as ``<SOS> tokens... <EOS>`` and right-padded with
    the pad token up to ``max_length``.

    Args:
        inputs: Sequence of source texts (items are converted with ``str``).
        outputs: Sequence of target texts, parallel to ``inputs``.
        vocab: Object providing ``encode(text)`` plus the ``sos_token``,
            ``eos_token`` and ``pad_token`` indices.
        max_length: Length of every returned tensor.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def _encode_fixed(self, text):
        """Encode ``text`` into exactly ``max_length`` token ids.

        Over-long texts are cut *before* the EOS token is appended, so the
        sequence always ends with EOS instead of being silently chopped
        mid-sentence without an end marker.
        """
        tokens = [self.vocab.sos_token] + self.vocab.encode(text)
        # Reserve the final slot for EOS when the text does not fit.
        tokens = tokens[:max(self.max_length - 1, 0)]
        tokens.append(self.vocab.eos_token)
        tokens += [self.vocab.pad_token] * (self.max_length - len(tokens))
        return tokens[:self.max_length]

    def __getitem__(self, idx):
        """Return the ``(input_ids, output_ids)`` tensors of sample ``idx``."""
        input_ids = self._encode_fixed(str(self.inputs[idx]))
        output_ids = self._encode_fixed(str(self.outputs[idx]))
        return torch.tensor(input_ids), torch.tensor(output_ids)

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Width of the query/key/value inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query, key, value and output projections, created in this order.
        for name in ("W_q", "W_k", "W_v", "W_o"):
            setattr(self, name, nn.Linear(d_model, d_model))

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend from ``Q`` over ``K``/``V``.

        Positions where ``mask == 0`` receive a score of ``-1e9`` before the
        softmax over the last dimension.
        """
        scale = math.sqrt(self.d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, d_k)``."""
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        batch, _, length, _ = x.size()
        merged = x.permute(0, 2, 1, 3).contiguous()
        return merged.view(batch, length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head and project the merged result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden layer.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and project back to ``d_model``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is stored as the buffer ``pe`` with shape
    ``(1, max_seq_length, d_model)``.

    Args:
        d_model: Embedding width; both even and odd sizes are supported.
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_seq_length, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each sub-layer output passes through dropout, is added to its input and
    the sum is layer-normalised.

    Args:
        d_model: Feature width of the layer.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is handed to self-attention."""
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer output passes through dropout, is added to its input and
    the sum is layer-normalised.

    Args:
        d_model: Feature width of the layer.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` restricts the self-attention over ``x``; ``src_mask``
        restricts the cross-attention over ``enc_output``.
        """
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence token prediction.

    Args:
        src_vocab_size: Number of source token ids.
        tgt_vocab_size: Number of target token ids (size of the output logits).
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_length: Longest sequence the positional encoding supports.
        dropout: Dropout probability.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the attention masks for a batch of token-id tensors.

        Token id 0 is treated as padding.

        Returns:
            ``(src_mask, tgt_mask)`` where ``src_mask`` has shape
            ``(batch, 1, 1, src_len)`` and hides source padding, and
            ``tgt_mask`` has shape ``(batch, 1, tgt_len, tgt_len)`` and hides
            both target padding and future positions.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        # Create the causal mask on the same device as the target batch rather
        # than on a module-level global, so the model works wherever its
        # inputs live.
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``.

        Args:
            src: Source token ids, ``(batch, src_len)``.
            tgt: Decoder input token ids, ``(batch, tgt_len)``.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``train_loader``.

    The decoder is fed the target without its last token and is scored
    against the target without its first token. Gradients are clipped to a
    norm of 1.0 before each optimizer step.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for src, tgt in tqdm(train_loader, desc="Training"):
        src = src.to(device)
        tgt = tgt.to(device)
        decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        # Flatten to (tokens, vocab) vs. (tokens,) for the criterion.
        loss = criterion(logits.reshape(-1, logits.size(-1)), expected.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Compute the mean per-batch loss of ``model`` over ``val_loader``.

    Runs in eval mode with gradients disabled. The decoder is fed the target
    without its last token and is scored against the target without its
    first token.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc="Evaluating"):
            src = src.to(device)
            tgt = tgt.to(device)
            decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

            logits = model(src, decoder_input)
            batch_loss = criterion(logits.reshape(-1, logits.size(-1)), expected.reshape(-1))
            running_loss += batch_loss.item()
    return running_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return the token-level accuracy of ``model`` over ``data_loader``.

    The model is fed the target shifted by one position and its arg-max
    predictions are compared with the expected tokens; positions whose
    expected token is ``vocab.pad_token`` are ignored.

    Returns:
        ``correct / total`` over non-padding positions, or ``0`` when there
        are none.
    """
    model.eval()
    n_correct = 0
    n_tokens = 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)
            expected = tgt[:, 1:]
            predicted = model(src, tgt[:, :-1]).argmax(dim=-1)

            is_real = expected != vocab.pad_token
            n_correct += (predicted.eq(expected) & is_real).sum().item()
            n_tokens += is_real.sum().item()

    if n_tokens > 0:
        return n_correct / n_tokens
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train ``model`` with early stopping and keep its best weights.

    After every epoch the validation loss is passed to ``scheduler.step`` and
    both losses are printed. Training stops early once the validation loss
    has failed to improve for ``patience`` consecutive epochs.

    On return the model holds the weights of the epoch with the lowest
    validation loss, so that the returned loss describes the model the caller
    goes on to use or save (not merely the last epoch's weights).

    Args:
        model: Network to train (modified in place).
        train_loader: Batches used for optimisation.
        val_loader: Batches used for validation.
        optimizer: Optimizer updating ``model``.
        criterion: Loss function.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.

    Returns:
        The lowest validation loss observed (``inf`` if no epoch ran).
    """
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot the weights; state_dict() alone returns live references
            # that later optimizer steps would overwrite.
            best_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered!")
                break

    # Roll back to the best epoch so the model matches the returned loss.
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss  # Return the best validation loss from this training run

# Global variables to track best model across all trials.
# They are updated by objective() whenever a trial beats the best loss so far.
# Deep-copied state dict of the best model (None until a trial has finished).
best_overall_model = None
# Lowest validation loss returned by any trial so far.
best_overall_loss = float('inf')
# Hyperparameter dict of the trial that produced the best loss.
best_config = None

# Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one Transformer configuration.

    Samples the hyperparameters from ``trial``, trains a fresh model for up
    to 10 epochs and returns the validation loss reported by ``train_model``.
    When that loss beats the best seen so far, the module-level
    ``best_overall_*`` / ``best_config`` globals are updated and the model's
    state dict is saved to disk.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``vocab``,
    ``max_length`` and ``device``.
    """
    global best_overall_model, best_overall_loss, best_config

    # Search space (the sampling order is kept stable).
    config = {
        "d_model": trial.suggest_categorical("d_model", [128, 256, 512]),
        "num_heads": trial.suggest_categorical("num_heads", [2, 4, 8, 16]),
        "num_layers": trial.suggest_categorical("num_layers", [6, 8, 10, 12]),
        "d_ff": trial.suggest_categorical("d_ff", [256, 512, 1024]),
        "dropout": trial.suggest_float("dropout", 0.1, 0.4),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32]),
    }

    # Data loaders for the sampled batch size.
    batch_size = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

    # Fresh model for this trial.
    vocab_size = len(vocab)
    architecture = {key: config[key] for key in ("d_model", "num_heads", "num_layers", "d_ff", "dropout")}
    model = Transformer(
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        max_seq_length=max_length,
        **architecture,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    current_val_loss = train_model(
        model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10
    )

    # Nothing to record unless this trial beats the best loss so far.
    if not (current_val_loss < best_overall_loss):
        return current_val_loss

    best_overall_loss = current_val_loss
    best_overall_model = copy.deepcopy(model.state_dict())
    best_config = config
    torch.save(best_overall_model, '/content/drive/MyDrive/best_mono_key_100.pth')
    print(f"New best model found! Val Loss: {current_val_loss:.4f}")
    print(f"Config: {config}")

    return current_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for ``text``.

    The text is framed with SOS/EOS, padded or cut to ``max_length`` and fed
    to the model together with a target that starts as a lone SOS token. The
    arg-max token of the last position is appended at each step until EOS is
    predicted or ``max_length - 1`` steps have run.

    Returns:
        The generated tokens decoded with ``vocab.decode``.
    """
    model.eval()
    with torch.no_grad():
        source_ids = [vocab.sos_token, *vocab.encode(str(text)), vocab.eos_token]
        source_ids.extend([vocab.pad_token] * (max_length - len(source_ids)))
        source = torch.tensor(source_ids[:max_length]).unsqueeze(0).to(device)

        generated = torch.tensor([[vocab.sos_token]]).to(device)

        for _ in range(max_length - 1):
            logits = model(source, generated)
            token_id = logits.argmax(2)[:, -1].item()
            if token_id == vocab.eos_token:
                break
            step = torch.tensor([[token_id]]).to(device)
            generated = torch.cat([generated, step], dim=1)

        return vocab.decode(generated[0].cpu().numpy())

# Main Execution
if __name__ == "__main__":
    # Script entry point: load the data, run the Optuna search, reload the
    # best checkpoint, report loss/accuracy and try a few sample decryptions.
    # NOTE: `vocab`, `max_length`, `train_dataset` and `val_dataset` are read
    # as module-level globals by objective(), so their names must not change.

    # Load and prepare data
    inputs, outputs = load_data('/content/Full_training_mono_100.xlsx')
    vocab = Vocabulary()
    max_length = 256  # Adjusted for longer sentences

    # Split data (80% train / 20% validation, fixed seed)
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)  # 50 trials, no time limit

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Rebuild the best architecture found during the search
    final_model = Transformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    # map_location lets a checkpoint written on the GPU be restored on
    # whichever device was selected for this run (e.g. CPU-only sessions).
    final_model.load_state_dict(
        torch.load('/content/drive/MyDrive/best_mono_key_100.pth', map_location=device)
    )

    # Evaluate on full datasets
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption on a few hand-written (prompt, expected plaintext) pairs
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher:xibu", "what")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cuda


[I 2025-05-09 19:52:40,792] A new study created in memory with name: no-name-07a175f3-597f-4492-9f4c-8f7fbde349b9
Training: 100%|██████████| 250/250 [00:45<00:00,  5.46it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.25it/s]


Epoch 1/10:
Train Loss: 2.4772 | Val Loss: 2.0375


Training: 100%|██████████| 250/250 [00:44<00:00,  5.63it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.17it/s]


Epoch 2/10:
Train Loss: 1.9566 | Val Loss: 1.8318


Training: 100%|██████████| 250/250 [00:44<00:00,  5.63it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.24it/s]


Epoch 3/10:
Train Loss: 1.7931 | Val Loss: 1.7225


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.23it/s]


Epoch 4/10:
Train Loss: 1.6940 | Val Loss: 1.6686


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.24it/s]


Epoch 5/10:
Train Loss: 1.6190 | Val Loss: 1.6483


Training: 100%|██████████| 250/250 [00:44<00:00,  5.63it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.23it/s]


Epoch 6/10:
Train Loss: 1.5586 | Val Loss: 1.5714


Training: 100%|██████████| 250/250 [00:44<00:00,  5.63it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.23it/s]


Epoch 7/10:
Train Loss: 1.5075 | Val Loss: 1.4907


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.26it/s]


Epoch 8/10:
Train Loss: 1.4600 | Val Loss: 1.4987


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.27it/s]


Epoch 9/10:
Train Loss: 1.4213 | Val Loss: 1.4105


Training: 100%|██████████| 250/250 [00:44<00:00,  5.63it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 10/10:
Train Loss: 1.3810 | Val Loss: 1.4024


[I 2025-05-09 20:00:52,866] Trial 0 finished with value: 1.4023722712955777 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.3656279943630909, 'learning_rate': 0.0003096108616676357, 'batch_size': 32}. Best is trial 0 with value: 1.4023722712955777.


New best model found! Val Loss: 1.4024
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.3656279943630909, 'learning_rate': 0.0003096108616676357, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:26<00:00,  9.56it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.89it/s]


Epoch 1/10:
Train Loss: 3.0120 | Val Loss: 3.0165


Training: 100%|██████████| 250/250 [00:26<00:00,  9.52it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.98it/s]


Epoch 2/10:
Train Loss: 2.9582 | Val Loss: 3.4401


Training: 100%|██████████| 250/250 [00:26<00:00,  9.48it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.90it/s]


Epoch 3/10:
Train Loss: 2.9200 | Val Loss: 3.5491


Training: 100%|██████████| 250/250 [00:26<00:00,  9.54it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.81it/s]
[I 2025-05-09 20:02:47,705] Trial 1 finished with value: 3.0165125264061823 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.19041916262694247, 'learning_rate': 0.0015249725221591394, 'batch_size': 32}. Best is trial 0 with value: 1.4023722712955777.


Epoch 4/10:
Train Loss: 2.9017 | Val Loss: 3.6546
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:55<00:00,  4.54it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.28it/s]


Epoch 1/10:
Train Loss: 3.0502 | Val Loss: 2.9864


Training: 100%|██████████| 250/250 [00:55<00:00,  4.54it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.24it/s]


Epoch 2/10:
Train Loss: 2.9832 | Val Loss: 2.9808


Training: 100%|██████████| 250/250 [00:55<00:00,  4.54it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.27it/s]


Epoch 3/10:
Train Loss: 2.9801 | Val Loss: 2.9785


Training: 100%|██████████| 250/250 [00:55<00:00,  4.54it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.26it/s]


Epoch 4/10:
Train Loss: 2.9776 | Val Loss: 2.9741


Training: 100%|██████████| 250/250 [00:55<00:00,  4.52it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.27it/s]


Epoch 5/10:
Train Loss: 2.9780 | Val Loss: 2.9751


Training: 100%|██████████| 250/250 [00:55<00:00,  4.53it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.24it/s]


Epoch 6/10:
Train Loss: 2.9769 | Val Loss: 2.9759


Training: 100%|██████████| 250/250 [00:55<00:00,  4.53it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.23it/s]
[I 2025-05-09 20:09:47,271] Trial 2 finished with value: 2.9741036551339284 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.2906683845461831, 'learning_rate': 0.0018997374548927, 'batch_size': 32}. Best is trial 0 with value: 1.4023722712955777.


Epoch 7/10:
Train Loss: 2.9750 | Val Loss: 3.0459
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:14<00:00, 17.10it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.86it/s]


Epoch 1/10:
Train Loss: 2.3482 | Val Loss: 2.0600


Training: 100%|██████████| 250/250 [00:14<00:00, 17.17it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.29it/s]


Epoch 2/10:
Train Loss: 1.9853 | Val Loss: 1.8709


Training: 100%|██████████| 250/250 [00:14<00:00, 17.20it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.38it/s]


Epoch 3/10:
Train Loss: 1.8260 | Val Loss: 1.7353


Training: 100%|██████████| 250/250 [00:14<00:00, 17.18it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.12it/s]


Epoch 4/10:
Train Loss: 1.7105 | Val Loss: 1.6349


Training: 100%|██████████| 250/250 [00:14<00:00, 17.18it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.18it/s]


Epoch 5/10:
Train Loss: 1.6116 | Val Loss: 1.5393


Training: 100%|██████████| 250/250 [00:14<00:00, 17.22it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.20it/s]


Epoch 6/10:
Train Loss: 1.5369 | Val Loss: 1.5039


Training: 100%|██████████| 250/250 [00:14<00:00, 17.34it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.91it/s]


Epoch 7/10:
Train Loss: 1.4716 | Val Loss: 1.4520


Training: 100%|██████████| 250/250 [00:14<00:00, 17.36it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.97it/s]


Epoch 8/10:
Train Loss: 1.4130 | Val Loss: 1.3679


Training: 100%|██████████| 250/250 [00:14<00:00, 17.30it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.09it/s]


Epoch 9/10:
Train Loss: 1.3622 | Val Loss: 1.3125


Training: 100%|██████████| 250/250 [00:14<00:00, 17.32it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.36it/s]
[I 2025-05-09 20:12:26,570] Trial 3 finished with value: 1.2743188994271415 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.17064200200205557, 'learning_rate': 0.00047800886583501884, 'batch_size': 32}. Best is trial 3 with value: 1.2743188994271415.


Epoch 10/10:
Train Loss: 1.3106 | Val Loss: 1.2743
New best model found! Val Loss: 1.2743
Config: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.17064200200205557, 'learning_rate': 0.00047800886583501884, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 1/10:
Train Loss: 2.3753 | Val Loss: 2.1425


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.19it/s]


Epoch 2/10:
Train Loss: 2.0367 | Val Loss: 2.0661


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.41it/s]


Epoch 3/10:
Train Loss: 1.8914 | Val Loss: 1.8341


Training: 100%|██████████| 250/250 [00:37<00:00,  6.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.20it/s]


Epoch 4/10:
Train Loss: 1.7886 | Val Loss: 1.7179


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 5/10:
Train Loss: 1.7032 | Val Loss: 1.7156


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.48it/s]


Epoch 6/10:
Train Loss: 1.6315 | Val Loss: 1.5957


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 7/10:
Train Loss: 1.5619 | Val Loss: 1.5306


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 8/10:
Train Loss: 1.5002 | Val Loss: 1.4476


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.25it/s]


Epoch 9/10:
Train Loss: 1.4419 | Val Loss: 1.3847


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.46it/s]
[I 2025-05-09 20:19:12,763] Trial 4 finished with value: 1.327687006148081 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.3717471107049085, 'learning_rate': 0.00012705204622217794, 'batch_size': 32}. Best is trial 3 with value: 1.2743188994271415.


Epoch 10/10:
Train Loss: 1.3889 | Val Loss: 1.3277


Training: 100%|██████████| 250/250 [00:43<00:00,  5.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 1/10:
Train Loss: 2.3405 | Val Loss: 1.9928


Training: 100%|██████████| 250/250 [00:43<00:00,  5.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.62it/s]


Epoch 2/10:
Train Loss: 1.8529 | Val Loss: 1.7237


Training: 100%|██████████| 250/250 [00:43<00:00,  5.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.67it/s]


Epoch 3/10:
Train Loss: 1.6479 | Val Loss: 1.5759


Training: 100%|██████████| 250/250 [00:43<00:00,  5.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 4/10:
Train Loss: 1.5197 | Val Loss: 1.4808


Training: 100%|██████████| 250/250 [00:43<00:00,  5.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.40it/s]


Epoch 5/10:
Train Loss: 1.4226 | Val Loss: 1.4263


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 6/10:
Train Loss: 1.3418 | Val Loss: 1.3576


Training: 100%|██████████| 250/250 [00:43<00:00,  5.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.62it/s]


Epoch 7/10:
Train Loss: 1.2686 | Val Loss: 1.3094


Training: 100%|██████████| 250/250 [00:43<00:00,  5.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.63it/s]


Epoch 8/10:
Train Loss: 1.2039 | Val Loss: 1.2600


Training: 100%|██████████| 250/250 [00:43<00:00,  5.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 9/10:
Train Loss: 1.1357 | Val Loss: 1.2316


Training: 100%|██████████| 250/250 [00:43<00:00,  5.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 10/10:
Train Loss: 1.0674 | Val Loss: 1.1912


[I 2025-05-09 20:27:04,422] Trial 5 finished with value: 1.1912459617569333 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.13294049297188448, 'learning_rate': 0.00043592490673089113, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


New best model found! Val Loss: 1.1912
Config: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.13294049297188448, 'learning_rate': 0.00043592490673089113, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:40<00:00,  6.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.99it/s]


Epoch 1/10:
Train Loss: 3.0117 | Val Loss: 2.9886


Training: 100%|██████████| 250/250 [00:39<00:00,  6.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.05it/s]


Epoch 2/10:
Train Loss: 2.9821 | Val Loss: 2.9860


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.16it/s]


Epoch 3/10:
Train Loss: 2.9792 | Val Loss: 2.9911


Training: 100%|██████████| 250/250 [00:40<00:00,  6.24it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.06it/s]


Epoch 4/10:
Train Loss: 2.9786 | Val Loss: 2.9802


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.12it/s]


Epoch 5/10:
Train Loss: 2.9776 | Val Loss: 2.9771


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.11it/s]


Epoch 6/10:
Train Loss: 2.9760 | Val Loss: 3.0492


Training: 100%|██████████| 250/250 [00:40<00:00,  6.22it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.81it/s]


Epoch 7/10:
Train Loss: 2.9733 | Val Loss: 3.3087


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.05it/s]


Epoch 8/10:
Train Loss: 2.9732 | Val Loss: 2.9749


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.01it/s]


Epoch 9/10:
Train Loss: 2.9771 | Val Loss: 2.9793


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 23.11it/s]
[I 2025-05-09 20:34:12,094] Trial 6 finished with value: 2.974858291565426 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.369305759471918, 'learning_rate': 0.003932120293986045, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 10/10:
Train Loss: 2.9721 | Val Loss: 3.4051


Training: 100%|██████████| 250/250 [00:49<00:00,  5.05it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.27it/s]


Epoch 1/10:
Train Loss: 3.0084 | Val Loss: 2.9789


Training: 100%|██████████| 250/250 [00:49<00:00,  5.05it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.27it/s]


Epoch 2/10:
Train Loss: 2.9457 | Val Loss: 3.4875


Training: 100%|██████████| 250/250 [00:49<00:00,  5.05it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.29it/s]


Epoch 3/10:
Train Loss: 2.9177 | Val Loss: 3.5676


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.29it/s]
[I 2025-05-09 20:37:47,912] Trial 7 finished with value: 2.978905882154192 and parameters: {'d_model': 128, 'num_heads': 16, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.10885825328524819, 'learning_rate': 0.004166458660351209, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 4/10:
Train Loss: 2.9099 | Val Loss: 3.4498
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:30<00:00,  8.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.71it/s]


Epoch 1/10:
Train Loss: 2.4233 | Val Loss: 2.1614


Training: 100%|██████████| 250/250 [00:30<00:00,  8.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.85it/s]


Epoch 2/10:
Train Loss: 2.0931 | Val Loss: 1.9707


Training: 100%|██████████| 250/250 [00:30<00:00,  8.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.94it/s]


Epoch 3/10:
Train Loss: 1.9584 | Val Loss: 1.8474


Training: 100%|██████████| 250/250 [00:30<00:00,  8.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.53it/s]


Epoch 4/10:
Train Loss: 1.8637 | Val Loss: 1.8004


Training: 100%|██████████| 250/250 [00:30<00:00,  8.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.74it/s]


Epoch 5/10:
Train Loss: 1.7847 | Val Loss: 1.7622


Training: 100%|██████████| 250/250 [00:30<00:00,  8.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.71it/s]


Epoch 6/10:
Train Loss: 1.7165 | Val Loss: 1.6503


Training: 100%|██████████| 250/250 [00:30<00:00,  8.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.80it/s]


Epoch 7/10:
Train Loss: 1.6551 | Val Loss: 1.6254


Training: 100%|██████████| 250/250 [00:30<00:00,  8.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.23it/s]


Epoch 8/10:
Train Loss: 1.6047 | Val Loss: 1.6248


Training: 100%|██████████| 250/250 [00:30<00:00,  8.26it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.72it/s]


Epoch 9/10:
Train Loss: 1.5588 | Val Loss: 1.5090


Training: 100%|██████████| 250/250 [00:30<00:00,  8.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.74it/s]
[I 2025-05-09 20:43:12,037] Trial 8 finished with value: 1.5090469216543532 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.3541589987963768, 'learning_rate': 0.00019736896260089134, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 10/10:
Train Loss: 1.5166 | Val Loss: 1.5350


Training: 100%|██████████| 250/250 [00:23<00:00, 10.87it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.66it/s]


Epoch 1/10:
Train Loss: 2.3598 | Val Loss: 2.0928


Training: 100%|██████████| 250/250 [00:22<00:00, 10.87it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.66it/s]


Epoch 2/10:
Train Loss: 2.0064 | Val Loss: 1.8690


Training: 100%|██████████| 250/250 [00:22<00:00, 10.88it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.65it/s]


Epoch 3/10:
Train Loss: 1.8414 | Val Loss: 1.7395


Training: 100%|██████████| 250/250 [00:22<00:00, 10.88it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.66it/s]


Epoch 4/10:
Train Loss: 1.7250 | Val Loss: 1.6288


Training: 100%|██████████| 250/250 [00:23<00:00, 10.86it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.82it/s]


Epoch 5/10:
Train Loss: 1.6324 | Val Loss: 1.5524


Training: 100%|██████████| 250/250 [00:23<00:00, 10.86it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.97it/s]


Epoch 6/10:
Train Loss: 1.5538 | Val Loss: 1.4777


Training: 100%|██████████| 250/250 [00:23<00:00, 10.86it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.69it/s]


Epoch 7/10:
Train Loss: 1.4876 | Val Loss: 1.4264


Training: 100%|██████████| 250/250 [00:23<00:00, 10.85it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.77it/s]


Epoch 8/10:
Train Loss: 1.4336 | Val Loss: 1.3635


Training: 100%|██████████| 250/250 [00:23<00:00, 10.86it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.55it/s]


Epoch 9/10:
Train Loss: 1.3812 | Val Loss: 1.3108


Training: 100%|██████████| 250/250 [00:23<00:00, 10.86it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.80it/s]
[I 2025-05-09 20:47:19,457] Trial 9 finished with value: 1.2728162795778304 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.20611464986385855, 'learning_rate': 0.00014734911868338833, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 10/10:
Train Loss: 1.3339 | Val Loss: 1.2728


Training: 100%|██████████| 250/250 [00:59<00:00,  4.23it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.96it/s]


Epoch 1/10:
Train Loss: 3.0136 | Val Loss: 2.9822


Training: 100%|██████████| 250/250 [00:59<00:00,  4.23it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.97it/s]


Epoch 2/10:
Train Loss: 2.9809 | Val Loss: 3.0713


Training: 100%|██████████| 250/250 [00:59<00:00,  4.22it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.98it/s]


Epoch 3/10:
Train Loss: 2.9483 | Val Loss: 3.5078


Training: 100%|██████████| 250/250 [00:59<00:00,  4.22it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.96it/s]
[I 2025-05-09 20:51:34,211] Trial 10 finished with value: 2.9821545320843894 and parameters: {'d_model': 256, 'num_heads': 16, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.10098980758922844, 'learning_rate': 0.0006883947002608255, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 4/10:
Train Loss: 2.9167 | Val Loss: 3.9156
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:23<00:00, 10.83it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.71it/s]


Epoch 1/10:
Train Loss: 2.4141 | Val Loss: 2.1288


Training: 100%|██████████| 250/250 [00:23<00:00, 10.84it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.25it/s]


Epoch 2/10:
Train Loss: 2.0616 | Val Loss: 1.9556


Training: 100%|██████████| 250/250 [00:23<00:00, 10.83it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.40it/s]


Epoch 3/10:
Train Loss: 1.9136 | Val Loss: 1.8229


Training: 100%|██████████| 250/250 [00:23<00:00, 10.86it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.75it/s]


Epoch 4/10:
Train Loss: 1.8002 | Val Loss: 1.6960


Training: 100%|██████████| 250/250 [00:23<00:00, 10.86it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.98it/s]


Epoch 5/10:
Train Loss: 1.7144 | Val Loss: 1.6348


Training: 100%|██████████| 250/250 [00:23<00:00, 10.85it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.43it/s]


Epoch 6/10:
Train Loss: 1.6423 | Val Loss: 1.5484


Training: 100%|██████████| 250/250 [00:23<00:00, 10.87it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.79it/s]


Epoch 7/10:
Train Loss: 1.5810 | Val Loss: 1.5124


Training: 100%|██████████| 250/250 [00:22<00:00, 10.87it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.89it/s]


Epoch 8/10:
Train Loss: 1.5263 | Val Loss: 1.4371


Training: 100%|██████████| 250/250 [00:22<00:00, 10.89it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 36.83it/s]


Epoch 9/10:
Train Loss: 1.4780 | Val Loss: 1.4130


Training: 100%|██████████| 250/250 [00:22<00:00, 10.89it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 37.05it/s]
[I 2025-05-09 20:55:41,770] Trial 11 finished with value: 1.3643299397968112 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.20810667441301678, 'learning_rate': 0.00011298604757599125, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 10/10:
Train Loss: 1.4280 | Val Loss: 1.3643


Training: 100%|██████████| 250/250 [00:29<00:00,  8.47it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.56it/s]


Epoch 1/10:
Train Loss: 2.3196 | Val Loss: 2.0795


Training: 100%|██████████| 250/250 [00:29<00:00,  8.48it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.21it/s]


Epoch 2/10:
Train Loss: 1.9683 | Val Loss: 1.8483


Training: 100%|██████████| 250/250 [00:29<00:00,  8.48it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.80it/s]


Epoch 3/10:
Train Loss: 1.7986 | Val Loss: 1.6843


Training: 100%|██████████| 250/250 [00:29<00:00,  8.48it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.89it/s]


Epoch 4/10:
Train Loss: 1.6764 | Val Loss: 1.5911


Training: 100%|██████████| 250/250 [00:29<00:00,  8.49it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.79it/s]


Epoch 5/10:
Train Loss: 1.5790 | Val Loss: 1.5145


Training: 100%|██████████| 250/250 [00:29<00:00,  8.47it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.40it/s]


Epoch 6/10:
Train Loss: 1.4990 | Val Loss: 1.4591


Training: 100%|██████████| 250/250 [00:29<00:00,  8.49it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.86it/s]


Epoch 7/10:
Train Loss: 1.4315 | Val Loss: 1.3926


Training: 100%|██████████| 250/250 [00:29<00:00,  8.47it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.74it/s]


Epoch 8/10:
Train Loss: 1.3731 | Val Loss: 1.3637


Training: 100%|██████████| 250/250 [00:29<00:00,  8.46it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.72it/s]


Epoch 9/10:
Train Loss: 1.3193 | Val Loss: 1.3124


Training: 100%|██████████| 250/250 [00:29<00:00,  8.47it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.80it/s]
[I 2025-05-09 21:00:59,556] Trial 12 finished with value: 1.264653412122575 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.2548848944992764, 'learning_rate': 0.00030022057077537745, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 10/10:
Train Loss: 1.2706 | Val Loss: 1.2647


Training: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.58it/s]


Epoch 1/10:
Train Loss: 3.0219 | Val Loss: 2.9811


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.66it/s]


Epoch 2/10:
Train Loss: 2.5129 | Val Loss: 2.2762


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.57it/s]


Epoch 3/10:
Train Loss: 2.1680 | Val Loss: 2.0707


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.49it/s]


Epoch 4/10:
Train Loss: 1.9968 | Val Loss: 2.1085


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.70it/s]


Epoch 5/10:
Train Loss: 1.8620 | Val Loss: 2.5726


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.60it/s]
[I 2025-05-09 21:06:08,882] Trial 13 finished with value: 2.070655200216505 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.277652410187455, 'learning_rate': 0.0004357901875439269, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 6/10:
Train Loss: 1.7501 | Val Loss: 3.7286
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:27<00:00,  9.22it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.62it/s]


Epoch 1/10:
Train Loss: 2.5336 | Val Loss: 2.0637


Training: 100%|██████████| 250/250 [00:27<00:00,  9.20it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.47it/s]


Epoch 2/10:
Train Loss: 1.9947 | Val Loss: 1.8707


Training: 100%|██████████| 250/250 [00:27<00:00,  9.19it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.54it/s]


Epoch 3/10:
Train Loss: 1.8471 | Val Loss: 1.8619


Training: 100%|██████████| 250/250 [00:27<00:00,  9.20it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.51it/s]


Epoch 4/10:
Train Loss: 1.7422 | Val Loss: 2.6957


Training: 100%|██████████| 250/250 [00:27<00:00,  9.21it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.67it/s]


Epoch 5/10:
Train Loss: 1.6749 | Val Loss: 2.7651


Training: 100%|██████████| 250/250 [00:27<00:00,  9.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.51it/s]
[I 2025-05-09 21:09:04,340] Trial 14 finished with value: 1.8618928061591253 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.2529709930110266, 'learning_rate': 0.0008652967026576661, 'batch_size': 32}. Best is trial 5 with value: 1.1912459617569333.


Epoch 6/10:
Train Loss: 1.6057 | Val Loss: 3.7248
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.67it/s]


Epoch 1/10:
Train Loss: 2.3315 | Val Loss: 1.9880


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.63it/s]


Epoch 2/10:
Train Loss: 1.8927 | Val Loss: 1.7731


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.70it/s]


Epoch 3/10:
Train Loss: 1.7067 | Val Loss: 1.6324


Training: 100%|██████████| 250/250 [00:47<00:00,  5.23it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.71it/s]


Epoch 4/10:
Train Loss: 1.5828 | Val Loss: 1.5338


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.70it/s]


Epoch 5/10:
Train Loss: 1.4882 | Val Loss: 1.4626


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.66it/s]


Epoch 6/10:
Train Loss: 1.3973 | Val Loss: 1.3970


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.71it/s]


Epoch 7/10:
Train Loss: 1.3114 | Val Loss: 1.3438


Training: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.71it/s]


Epoch 8/10:
Train Loss: 1.2309 | Val Loss: 1.2802


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.69it/s]


Epoch 9/10:
Train Loss: 1.1577 | Val Loss: 1.2193


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.62it/s]


Epoch 10/10:
Train Loss: 1.0917 | Val Loss: 1.1693


[I 2025-05-09 21:17:39,784] Trial 15 finished with value: 1.1693088421745905 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.14215516083219887, 'learning_rate': 0.0002586360837268297, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


New best model found! Val Loss: 1.1693
Config: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.14215516083219887, 'learning_rate': 0.0002586360837268297, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.35it/s]


Epoch 1/10:
Train Loss: 3.0196 | Val Loss: 2.9818


Training: 100%|██████████| 250/250 [00:43<00:00,  5.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.38it/s]


Epoch 2/10:
Train Loss: 2.9801 | Val Loss: 2.9840


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 3/10:
Train Loss: 2.9788 | Val Loss: 2.9760


Training: 100%|██████████| 250/250 [00:44<00:00,  5.66it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 4/10:
Train Loss: 2.9783 | Val Loss: 2.9738


Training: 100%|██████████| 250/250 [00:44<00:00,  5.66it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.45it/s]


Epoch 5/10:
Train Loss: 2.9778 | Val Loss: 2.9727


Training: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.36it/s]


Epoch 6/10:
Train Loss: 2.9772 | Val Loss: 2.9788


Training: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 7/10:
Train Loss: 2.9743 | Val Loss: 3.0405


Training: 100%|██████████| 250/250 [00:44<00:00,  5.66it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.46it/s]


Epoch 8/10:
Train Loss: 2.9750 | Val Loss: 2.9725


Training: 100%|██████████| 250/250 [00:44<00:00,  5.66it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.43it/s]


Epoch 9/10:
Train Loss: 2.9735 | Val Loss: 3.0882


Training: 100%|██████████| 250/250 [00:44<00:00,  5.66it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]
[I 2025-05-09 21:25:33,555] Trial 16 finished with value: 2.972468436710418 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.15095170293714827, 'learning_rate': 0.009144864773512581, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 2.9714 | Val Loss: 3.0099


Training: 100%|██████████| 250/250 [00:48<00:00,  5.20it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Epoch 1/10:
Train Loss: 2.3530 | Val Loss: 2.0241


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.64it/s]


Epoch 2/10:
Train Loss: 1.9048 | Val Loss: 1.7693


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.67it/s]


Epoch 3/10:
Train Loss: 1.7118 | Val Loss: 1.6491


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.73it/s]


Epoch 4/10:
Train Loss: 1.5789 | Val Loss: 1.5463


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Epoch 5/10:
Train Loss: 1.4785 | Val Loss: 1.4689


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Epoch 6/10:
Train Loss: 1.3955 | Val Loss: 1.4054


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Epoch 7/10:
Train Loss: 1.3142 | Val Loss: 1.3644


Training: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Epoch 8/10:
Train Loss: 1.2410 | Val Loss: 1.2868


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.60it/s]


Epoch 9/10:
Train Loss: 1.1646 | Val Loss: 1.2276


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.57it/s]
[I 2025-05-09 21:34:08,798] Trial 17 finished with value: 1.176322375025068 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.14077057493106043, 'learning_rate': 0.0002615784696261575, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.0992 | Val Loss: 1.1763


Training: 100%|██████████| 250/250 [00:23<00:00, 10.56it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.40it/s]


Epoch 1/10:
Train Loss: 2.4597 | Val Loss: 2.1283


Training: 100%|██████████| 250/250 [00:23<00:00, 10.69it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.37it/s]


Epoch 2/10:
Train Loss: 2.0677 | Val Loss: 1.9529


Training: 100%|██████████| 250/250 [00:23<00:00, 10.61it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.33it/s]


Epoch 3/10:
Train Loss: 1.9300 | Val Loss: 1.8421


Training: 100%|██████████| 250/250 [00:23<00:00, 10.61it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.72it/s]


Epoch 4/10:
Train Loss: 1.8167 | Val Loss: 1.7555


Training: 100%|██████████| 250/250 [00:23<00:00, 10.83it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.31it/s]


Epoch 5/10:
Train Loss: 1.7192 | Val Loss: 1.6466


Training: 100%|██████████| 250/250 [00:23<00:00, 10.82it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.14it/s]


Epoch 6/10:
Train Loss: 1.6320 | Val Loss: 1.5716


Training: 100%|██████████| 250/250 [00:23<00:00, 10.71it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.50it/s]


Epoch 7/10:
Train Loss: 1.5625 | Val Loss: 1.5288


Training: 100%|██████████| 250/250 [00:23<00:00, 10.85it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.69it/s]


Epoch 8/10:
Train Loss: 1.5029 | Val Loss: 1.4640


Training: 100%|██████████| 250/250 [00:22<00:00, 10.88it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.56it/s]


Epoch 9/10:
Train Loss: 1.4457 | Val Loss: 1.4202


Training: 100%|██████████| 250/250 [00:22<00:00, 10.89it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.67it/s]
[I 2025-05-09 21:38:23,078] Trial 18 finished with value: 1.3931253542975774 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.1427189666508688, 'learning_rate': 0.0002185759924614362, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.3981 | Val Loss: 1.3931


Training: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.05it/s]


Epoch 1/10:
Train Loss: 3.0241 | Val Loss: 2.9802


Training: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.05it/s]


Epoch 2/10:
Train Loss: 2.9824 | Val Loss: 2.9799


Training: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.07it/s]


Epoch 3/10:
Train Loss: 2.9796 | Val Loss: 2.9762


Training: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.07it/s]


Epoch 4/10:
Train Loss: 2.9782 | Val Loss: 2.9753


Training: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.06it/s]


Epoch 5/10:
Train Loss: 2.9761 | Val Loss: 3.6838


Training: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.06it/s]


Epoch 6/10:
Train Loss: 2.9712 | Val Loss: 3.4772


Training: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]
Evaluating: 100%|██████████| 63/63 [00:06<00:00,  9.07it/s]
[I 2025-05-09 21:48:42,185] Trial 19 finished with value: 2.975288712789142 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.22093607183930714, 'learning_rate': 0.0013765620074579117, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 7/10:
Train Loss: 2.9683 | Val Loss: 3.5157
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.96it/s]


Epoch 1/10:
Train Loss: 2.5150 | Val Loss: 2.1607


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.93it/s]


Epoch 2/10:
Train Loss: 2.0847 | Val Loss: 1.9487


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.97it/s]


Epoch 3/10:
Train Loss: 1.9328 | Val Loss: 1.8415


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.96it/s]


Epoch 4/10:
Train Loss: 1.8205 | Val Loss: 1.7500


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.97it/s]


Epoch 5/10:
Train Loss: 1.7325 | Val Loss: 1.6883


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.91it/s]


Epoch 6/10:
Train Loss: 1.6620 | Val Loss: 1.6180


Training: 100%|██████████| 250/250 [00:57<00:00,  4.37it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.99it/s]


Epoch 7/10:
Train Loss: 1.6004 | Val Loss: 1.5668


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.98it/s]


Epoch 8/10:
Train Loss: 1.5430 | Val Loss: 1.5222


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.93it/s]


Epoch 9/10:
Train Loss: 1.4947 | Val Loss: 1.4795


Training: 100%|██████████| 250/250 [00:57<00:00,  4.38it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 14.97it/s]
[I 2025-05-09 21:58:55,564] Trial 20 finished with value: 1.4443430162611461 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.3301032339706264, 'learning_rate': 0.00023267414288829057, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.4510 | Val Loss: 1.4443


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.30it/s]


Epoch 1/10:
Train Loss: 2.6840 | Val Loss: 2.1254


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.43it/s]


Epoch 2/10:
Train Loss: 1.9768 | Val Loss: 1.8403


Training: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.45it/s]


Epoch 3/10:
Train Loss: 1.7626 | Val Loss: 1.6846


Training: 100%|██████████| 250/250 [00:43<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.45it/s]


Epoch 4/10:
Train Loss: 1.6270 | Val Loss: 1.5856


Training: 100%|██████████| 250/250 [00:43<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.41it/s]


Epoch 5/10:
Train Loss: 1.5486 | Val Loss: 1.5330


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.36it/s]


Epoch 6/10:
Train Loss: 1.4610 | Val Loss: 1.5107


Training: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.41it/s]


Epoch 7/10:
Train Loss: 1.3779 | Val Loss: 1.6160


Training: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.43it/s]


Epoch 8/10:
Train Loss: 1.3016 | Val Loss: 1.9550


Training: 100%|██████████| 250/250 [00:44<00:00,  5.66it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.33it/s]
[I 2025-05-09 22:06:01,499] Trial 21 finished with value: 1.5106652634484428 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.13406114618125584, 'learning_rate': 0.0004697162035697838, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 9/10:
Train Loss: 1.2406 | Val Loss: 2.2480
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.54it/s]


Epoch 1/10:
Train Loss: 3.0156 | Val Loss: 2.9774


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.66it/s]


Epoch 2/10:
Train Loss: 2.9753 | Val Loss: 3.2401


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Epoch 3/10:
Train Loss: 2.9379 | Val Loss: 3.7396


Training: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.63it/s]
[I 2025-05-09 22:09:27,917] Trial 22 finished with value: 2.9773621937585255 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.17167617134294488, 'learning_rate': 0.0006787039671355739, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 4/10:
Train Loss: 2.9143 | Val Loss: 3.8232
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.69it/s]


Epoch 1/10:
Train Loss: 2.3025 | Val Loss: 1.9912


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.70it/s]


Epoch 2/10:
Train Loss: 1.8718 | Val Loss: 1.7581


Training: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.54it/s]


Epoch 3/10:
Train Loss: 1.6878 | Val Loss: 1.6416


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.69it/s]


Epoch 4/10:
Train Loss: 1.5618 | Val Loss: 1.5495


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.58it/s]


Epoch 5/10:
Train Loss: 1.4632 | Val Loss: 1.4551


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.62it/s]


Epoch 6/10:
Train Loss: 1.3778 | Val Loss: 1.4109


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.64it/s]


Epoch 7/10:
Train Loss: 1.3164 | Val Loss: 1.3902


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.68it/s]


Epoch 8/10:
Train Loss: 1.2665 | Val Loss: 1.3399


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.68it/s]


Epoch 9/10:
Train Loss: 1.1896 | Val Loss: 1.3124


Training: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.66it/s]
[I 2025-05-09 22:18:03,290] Trial 23 finished with value: 1.3124182148585244 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.12440203383261297, 'learning_rate': 0.00035122922297391043, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.1189 | Val Loss: 1.3771


Training: 100%|██████████| 250/250 [00:43<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 1/10:
Train Loss: 2.3607 | Val Loss: 2.0716


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.20it/s]


Epoch 2/10:
Train Loss: 1.9945 | Val Loss: 1.8990


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.30it/s]


Epoch 3/10:
Train Loss: 1.8370 | Val Loss: 1.7979


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.14it/s]


Epoch 4/10:
Train Loss: 1.7205 | Val Loss: 1.7128


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.43it/s]


Epoch 5/10:
Train Loss: 1.6204 | Val Loss: 1.6119


Training: 100%|██████████| 250/250 [00:44<00:00,  5.66it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.31it/s]


Epoch 6/10:
Train Loss: 1.5314 | Val Loss: 1.4895


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.40it/s]


Epoch 7/10:
Train Loss: 1.4535 | Val Loss: 1.4966


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 8/10:
Train Loss: 1.3919 | Val Loss: 1.3912


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.37it/s]


Epoch 9/10:
Train Loss: 1.3283 | Val Loss: 1.3368


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.46it/s]
[I 2025-05-09 22:25:56,427] Trial 24 finished with value: 1.2928325524405828 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.16420589291897725, 'learning_rate': 0.0001683224534432734, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.2743 | Val Loss: 1.2928


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.64it/s]


Epoch 1/10:
Train Loss: 3.0098 | Val Loss: 2.9780


Training: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.67it/s]


Epoch 2/10:
Train Loss: 2.9766 | Val Loss: 3.0951


Training: 100%|██████████| 250/250 [00:48<00:00,  5.20it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.69it/s]


Epoch 3/10:
Train Loss: 2.9428 | Val Loss: 3.5507


Training: 100%|██████████| 250/250 [00:47<00:00,  5.21it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.68it/s]
[I 2025-05-09 22:29:22,759] Trial 25 finished with value: 2.9780085654485795 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.12390285685816065, 'learning_rate': 0.000632737840928002, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 4/10:
Train Loss: 2.9223 | Val Loss: 3.8999
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.40it/s]


Epoch 1/10:
Train Loss: 2.3506 | Val Loss: 2.0276


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.36it/s]


Epoch 2/10:
Train Loss: 1.9356 | Val Loss: 1.8073


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.42it/s]


Epoch 3/10:
Train Loss: 1.7573 | Val Loss: 1.6867


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.38it/s]


Epoch 4/10:
Train Loss: 1.6407 | Val Loss: 1.5947


Training: 100%|██████████| 250/250 [00:44<00:00,  5.65it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 5/10:
Train Loss: 1.5541 | Val Loss: 1.5340


Training: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 6/10:
Train Loss: 1.4775 | Val Loss: 1.4577


Training: 100%|██████████| 250/250 [00:43<00:00,  5.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.48it/s]


Epoch 7/10:
Train Loss: 1.4090 | Val Loss: 1.4656


Training: 100%|██████████| 250/250 [00:43<00:00,  5.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]


Epoch 8/10:
Train Loss: 1.3483 | Val Loss: 1.3461


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 9/10:
Train Loss: 1.2837 | Val Loss: 1.3427


Training: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.38it/s]
[I 2025-05-09 22:37:15,751] Trial 26 finished with value: 1.2691962510820418 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.19007556168682982, 'learning_rate': 0.00027923207081147216, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.2248 | Val Loss: 1.2692


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 21.00it/s]


Epoch 1/10:
Train Loss: 2.3896 | Val Loss: 2.1260


Training: 100%|██████████| 250/250 [00:41<00:00,  6.06it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.04it/s]


Epoch 2/10:
Train Loss: 2.0302 | Val Loss: 1.9308


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.05it/s]


Epoch 3/10:
Train Loss: 1.8810 | Val Loss: 1.8032


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.02it/s]


Epoch 4/10:
Train Loss: 1.7508 | Val Loss: 1.6592


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.96it/s]


Epoch 5/10:
Train Loss: 1.6467 | Val Loss: 1.5834


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 20.96it/s]


Epoch 6/10:
Train Loss: 1.5628 | Val Loss: 1.5133


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.08it/s]


Epoch 7/10:
Train Loss: 1.4956 | Val Loss: 1.4785


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.01it/s]


Epoch 8/10:
Train Loss: 1.4323 | Val Loss: 1.3957


Training: 100%|██████████| 250/250 [00:41<00:00,  6.04it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.01it/s]


Epoch 9/10:
Train Loss: 1.3767 | Val Loss: 1.3531


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.05it/s]
[I 2025-05-09 22:44:38,057] Trial 27 finished with value: 1.3094240616238306 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.14836288480289989, 'learning_rate': 0.00010454822671060152, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.3264 | Val Loss: 1.3094


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 1/10:
Train Loss: 3.0253 | Val Loss: 2.9838


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.61it/s]


Epoch 2/10:
Train Loss: 2.9834 | Val Loss: 2.9765


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.60it/s]


Epoch 3/10:
Train Loss: 2.9801 | Val Loss: 2.9752


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]


Epoch 4/10:
Train Loss: 2.9747 | Val Loss: 3.0225


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.60it/s]


Epoch 5/10:
Train Loss: 2.9635 | Val Loss: 3.4008


Training: 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]
Evaluating: 100%|██████████| 63/63 [00:08<00:00,  7.62it/s]
[I 2025-05-09 22:55:12,217] Trial 28 finished with value: 2.9751654465993247 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.22987782221180483, 'learning_rate': 0.001119529900835556, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 6/10:
Train Loss: 2.9316 | Val Loss: 3.5313
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.97it/s]


Epoch 1/10:
Train Loss: 2.3790 | Val Loss: 2.0813


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.98it/s]


Epoch 2/10:
Train Loss: 2.0083 | Val Loss: 1.8947


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.81it/s]


Epoch 3/10:
Train Loss: 1.8411 | Val Loss: 1.7422


Training: 100%|██████████| 250/250 [00:22<00:00, 11.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.08it/s]


Epoch 4/10:
Train Loss: 1.7121 | Val Loss: 1.6677


Training: 100%|██████████| 250/250 [00:22<00:00, 11.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.03it/s]


Epoch 5/10:
Train Loss: 1.6071 | Val Loss: 1.5473


Training: 100%|██████████| 250/250 [00:22<00:00, 11.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.89it/s]


Epoch 6/10:
Train Loss: 1.5274 | Val Loss: 1.4762


Training: 100%|██████████| 250/250 [00:22<00:00, 11.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.01it/s]


Epoch 7/10:
Train Loss: 1.4604 | Val Loss: 1.4293


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.92it/s]


Epoch 8/10:
Train Loss: 1.4000 | Val Loss: 1.3715


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.98it/s]


Epoch 9/10:
Train Loss: 1.3434 | Val Loss: 1.3283


Training: 100%|██████████| 250/250 [00:22<00:00, 11.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.13it/s]
[I 2025-05-09 22:59:15,091] Trial 29 finished with value: 1.2780068923556616 and parameters: {'d_model': 128, 'num_heads': 8, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.12008602756590794, 'learning_rate': 0.00034501658797260937, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.2975 | Val Loss: 1.2780


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.24it/s]


Epoch 1/10:
Train Loss: 2.2657 | Val Loss: 1.9273


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 2/10:
Train Loss: 1.8144 | Val Loss: 1.6920


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 3/10:
Train Loss: 1.6354 | Val Loss: 1.5733


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]


Epoch 4/10:
Train Loss: 1.5045 | Val Loss: 1.4554


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 5/10:
Train Loss: 1.3929 | Val Loss: 1.3985


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.17it/s]


Epoch 6/10:
Train Loss: 1.2952 | Val Loss: 1.3245


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 7/10:
Train Loss: 1.2169 | Val Loss: 1.2662


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.16it/s]


Epoch 8/10:
Train Loss: 1.1427 | Val Loss: 1.2126


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.19it/s]


Epoch 9/10:
Train Loss: 1.0736 | Val Loss: 1.2095


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.19it/s]
[I 2025-05-09 23:07:19,999] Trial 30 finished with value: 1.1728587462788536 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.1815806249987251, 'learning_rate': 0.0002407437861791138, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.0108 | Val Loss: 1.1729


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.23it/s]


Epoch 1/10:
Train Loss: 2.3030 | Val Loss: 1.9610


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 2/10:
Train Loss: 1.8123 | Val Loss: 1.7283


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 3/10:
Train Loss: 1.6364 | Val Loss: 1.6014


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.16it/s]


Epoch 4/10:
Train Loss: 1.5194 | Val Loss: 1.4842


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 5/10:
Train Loss: 1.4257 | Val Loss: 1.4482


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 6/10:
Train Loss: 1.3360 | Val Loss: 1.3571


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]


Epoch 7/10:
Train Loss: 1.2502 | Val Loss: 1.3152


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 8/10:
Train Loss: 1.1762 | Val Loss: 1.2615


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]


Epoch 9/10:
Train Loss: 1.1096 | Val Loss: 1.2043


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]
[I 2025-05-09 23:15:24,968] Trial 31 finished with value: 1.192972977956136 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.18390348954627433, 'learning_rate': 0.0002639533437056211, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.0463 | Val Loss: 1.1930


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 1/10:
Train Loss: 2.7529 | Val Loss: 2.1490


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 2/10:
Train Loss: 1.9779 | Val Loss: 1.8347


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 3/10:
Train Loss: 1.7568 | Val Loss: 1.6967


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 4/10:
Train Loss: 1.6251 | Val Loss: 1.5860


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 5/10:
Train Loss: 1.5245 | Val Loss: 1.5212


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.22it/s]


Epoch 6/10:
Train Loss: 1.4402 | Val Loss: 1.4628


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.11it/s]


Epoch 7/10:
Train Loss: 1.3652 | Val Loss: 1.4261


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]


Epoch 8/10:
Train Loss: 1.2987 | Val Loss: 1.4065


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.26it/s]


Epoch 9/10:
Train Loss: 1.2375 | Val Loss: 1.3663


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]
[I 2025-05-09 23:23:29,984] Trial 32 finished with value: 1.3529330927228171 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.15752834573539976, 'learning_rate': 0.0003770944817020234, 'batch_size': 32}. Best is trial 15 with value: 1.1693088421745905.


Epoch 10/10:
Train Loss: 1.1821 | Val Loss: 1.3529


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.22it/s]


Epoch 1/10:
Train Loss: 2.2391 | Val Loss: 1.9413


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.24it/s]


Epoch 2/10:
Train Loss: 1.7821 | Val Loss: 1.6230


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 3/10:
Train Loss: 1.5474 | Val Loss: 1.4947


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.23it/s]


Epoch 4/10:
Train Loss: 1.3633 | Val Loss: 1.2937


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.19it/s]


Epoch 5/10:
Train Loss: 1.2117 | Val Loss: 1.2131


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 6/10:
Train Loss: 1.0956 | Val Loss: 1.1150


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.14it/s]


Epoch 7/10:
Train Loss: 0.9964 | Val Loss: 1.0721


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 8/10:
Train Loss: 0.8965 | Val Loss: 1.0160


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]


Epoch 9/10:
Train Loss: 0.8044 | Val Loss: 0.9639


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 10/10:
Train Loss: 0.7022 | Val Loss: 0.9014


[I 2025-05-09 23:31:35,318] Trial 33 finished with value: 0.9014420206584628 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.13711053703933423, 'learning_rate': 0.00017560217719363183, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


New best model found! Val Loss: 0.9014
Config: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.13711053703933423, 'learning_rate': 0.00017560217719363183, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 1/10:
Train Loss: 2.2554 | Val Loss: 1.9688


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.22it/s]


Epoch 2/10:
Train Loss: 1.8346 | Val Loss: 1.6877


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]


Epoch 3/10:
Train Loss: 1.6217 | Val Loss: 1.5300


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 4/10:
Train Loss: 1.4599 | Val Loss: 1.3784


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]


Epoch 5/10:
Train Loss: 1.3231 | Val Loss: 1.2840


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 6/10:
Train Loss: 1.2089 | Val Loss: 1.1942


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.10it/s]


Epoch 7/10:
Train Loss: 1.1187 | Val Loss: 1.1341


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.16it/s]


Epoch 8/10:
Train Loss: 1.0324 | Val Loss: 1.0873


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 9/10:
Train Loss: 0.9466 | Val Loss: 1.0422


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.21it/s]
[I 2025-05-09 23:39:40,691] Trial 34 finished with value: 0.9562015656440978 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.18584884477888775, 'learning_rate': 0.0001604814004368108, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 0.8565 | Val Loss: 0.9562


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.20it/s]


Epoch 1/10:
Train Loss: 2.2755 | Val Loss: 1.9639


Training: 100%|██████████| 250/250 [00:44<00:00,  5.60it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.15it/s]


Epoch 2/10:
Train Loss: 1.8281 | Val Loss: 1.6866


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.18it/s]


Epoch 3/10:
Train Loss: 1.6163 | Val Loss: 1.5453


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.14it/s]


Epoch 4/10:
Train Loss: 1.4581 | Val Loss: 1.4279


Training: 100%|██████████| 250/250 [00:44<00:00,  5.61it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.23it/s]


Epoch 5/10:
Train Loss: 1.3311 | Val Loss: 1.2913


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.29it/s]


Epoch 6/10:
Train Loss: 1.2143 | Val Loss: 1.2509


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.28it/s]


Epoch 7/10:
Train Loss: 1.1204 | Val Loss: 1.1665


Training: 100%|██████████| 250/250 [00:44<00:00,  5.63it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.28it/s]


Epoch 8/10:
Train Loss: 1.0326 | Val Loss: 1.0911


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.32it/s]


Epoch 9/10:
Train Loss: 0.9525 | Val Loss: 1.0517


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.27it/s]
[I 2025-05-09 23:47:45,159] Trial 35 finished with value: 1.0138395135364835 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.17678084956105455, 'learning_rate': 0.00016004797991995426, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 0.8771 | Val Loss: 1.0138


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.22it/s]


Epoch 1/10:
Train Loss: 2.2812 | Val Loss: 1.9981


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.24it/s]


Epoch 2/10:
Train Loss: 1.8666 | Val Loss: 1.7745


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.29it/s]


Epoch 3/10:
Train Loss: 1.6654 | Val Loss: 1.6081


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.30it/s]


Epoch 4/10:
Train Loss: 1.5253 | Val Loss: 1.4722


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.30it/s]


Epoch 5/10:
Train Loss: 1.4046 | Val Loss: 1.3689


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.25it/s]


Epoch 6/10:
Train Loss: 1.3019 | Val Loss: 1.2809


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.31it/s]


Epoch 7/10:
Train Loss: 1.2189 | Val Loss: 1.2048


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.33it/s]


Epoch 8/10:
Train Loss: 1.1389 | Val Loss: 1.1802


Training: 100%|██████████| 250/250 [00:44<00:00,  5.63it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.34it/s]


Epoch 9/10:
Train Loss: 1.0706 | Val Loss: 1.1103


Training: 100%|██████████| 250/250 [00:44<00:00,  5.62it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.35it/s]
[I 2025-05-09 23:55:48,973] Trial 36 finished with value: 1.0918301287151517 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2157561425634204, 'learning_rate': 0.0001502838218185659, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.0065 | Val Loss: 1.0918


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.59it/s]


Epoch 1/10:
Train Loss: 2.2846 | Val Loss: 2.0045


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.56it/s]


Epoch 2/10:
Train Loss: 1.8856 | Val Loss: 1.7863


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.64it/s]


Epoch 3/10:
Train Loss: 1.6949 | Val Loss: 1.6231


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.55it/s]


Epoch 4/10:
Train Loss: 1.5462 | Val Loss: 1.4676


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.50it/s]


Epoch 5/10:
Train Loss: 1.4279 | Val Loss: 1.3821


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.64it/s]


Epoch 6/10:
Train Loss: 1.3331 | Val Loss: 1.2846


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 7/10:
Train Loss: 1.2557 | Val Loss: 1.2122


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.66it/s]


Epoch 8/10:
Train Loss: 1.1860 | Val Loss: 1.1466


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.62it/s]


Epoch 9/10:
Train Loss: 1.1061 | Val Loss: 1.0792


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.59it/s]
[I 2025-05-10 00:02:33,938] Trial 37 finished with value: 1.0078276026816595 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.22418851382911426, 'learning_rate': 0.00013619888089681968, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.0226 | Val Loss: 1.0078


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 1/10:
Train Loss: 2.3102 | Val Loss: 2.0147


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.67it/s]


Epoch 2/10:
Train Loss: 1.8933 | Val Loss: 1.7944


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.48it/s]


Epoch 3/10:
Train Loss: 1.7010 | Val Loss: 1.6409


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.66it/s]


Epoch 4/10:
Train Loss: 1.5457 | Val Loss: 1.4425


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.58it/s]


Epoch 5/10:
Train Loss: 1.4251 | Val Loss: 1.3464


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 6/10:
Train Loss: 1.3299 | Val Loss: 1.2657


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 7/10:
Train Loss: 1.2577 | Val Loss: 1.2064


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.60it/s]


Epoch 8/10:
Train Loss: 1.1905 | Val Loss: 1.1541


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.60it/s]


Epoch 9/10:
Train Loss: 1.1220 | Val Loss: 1.1080


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.64it/s]
[I 2025-05-10 00:09:18,866] Trial 38 finished with value: 1.0069148663490537 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.27253334982437305, 'learning_rate': 0.00018077810254088286, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.0474 | Val Loss: 1.0069


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 1/10:
Train Loss: 2.3279 | Val Loss: 2.1103


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.65it/s]


Epoch 2/10:
Train Loss: 1.9598 | Val Loss: 1.8750


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.64it/s]


Epoch 3/10:
Train Loss: 1.7963 | Val Loss: 1.7448


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 4/10:
Train Loss: 1.6806 | Val Loss: 1.6699


Training: 100%|██████████| 250/250 [00:37<00:00,  6.72it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]


Epoch 5/10:
Train Loss: 1.5841 | Val Loss: 1.5309


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.58it/s]


Epoch 6/10:
Train Loss: 1.5006 | Val Loss: 1.4266


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]


Epoch 7/10:
Train Loss: 1.4299 | Val Loss: 1.4001


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 8/10:
Train Loss: 1.3687 | Val Loss: 1.3113


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 9/10:
Train Loss: 1.3107 | Val Loss: 1.2877


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]
[I 2025-05-10 00:16:03,930] Trial 39 finished with value: 1.223323560896374 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.30490068613776244, 'learning_rate': 0.00013344959679204214, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.2599 | Val Loss: 1.2233


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.62it/s]


Epoch 1/10:
Train Loss: 3.0395 | Val Loss: 2.9875


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 2/10:
Train Loss: 2.9823 | Val Loss: 2.9801


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.39it/s]


Epoch 3/10:
Train Loss: 2.9796 | Val Loss: 2.9796


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.67it/s]


Epoch 4/10:
Train Loss: 2.9782 | Val Loss: 2.9762


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.59it/s]


Epoch 5/10:
Train Loss: 2.9767 | Val Loss: 2.9736


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.56it/s]


Epoch 6/10:
Train Loss: 2.9755 | Val Loss: 2.9788


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 7/10:
Train Loss: 2.9748 | Val Loss: 3.4781


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.58it/s]
[I 2025-05-10 00:21:28,488] Trial 40 finished with value: 2.9735644620562356 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.27543305658840744, 'learning_rate': 0.0025973753636268806, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 8/10:
Train Loss: 2.9714 | Val Loss: 3.3987
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 1/10:
Train Loss: 2.2908 | Val Loss: 2.0635


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 2/10:
Train Loss: 1.8778 | Val Loss: 1.8626


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.52it/s]


Epoch 3/10:
Train Loss: 1.6935 | Val Loss: 1.6534


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 4/10:
Train Loss: 1.5399 | Val Loss: 1.4341


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]


Epoch 5/10:
Train Loss: 1.4111 | Val Loss: 1.3303


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.56it/s]


Epoch 6/10:
Train Loss: 1.3207 | Val Loss: 1.2744


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 7/10:
Train Loss: 1.2442 | Val Loss: 1.2043


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.64it/s]


Epoch 8/10:
Train Loss: 1.1734 | Val Loss: 1.1385


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 9/10:
Train Loss: 1.0892 | Val Loss: 1.0407


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]
[I 2025-05-10 00:28:14,029] Trial 41 finished with value: 0.9887602991527982 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.238214458945188, 'learning_rate': 0.00015990447674345574, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 0.9979 | Val Loss: 0.9888


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.55it/s]


Epoch 1/10:
Train Loss: 2.2830 | Val Loss: 1.9824


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 2/10:
Train Loss: 1.8573 | Val Loss: 1.7903


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 3/10:
Train Loss: 1.6618 | Val Loss: 1.5659


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.60it/s]


Epoch 4/10:
Train Loss: 1.5053 | Val Loss: 1.4586


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.50it/s]


Epoch 5/10:
Train Loss: 1.3836 | Val Loss: 1.3006


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 6/10:
Train Loss: 1.2852 | Val Loss: 1.2276


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.63it/s]


Epoch 7/10:
Train Loss: 1.2043 | Val Loss: 1.1526


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.42it/s]


Epoch 8/10:
Train Loss: 1.1288 | Val Loss: 1.0881


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.43it/s]


Epoch 9/10:
Train Loss: 1.0430 | Val Loss: 1.0135


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.57it/s]
[I 2025-05-10 00:34:59,586] Trial 42 finished with value: 0.9359555831031193 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2370447727845632, 'learning_rate': 0.0001863065759028075, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 0.9518 | Val Loss: 0.9360


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.50it/s]


Epoch 1/10:
Train Loss: 2.2978 | Val Loss: 2.0316


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.64it/s]


Epoch 2/10:
Train Loss: 1.8788 | Val Loss: 1.8064


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 3/10:
Train Loss: 1.6904 | Val Loss: 1.6117


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.52it/s]


Epoch 4/10:
Train Loss: 1.5529 | Val Loss: 1.5086


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 5/10:
Train Loss: 1.4362 | Val Loss: 1.4163


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 6/10:
Train Loss: 1.3381 | Val Loss: 1.2932


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 7/10:
Train Loss: 1.2577 | Val Loss: 1.2384


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.57it/s]


Epoch 8/10:
Train Loss: 1.1846 | Val Loss: 1.1522


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.42it/s]


Epoch 9/10:
Train Loss: 1.1149 | Val Loss: 1.1027


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.62it/s]
[I 2025-05-10 00:41:45,062] Trial 43 finished with value: 1.0220974295858354 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.24020926786323593, 'learning_rate': 0.00017209915866942378, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.0408 | Val Loss: 1.0221


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.65it/s]


Epoch 1/10:
Train Loss: 2.3782 | Val Loss: 2.1709


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 2/10:
Train Loss: 2.0114 | Val Loss: 1.9498


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.56it/s]


Epoch 3/10:
Train Loss: 1.8527 | Val Loss: 1.7441


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 4/10:
Train Loss: 1.7416 | Val Loss: 1.7458


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.69it/s]


Epoch 5/10:
Train Loss: 1.6499 | Val Loss: 1.5936


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.52it/s]


Epoch 6/10:
Train Loss: 1.5713 | Val Loss: 1.5084


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.52it/s]


Epoch 7/10:
Train Loss: 1.4978 | Val Loss: 1.4692


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.48it/s]


Epoch 8/10:
Train Loss: 1.4349 | Val Loss: 1.3710


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.56it/s]


Epoch 9/10:
Train Loss: 1.3795 | Val Loss: 1.3336


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.58it/s]
[I 2025-05-10 00:48:30,716] Trial 44 finished with value: 1.2821196109529525 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.3922596878402993, 'learning_rate': 0.0001828757828240899, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.3323 | Val Loss: 1.2821


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.48it/s]


Epoch 1/10:
Train Loss: 2.3632 | Val Loss: 2.1199


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 2/10:
Train Loss: 2.0034 | Val Loss: 1.9214


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.62it/s]


Epoch 3/10:
Train Loss: 1.8495 | Val Loss: 1.7850


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.54it/s]


Epoch 4/10:
Train Loss: 1.7320 | Val Loss: 1.6523


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.50it/s]


Epoch 5/10:
Train Loss: 1.6356 | Val Loss: 1.5688


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 6/10:
Train Loss: 1.5496 | Val Loss: 1.4729


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.56it/s]


Epoch 7/10:
Train Loss: 1.4706 | Val Loss: 1.3930


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 8/10:
Train Loss: 1.4060 | Val Loss: 1.3333


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 9/10:
Train Loss: 1.3490 | Val Loss: 1.2882


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]
[I 2025-05-10 00:55:16,224] Trial 45 finished with value: 1.271683276645721 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.3012621466855819, 'learning_rate': 0.00010367359342785526, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.3038 | Val Loss: 1.2717


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 1/10:
Train Loss: 2.3104 | Val Loss: 2.0607


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.63it/s]


Epoch 2/10:
Train Loss: 1.9324 | Val Loss: 1.8354


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 3/10:
Train Loss: 1.7601 | Val Loss: 1.6550


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 4/10:
Train Loss: 1.6294 | Val Loss: 1.5566


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.57it/s]


Epoch 5/10:
Train Loss: 1.5260 | Val Loss: 1.5021


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 6/10:
Train Loss: 1.4372 | Val Loss: 1.3635


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.45it/s]


Epoch 7/10:
Train Loss: 1.3629 | Val Loss: 1.3012


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.55it/s]


Epoch 8/10:
Train Loss: 1.2955 | Val Loss: 1.2388


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.46it/s]


Epoch 9/10:
Train Loss: 1.2313 | Val Loss: 1.1889


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]
[I 2025-05-10 01:02:01,694] Trial 46 finished with value: 1.1446277461354695 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.26885407874373374, 'learning_rate': 0.0001274040597221875, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.1698 | Val Loss: 1.1446


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 1/10:
Train Loss: 2.2803 | Val Loss: 1.9528


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.62it/s]


Epoch 2/10:
Train Loss: 1.8595 | Val Loss: 1.6990


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.61it/s]


Epoch 3/10:
Train Loss: 1.6676 | Val Loss: 1.6583


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.52it/s]


Epoch 4/10:
Train Loss: 1.5387 | Val Loss: 1.5310


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]


Epoch 5/10:
Train Loss: 1.4294 | Val Loss: 1.3848


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]


Epoch 6/10:
Train Loss: 1.3320 | Val Loss: 1.3086


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.52it/s]


Epoch 7/10:
Train Loss: 1.2427 | Val Loss: 1.2182


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.60it/s]


Epoch 8/10:
Train Loss: 1.1661 | Val Loss: 1.1518


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.44it/s]


Epoch 9/10:
Train Loss: 1.0994 | Val Loss: 1.0967


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.56it/s]
[I 2025-05-10 01:08:47,162] Trial 47 finished with value: 1.0387944503436013 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.20045514707549628, 'learning_rate': 0.00019535922185504246, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.0330 | Val Loss: 1.0388


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 1/10:
Train Loss: 2.3001 | Val Loss: 2.0206


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.47it/s]


Epoch 2/10:
Train Loss: 1.9061 | Val Loss: 1.8272


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.46it/s]


Epoch 3/10:
Train Loss: 1.7180 | Val Loss: 1.6938


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.52it/s]


Epoch 4/10:
Train Loss: 1.5933 | Val Loss: 1.5031


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.55it/s]


Epoch 5/10:
Train Loss: 1.4861 | Val Loss: 1.4018


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.57it/s]


Epoch 6/10:
Train Loss: 1.3905 | Val Loss: 1.3427


Training: 100%|██████████| 250/250 [00:37<00:00,  6.71it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.49it/s]


Epoch 7/10:
Train Loss: 1.3081 | Val Loss: 1.2865


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 8/10:
Train Loss: 1.2449 | Val Loss: 1.1908


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.41it/s]


Epoch 9/10:
Train Loss: 1.1814 | Val Loss: 1.1537


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.41it/s]
[I 2025-05-10 01:15:33,016] Trial 48 finished with value: 1.1034058587891715 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2654632783956562, 'learning_rate': 0.0001970408621579362, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.1177 | Val Loss: 1.1034


Training: 100%|██████████| 250/250 [00:37<00:00,  6.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.55it/s]


Epoch 1/10:
Train Loss: 2.3358 | Val Loss: 2.1088


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.50it/s]


Epoch 2/10:
Train Loss: 1.9489 | Val Loss: 1.8582


Training: 100%|██████████| 250/250 [00:37<00:00,  6.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.51it/s]


Epoch 3/10:
Train Loss: 1.7757 | Val Loss: 1.7026


Training: 100%|██████████| 250/250 [00:37<00:00,  6.69it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.43it/s]


Epoch 4/10:
Train Loss: 1.6446 | Val Loss: 1.5687


Training: 100%|██████████| 250/250 [00:37<00:00,  6.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.43it/s]


Epoch 5/10:
Train Loss: 1.5358 | Val Loss: 1.4476


Training: 100%|██████████| 250/250 [00:37<00:00,  6.68it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.38it/s]


Epoch 6/10:
Train Loss: 1.4527 | Val Loss: 1.3940


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 7/10:
Train Loss: 1.3809 | Val Loss: 1.3242


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.53it/s]


Epoch 8/10:
Train Loss: 1.3245 | Val Loss: 1.2772


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.50it/s]


Epoch 9/10:
Train Loss: 1.2658 | Val Loss: 1.2286


Training: 100%|██████████| 250/250 [00:37<00:00,  6.70it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 19.64it/s]
[I 2025-05-10 01:22:19,316] Trial 49 finished with value: 1.191410089296008 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.24283791924154555, 'learning_rate': 0.00010148997037885089, 'batch_size': 32}. Best is trial 33 with value: 0.9014420206584628.


Epoch 10/10:
Train Loss: 1.2151 | Val Loss: 1.1914

Best trial:
  Validation Loss: 0.9014
  Params: 
    d_model: 512
    num_heads: 8
    num_layers: 8
    d_ff: 256
    dropout: 0.13711053703933423
    learning_rate: 0.00017560217719363183
    batch_size: 32


Evaluating: 100%|██████████| 250/250 [00:15<00:00, 16.20it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.28it/s]



Final Evaluation:
Train Loss: 0.4462 | Val Loss: 0.9014
Train Accuracy: 0.8643 | Val Accuracy: 0.7339

Test Decryptions:
Input: 'Please decrypt the following using Caesar cipher: gfbs' | Output: 'PLETS THATERE FOR HERSHELFF THEMSE TOOOK FROM THE TO.' | Expected: 'fear' | ✗
Input: 'Please decrypt the following using Caesar cipher: dpnqvufs' | Output: 'LEONED DOUBTS ARED THRES FOR FOR ME HEIRS FROM THE TOOD.' | Expected: 'computer' | ✗
Input: 'Please decrypt the following using Caesar cipher:xibu' | Output: 'PLETS THATERE FOR HERSELF TO FIRSM THE OUT WORKING.' | Expected: 'what' | ✗
