In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm

# Set random seeds for reproducibility.
# Only PyTorch's RNG and Python's `random` module are seeded here.
torch.manual_seed(42)
random.seed(42)

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
# `device` is a module-level global that later code in this file reads.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data Preparation

def load_data(file_path, max_output_len=200, sample_size=10000, random_state=42):
    """Load input/output text pairs from an Excel sheet.

    The sheet must contain an 'input' column and an 'output' column.

    Args:
        file_path: Path of the Excel file, passed to ``pandas.read_excel``.
        max_output_len: Rows whose 'output' text is longer than this many
            characters are dropped.
        sample_size: Maximum number of rows to keep. When more rows survive
            the length filter, a random sample of this size is drawn.
            ``None`` disables sampling.
        random_state: Seed for the random sample, so the selection is
            repeatable.

    Returns:
        A tuple ``(inputs, outputs)`` of two equally long lists.
    """
    df = pd.read_excel(file_path)

    # Keep only rows whose 'output' text fits within max_output_len characters
    df = df[df['output'].str.len() <= max_output_len]

    # Down-sample to at most sample_size rows
    if sample_size is not None and len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=random_state)

    return df['input'].tolist(), df['output'].tolist()

# Tokenization and Vocabulary
class Vocabulary:
    """Character-level vocabulary over ``string.printable``.

    Ids 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and
    <UNK>; every printable character follows in ``string.printable`` order.
    """

    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        # Ids of the special tokens (they match their position in _build_vocab)
        self.pad_token, self.sos_token, self.eos_token, self.unk_token = 0, 1, 2, 3
        self._build_vocab()

    def _build_vocab(self):
        """Fill the symbol->id table and its inverse."""
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
        symbols.extend(string.printable)

        self.char2idx = dict(zip(symbols, range(len(symbols))))
        self.idx2char = dict(enumerate(symbols))

    def __len__(self):
        """Number of entries (special tokens plus characters)."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of ``text`` to its id (unknown chars -> <UNK> id)."""
        lookup = self.char2idx.get
        fallback = self.unk_token
        return [lookup(symbol, fallback) for symbol in text]

    def decode(self, indices):
        """Turn ids back into text, dropping <PAD>/<SOS>/<EOS>.

        Ids without an entry are rendered as the literal string '<UNK>'.
        """
        skipped = frozenset((self.pad_token, self.sos_token, self.eos_token))
        pieces = []
        for token_id in indices:
            if token_id in skipped:
                continue
            pieces.append(self.idx2char.get(token_id, '<UNK>'))
        return ''.join(pieces)

# Dataset Class
class CipherDataset(data.Dataset):
    """Dataset of (input, output) text pairs encoded as fixed-length id tensors.

    Each text is converted with ``str()``, encoded by ``vocab``, wrapped in the
    SOS/EOS ids, right-padded with the PAD id and finally cut to
    ``max_length`` ids.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs, self.outputs = inputs, outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def _to_fixed_length(self, raw):
        """Encode one text into a tensor of exactly ``max_length`` ids."""
        vocab, limit = self.vocab, self.max_length
        ids = [vocab.sos_token]
        ids += vocab.encode(str(raw))
        ids.append(vocab.eos_token)
        # Cut over-long sequences, then pad the short ones up to the limit
        ids = ids[:limit]
        ids += [vocab.pad_token] * (limit - len(ids))
        return torch.tensor(ids)

    def __getitem__(self, idx):
        source = self._to_fixed_length(self.inputs[idx])
        target = self._to_fixed_length(self.outputs[idx])
        return source, target

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended per head,
    merged again and passed through an output projection.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # width of a single head

        # Projections for queries, keys, values and the merged output
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_k)) V; positions where mask == 0 are suppressed."""
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # A large negative score gives a (near) zero weight after softmax
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """(batch, seq, d_model) -> (batch, heads, seq, d_k)."""
        n_batch, n_steps, _ = x.size()
        per_head = x.view(n_batch, n_steps, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """(batch, heads, seq, d_k) -> (batch, seq, d_model)."""
        n_batch, _, n_steps, _ = x.size()
        merged = x.permute(0, 2, 1, 3).contiguous()
        return merged.view(n_batch, n_steps, self.d_model)

    def forward(self, Q, K, V, mask=None):
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear(d_model, d_ff) -> ReLU -> Linear(d_ff, d_model)."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand to the hidden width
        self.fc2 = nn.Linear(d_ff, d_model)   # project back to the model width
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence of embeddings.

    Even feature columns hold sin(position * freq) and odd columns hold
    cos(position * freq). The table is stored as the non-trainable buffer
    ``pe`` of shape (1, max_seq_length, d_model).

    Args:
        d_model: Width of the embeddings (even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``."""
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Sub-layer 1: self-attention over the sequence itself
        attended = self.self_attn(x, x, x, mask)
        hidden = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward network
        transformed = self.feed_forward(hidden)
        return self.norm2(hidden + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # Sub-layer 1: attention over the target sequence, limited by tgt_mask
        self_context = self.self_attn(x, x, x, tgt_mask)
        state = self.norm1(x + self.dropout(self_context))

        # Sub-layer 2: queries from the decoder, keys/values from the encoder
        memory_context = self.cross_attn(state, enc_output, enc_output, src_mask)
        state = self.norm2(state + self.dropout(memory_context))

        # Sub-layer 3: position-wise feed-forward network
        transformed = self.feed_forward(state)
        return self.norm3(state + self.dropout(transformed))

class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the encoder embedding table.
        tgt_vocab_size: Number of rows in the decoder embedding table and
            width of the output logits.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_length: Number of positions covered by the positional encoding.
        dropout: Dropout probability.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the boolean attention masks for a batch.

        Token id 0 is treated as padding.

        Args:
            src: Source ids of shape (batch, src_len).
            tgt: Target ids of shape (batch, tgt_len).

        Returns:
            ``(src_mask, tgt_mask)``: ``src_mask`` has shape
            (batch, 1, 1, src_len) and is True at non-padding source
            positions. ``tgt_mask`` has shape (batch, 1, tgt_len, tgt_len)
            and is True where the key position is non-padding and not later
            than the query position.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        # Build the causal mask on the same device as the inputs rather than
        # on a module-level global, so the model works wherever it is placed.
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size)."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one teacher-forced training pass and return the mean batch loss.

    For every batch the decoder receives the target without its last token
    and is scored against the target without its first token. Gradients are
    clipped to a norm of 1.0 before each optimizer step.
    """
    model.train()
    running_loss = 0
    for batch_src, batch_tgt in tqdm(train_loader, desc="Training"):
        batch_src = batch_src.to(device)
        batch_tgt = batch_tgt.to(device)
        decoder_input = batch_tgt[:, :-1]
        labels = batch_tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(batch_src, decoder_input)

        # Flatten to (tokens, vocab) and (tokens,) for the criterion
        flat_logits = logits.contiguous().view(-1, logits.size(-1))
        flat_labels = labels.contiguous().view(-1)
        batch_loss = criterion(flat_logits, flat_labels)

        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Return the mean teacher-forced batch loss over ``val_loader``.

    The model is put in eval mode and no gradients are tracked.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch_src, batch_tgt in tqdm(val_loader, desc="Evaluating"):
            batch_src = batch_src.to(device)
            batch_tgt = batch_tgt.to(device)

            logits = model(batch_src, batch_tgt[:, :-1])
            flat_logits = logits.contiguous().view(-1, logits.size(-1))
            flat_labels = batch_tgt[:, 1:].contiguous().view(-1)

            batch_losses.append(criterion(flat_logits, flat_labels).item())
    return sum(batch_losses) / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return teacher-forced token accuracy over all non-padding target positions.

    The prediction at each position is the argmax of the model's logits when
    fed the target shifted right by one. Returns 0 when there is no
    non-padding target token at all.
    """
    model.eval()
    hits, counted = 0, 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            labels = tgt[:, 1:]
            guesses = model(src, tgt[:, :-1]).argmax(dim=-1)

            # Only positions holding a real (non-pad) label are scored
            scored = labels != vocab.pad_token
            hits += (guesses.eq(labels) & scored).sum().item()
            counted += scored.sum().item()
    if counted > 0:
        return hits / counted
    return 0

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs, patience=3):
    """Train with early stopping and return the best validation loss.

    After every epoch the validation loss is passed to ``scheduler.step``.
    Training stops early once the validation loss has failed to improve for
    ``patience`` consecutive epochs. Before returning, the model is reset to
    the weights of its best epoch, so the returned loss describes the model
    the caller ends up with (and may checkpoint).

    Args:
        model: Module to train; modified in place.
        train_loader: Batches used by ``train_epoch``.
        val_loader: Batches used by ``evaluate``.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss function.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.

    Returns:
        The lowest validation loss seen (``inf`` if no epoch improved on it).
    """
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot the weights: later epochs may make the model worse
            best_state = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered!")
                break

    # Roll back to the best epoch so the weights match the returned loss
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss

# Global variables to track best model across all trials.
# They are rebound by `objective` whenever a trial beats `best_overall_loss`.
best_overall_model = None            # state_dict copy of the best model so far
best_overall_loss = float('inf')     # lowest validation loss reported by any trial
best_config = None                   # hyperparameter dict of that best trial

# Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one sampled configuration, return its validation loss.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``vocab``,
    ``max_length`` and ``device``. When the trial beats the best loss seen so
    far, the globals ``best_overall_model``, ``best_overall_loss`` and
    ``best_config`` are updated and the model's state dict is written to disk.
    """
    global best_overall_model, best_overall_loss, best_config

    # Sample the hyperparameters (sampling order kept stable)
    config = {}
    config["d_model"] = trial.suggest_categorical("d_model", [128, 256, 512])
    config["num_heads"] = trial.suggest_categorical("num_heads", [2, 4, 8, 16])
    config["num_layers"] = trial.suggest_categorical("num_layers", [6, 8, 10, 12])
    config["d_ff"] = trial.suggest_categorical("d_ff", [256, 512, 1024])
    config["dropout"] = trial.suggest_float("dropout", 0.1, 0.4)
    config["learning_rate"] = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    config["batch_size"] = trial.suggest_categorical("batch_size", [32])

    # Data loaders for the sampled batch size
    per_batch = config["batch_size"]
    train_loader = data.DataLoader(train_dataset, batch_size=per_batch, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=per_batch)

    # Model built from the sampled architecture settings
    architecture = {key: config[key] for key in ("d_model", "num_heads", "num_layers", "d_ff", "dropout")}
    vocab_size = len(vocab)
    model = Transformer(
        src_vocab_size=vocab_size,
        tgt_vocab_size=vocab_size,
        max_seq_length=max_length,
        **architecture,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    # Train for up to 10 epochs; the result is this trial's score
    current_val_loss = train_model(
        model, train_loader, val_loader, optimizer, criterion, scheduler, device, epochs=10
    )

    # Nothing more to do unless this trial is the best so far
    if not current_val_loss < best_overall_loss:
        return current_val_loss

    best_overall_loss = current_val_loss
    best_overall_model = copy.deepcopy(model.state_dict())
    best_config = config
    torch.save(best_overall_model, '/content/drive/MyDrive/best_vig_key_5.pth')
    print(f"New best model found! Val Loss: {current_val_loss:.4f}")
    print(f"Config: {config}")

    return current_val_loss

# Decryption Function
def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for a single input text.

    The text is encoded the same way as the dataset items (SOS + ids + EOS,
    padded and cut to ``max_length``). Decoding starts from SOS and appends
    the highest-scoring next token until EOS is produced or the target holds
    ``max_length`` tokens. Returns ``vocab.decode`` of the generated ids.
    """
    model.eval()
    with torch.no_grad():
        # Fixed-length source sequence with a leading batch dimension of 1
        src_ids = [vocab.sos_token, *vocab.encode(str(text)), vocab.eos_token]
        src_ids.extend([vocab.pad_token] * (max_length - len(src_ids)))
        src = torch.tensor(src_ids[:max_length]).unsqueeze(0).to(device)

        generated = torch.tensor([[vocab.sos_token]]).to(device)

        steps_left = max_length - 1
        while steps_left > 0:
            steps_left -= 1
            logits = model(src, generated)
            # Greedy choice at the last generated position
            candidate = logits[:, -1].argmax(dim=-1).item()
            if candidate == vocab.eos_token:
                break
            appended = torch.tensor([[candidate]]).to(device)
            generated = torch.cat([generated, appended], dim=1)

        return vocab.decode(generated[0].cpu().numpy())

# Main Execution
# Main Execution
# Script entry point: load the data, run the Optuna search, reload the best
# checkpoint, report loss/accuracy and print a few sample decryptions.
# NOTE: `vocab`, `max_length`, `train_dataset` and `val_dataset` are created
# here as module-level globals because `objective` reads them by name.
if __name__ == "__main__":
    # Load and prepare data
    inputs, outputs = load_data('/content/Full_training_vigenere_5.xlsx')
    vocab = Vocabulary()
    max_length = 256  # Adjusted for longer sentences

    # Split data (80% train / 20% validation, fixed seed)
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Run hyperparameter optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)  # runs 100 trials; no timeout is set

    print("\nBest trial:")
    trial = study.best_trial
    print(f"  Validation Loss: {trial.value:.4f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Load the best model found during the search.
    # The architecture comes from `best_config` (set by `objective`), the
    # weights from the checkpoint file `objective` wrote.
    final_model = Transformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)
    final_model.load_state_dict(torch.load('/content/drive/MyDrive/best_vig_key_5.pth'))

    # Evaluate on full datasets (unshuffled loaders)
    full_train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=False)
    full_val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"], shuffle=False)

    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)

    train_loss = evaluate(final_model, full_train_loader, criterion, device)
    val_loss = evaluate(final_model, full_val_loader, criterion, device)

    train_acc = calculate_accuracy(final_model, full_train_loader, vocab, device)
    val_acc = calculate_accuracy(final_model, full_val_loader, vocab, device)

    print("\nFinal Evaluation:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_acc:.4f} | Val Accuracy: {val_acc:.4f}")

    # Test decryption: (prompt, expected plaintext) pairs
    # NOTE(review): these prompts say "Caesar cipher" while the training file
    # name says "vigenere" - confirm they match the training prompt format.
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher:xibu", "what")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(final_model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

Using device: cuda


[I 2025-05-09 14:45:42,123] A new study created in memory with name: no-name-b6d195b1-d9d4-4795-a645-f1ffb5993e76
Training: 100%|██████████| 250/250 [00:40<00:00,  6.22it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.08it/s]


Epoch 1/10:
Train Loss: 3.0130 | Val Loss: 2.9889


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.39it/s]


Epoch 2/10:
Train Loss: 2.9789 | Val Loss: 2.9818


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.23it/s]


Epoch 3/10:
Train Loss: 2.9774 | Val Loss: 2.9786


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.25it/s]


Epoch 4/10:
Train Loss: 2.9756 | Val Loss: 2.9865


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.27it/s]


Epoch 5/10:
Train Loss: 2.9729 | Val Loss: 3.2636


Training: 100%|██████████| 250/250 [00:40<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.24it/s]
[I 2025-05-09 14:50:04,469] Trial 0 finished with value: 2.9785834418402777 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2985336690576803, 'learning_rate': 0.0072355683655840825, 'batch_size': 32}. Best is trial 0 with value: 2.9785834418402777.


Epoch 6/10:
Train Loss: 2.9693 | Val Loss: 3.2806
Early stopping triggered!
New best model found! Val Loss: 2.9786
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2985336690576803, 'learning_rate': 0.0072355683655840825, 'batch_size': 32}


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.30it/s]


Epoch 1/10:
Train Loss: 3.0238 | Val Loss: 3.0116


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.30it/s]


Epoch 2/10:
Train Loss: 2.9810 | Val Loss: 2.9985


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.30it/s]


Epoch 3/10:
Train Loss: 2.9782 | Val Loss: 2.9820


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.28it/s]


Epoch 4/10:
Train Loss: 2.9769 | Val Loss: 2.9825


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.28it/s]


Epoch 5/10:
Train Loss: 2.9763 | Val Loss: 2.9744


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.29it/s]


Epoch 6/10:
Train Loss: 2.9750 | Val Loss: 2.9784


Training: 100%|██████████| 250/250 [01:55<00:00,  2.16it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.27it/s]


Epoch 7/10:
Train Loss: 2.9753 | Val Loss: 2.9739


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.26it/s]


Epoch 8/10:
Train Loss: 2.9745 | Val Loss: 2.9742


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.23it/s]


Epoch 9/10:
Train Loss: 2.9743 | Val Loss: 2.9750


Training: 100%|██████████| 250/250 [01:56<00:00,  2.15it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.27it/s]


Epoch 10/10:
Train Loss: 2.9737 | Val Loss: 2.9791
Early stopping triggered!


[I 2025-05-09 15:11:06,864] Trial 1 finished with value: 2.9739375795636858 and parameters: {'d_model': 512, 'num_heads': 2, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.3850891988674051, 'learning_rate': 0.006072345076087185, 'batch_size': 32}. Best is trial 1 with value: 2.9739375795636858.


New best model found! Val Loss: 2.9739
Config: {'d_model': 512, 'num_heads': 2, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.3850891988674051, 'learning_rate': 0.006072345076087185, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:24<00:00, 10.08it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.07it/s]


Epoch 1/10:
Train Loss: 3.0046 | Val Loss: 2.9808


Training: 100%|██████████| 250/250 [00:24<00:00, 10.09it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.88it/s]


Epoch 2/10:
Train Loss: 2.9678 | Val Loss: 3.4085


Training: 100%|██████████| 250/250 [00:24<00:00, 10.10it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.11it/s]


Epoch 3/10:
Train Loss: 2.9509 | Val Loss: 3.3615


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 28.01it/s]
[I 2025-05-09 15:12:55,306] Trial 2 finished with value: 2.9808408116537426 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.3642175080044162, 'learning_rate': 0.004624603403973622, 'batch_size': 32}. Best is trial 1 with value: 2.9739375795636858.


Epoch 4/10:
Train Loss: 2.9440 | Val Loss: 3.4545
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.62it/s]


Epoch 1/10:
Train Loss: 2.4229 | Val Loss: 2.1152


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 2/10:
Train Loss: 1.9846 | Val Loss: 1.9408


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 3/10:
Train Loss: 1.8236 | Val Loss: 1.7392


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 4/10:
Train Loss: 1.7221 | Val Loss: 1.7360


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.62it/s]


Epoch 5/10:
Train Loss: 1.6488 | Val Loss: 1.6344


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 6/10:
Train Loss: 1.5843 | Val Loss: 1.5724


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 7/10:
Train Loss: 1.5286 | Val Loss: 1.5147


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 8/10:
Train Loss: 1.4850 | Val Loss: 1.5506


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 9/10:
Train Loss: 1.4401 | Val Loss: 1.4695


Training: 100%|██████████| 250/250 [02:00<00:00,  2.07it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.61it/s]


Epoch 10/10:
Train Loss: 1.3977 | Val Loss: 1.4893


[I 2025-05-09 15:34:57,013] Trial 3 finished with value: 1.4695423235968939 and parameters: {'d_model': 512, 'num_heads': 4, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.2970574710772531, 'learning_rate': 0.00014509069197230404, 'batch_size': 32}. Best is trial 3 with value: 1.4695423235968939.


New best model found! Val Loss: 1.4695
Config: {'d_model': 512, 'num_heads': 4, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.2970574710772531, 'learning_rate': 0.00014509069197230404, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.27it/s]


Epoch 1/10:
Train Loss: 2.3801 | Val Loss: 2.0852


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.27it/s]


Epoch 2/10:
Train Loss: 2.0452 | Val Loss: 1.9293


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.28it/s]


Epoch 3/10:
Train Loss: 1.8803 | Val Loss: 1.7600


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.29it/s]


Epoch 4/10:
Train Loss: 1.7561 | Val Loss: 1.6743


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.27it/s]


Epoch 5/10:
Train Loss: 1.6574 | Val Loss: 1.5516


Training: 100%|██████████| 250/250 [00:58<00:00,  4.29it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.27it/s]


Epoch 6/10:
Train Loss: 1.5779 | Val Loss: 1.4833


Training: 100%|██████████| 250/250 [00:58<00:00,  4.29it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.28it/s]


Epoch 7/10:
Train Loss: 1.5104 | Val Loss: 1.4505


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.24it/s]


Epoch 8/10:
Train Loss: 1.4507 | Val Loss: 1.3675


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.29it/s]


Epoch 9/10:
Train Loss: 1.3975 | Val Loss: 1.3334


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.27it/s]
[I 2025-05-09 15:45:35,227] Trial 4 finished with value: 1.289927049288674 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.13932143009940012, 'learning_rate': 0.00010880601078545964, 'batch_size': 32}. Best is trial 4 with value: 1.289927049288674.


Epoch 10/10:
Train Loss: 1.3477 | Val Loss: 1.2899
New best model found! Val Loss: 1.2899
Config: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.13932143009940012, 'learning_rate': 0.00010880601078545964, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:35<00:00,  7.00it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.62it/s]


Epoch 1/10:
Train Loss: 2.6317 | Val Loss: 2.2209


Training: 100%|██████████| 250/250 [00:35<00:00,  6.97it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.67it/s]


Epoch 2/10:
Train Loss: 2.2064 | Val Loss: 2.0659


Training: 100%|██████████| 250/250 [00:35<00:00,  6.99it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.65it/s]


Epoch 3/10:
Train Loss: 2.0820 | Val Loss: 1.9684


Training: 100%|██████████| 250/250 [00:35<00:00,  7.01it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.67it/s]


Epoch 4/10:
Train Loss: 1.9872 | Val Loss: 1.8872


Training: 100%|██████████| 250/250 [00:35<00:00,  7.01it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.68it/s]


Epoch 5/10:
Train Loss: 1.9157 | Val Loss: 1.8167


Training: 100%|██████████| 250/250 [00:35<00:00,  7.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.71it/s]


Epoch 6/10:
Train Loss: 1.8543 | Val Loss: 1.7824


Training: 100%|██████████| 250/250 [00:35<00:00,  7.01it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.64it/s]


Epoch 7/10:
Train Loss: 1.8082 | Val Loss: 1.7196


Training: 100%|██████████| 250/250 [00:35<00:00,  7.01it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.66it/s]


Epoch 8/10:
Train Loss: 1.7662 | Val Loss: 1.6811


Training: 100%|██████████| 250/250 [00:35<00:00,  7.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.77it/s]


Epoch 9/10:
Train Loss: 1.7296 | Val Loss: 1.6620


Training: 100%|██████████| 250/250 [00:35<00:00,  6.98it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.67it/s]
[I 2025-05-09 15:52:06,241] Trial 5 finished with value: 1.6256149363896204 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 12, 'd_ff': 1024, 'dropout': 0.37187765097737, 'learning_rate': 0.0002853625435978964, 'batch_size': 32}. Best is trial 4 with value: 1.289927049288674.


Epoch 10/10:
Train Loss: 1.6970 | Val Loss: 1.6256


Training: 100%|██████████| 250/250 [00:43<00:00,  5.78it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.54it/s]


Epoch 1/10:
Train Loss: 2.4107 | Val Loss: 2.0537


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.68it/s]


Epoch 2/10:
Train Loss: 2.0211 | Val Loss: 1.8611


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.68it/s]


Epoch 3/10:
Train Loss: 1.8756 | Val Loss: 1.7493


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.65it/s]


Epoch 4/10:
Train Loss: 1.7723 | Val Loss: 1.7022


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.69it/s]


Epoch 5/10:
Train Loss: 1.6977 | Val Loss: 1.5884


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.69it/s]


Epoch 6/10:
Train Loss: 1.6414 | Val Loss: 1.5600


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.66it/s]


Epoch 7/10:
Train Loss: 1.5939 | Val Loss: 1.5654


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.67it/s]


Epoch 8/10:
Train Loss: 1.5539 | Val Loss: 1.5532


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.67it/s]


Epoch 9/10:
Train Loss: 1.5204 | Val Loss: 1.5195


Training: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 15.56it/s]
[I 2025-05-09 15:59:58,501] Trial 6 finished with value: 1.4938433435228136 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.3618900041026635, 'learning_rate': 0.0004713117025588348, 'batch_size': 32}. Best is trial 4 with value: 1.289927049288674.


Epoch 10/10:
Train Loss: 1.4867 | Val Loss: 1.4938


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 1/10:
Train Loss: 2.3388 | Val Loss: 2.0362


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.57it/s]


Epoch 2/10:
Train Loss: 1.9747 | Val Loss: 1.8427


Training: 100%|██████████| 250/250 [03:11<00:00,  1.30it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 3/10:
Train Loss: 1.7976 | Val Loss: 1.7513


Training: 100%|██████████| 250/250 [03:11<00:00,  1.30it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 4/10:
Train Loss: 1.6859 | Val Loss: 1.6524


Training: 100%|██████████| 250/250 [03:11<00:00,  1.30it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 5/10:
Train Loss: 1.5943 | Val Loss: 1.5396


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 6/10:
Train Loss: 1.5041 | Val Loss: 1.4699


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 7/10:
Train Loss: 1.4107 | Val Loss: 1.4189


Training: 100%|██████████| 250/250 [03:11<00:00,  1.30it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 8/10:
Train Loss: 1.3356 | Val Loss: 1.4109


Training: 100%|██████████| 250/250 [03:11<00:00,  1.30it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 9/10:
Train Loss: 1.2670 | Val Loss: 1.3478


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 10/10:
Train Loss: 1.2040 | Val Loss: 1.2552


[I 2025-05-09 16:34:52,040] Trial 7 finished with value: 1.2551617508842832 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.2641858235352921, 'learning_rate': 0.0001321414067707572, 'batch_size': 32}. Best is trial 7 with value: 1.2551617508842832.


New best model found! Val Loss: 1.2552
Config: {'d_model': 512, 'num_heads': 16, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.2641858235352921, 'learning_rate': 0.0001321414067707572, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:22<00:00, 11.20it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 31.39it/s]


Epoch 1/10:
Train Loss: 3.0015 | Val Loss: 2.9817


Training: 100%|██████████| 250/250 [00:22<00:00, 11.21it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 31.24it/s]


Epoch 2/10:
Train Loss: 2.9783 | Val Loss: 2.9898


Training: 100%|██████████| 250/250 [00:22<00:00, 11.26it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 31.45it/s]


Epoch 3/10:
Train Loss: 2.9733 | Val Loss: 3.0457


Training: 100%|██████████| 250/250 [00:22<00:00, 11.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 31.40it/s]
[I 2025-05-09 16:36:29,182] Trial 8 finished with value: 2.9816948194352406 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 10, 'd_ff': 256, 'dropout': 0.19285853149875684, 'learning_rate': 0.005974499692594361, 'batch_size': 32}. Best is trial 7 with value: 1.2551617508842832.


Epoch 4/10:
Train Loss: 2.9659 | Val Loss: 3.1724
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:24<00:00, 10.12it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.81it/s]


Epoch 1/10:
Train Loss: 3.0044 | Val Loss: 2.9775


Training: 100%|██████████| 250/250 [00:24<00:00, 10.10it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.79it/s]


Epoch 2/10:
Train Loss: 2.9620 | Val Loss: 3.2872


Training: 100%|██████████| 250/250 [00:24<00:00, 10.05it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.69it/s]


Epoch 3/10:
Train Loss: 2.9463 | Val Loss: 3.3312


Training: 100%|██████████| 250/250 [00:24<00:00, 10.04it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.92it/s]
[I 2025-05-09 16:38:17,599] Trial 9 finished with value: 2.9775026223016163 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.20439629656080416, 'learning_rate': 0.004156626997212533, 'batch_size': 32}. Best is trial 7 with value: 1.2551617508842832.


Epoch 4/10:
Train Loss: 2.9393 | Val Loss: 3.4120
Early stopping triggered!


Training: 100%|██████████| 250/250 [01:55<00:00,  2.17it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.94it/s]


Epoch 1/10:
Train Loss: 3.0258 | Val Loss: 2.9773


Training: 100%|██████████| 250/250 [01:55<00:00,  2.17it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.94it/s]


Epoch 2/10:
Train Loss: 2.9818 | Val Loss: 2.9774


Training: 100%|██████████| 250/250 [01:55<00:00,  2.17it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.93it/s]


Epoch 3/10:
Train Loss: 2.9762 | Val Loss: 3.2428


Training: 100%|██████████| 250/250 [01:55<00:00,  2.17it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  5.93it/s]
[I 2025-05-09 16:46:42,064] Trial 10 finished with value: 2.9773302608066134 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.25806366885186627, 'learning_rate': 0.001568408024726513, 'batch_size': 32}. Best is trial 7 with value: 1.2551617508842832.


Epoch 4/10:
Train Loss: 2.9702 | Val Loss: 3.8578
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.30it/s]


Epoch 1/10:
Train Loss: 2.4014 | Val Loss: 2.0888


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.28it/s]


Epoch 2/10:
Train Loss: 2.0447 | Val Loss: 1.9110


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.27it/s]


Epoch 3/10:
Train Loss: 1.8826 | Val Loss: 1.7618


Training: 100%|██████████| 250/250 [00:58<00:00,  4.29it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.29it/s]


Epoch 4/10:
Train Loss: 1.7525 | Val Loss: 1.6427


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.29it/s]


Epoch 5/10:
Train Loss: 1.6501 | Val Loss: 1.5634


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.27it/s]


Epoch 6/10:
Train Loss: 1.5710 | Val Loss: 1.4774


Training: 100%|██████████| 250/250 [00:58<00:00,  4.29it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.29it/s]


Epoch 7/10:
Train Loss: 1.5025 | Val Loss: 1.4257


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.28it/s]


Epoch 8/10:
Train Loss: 1.4452 | Val Loss: 1.3504


Training: 100%|██████████| 250/250 [00:58<00:00,  4.29it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.28it/s]


Epoch 9/10:
Train Loss: 1.3880 | Val Loss: 1.3204


Training: 100%|██████████| 250/250 [00:58<00:00,  4.30it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.29it/s]
[I 2025-05-09 16:57:20,020] Trial 11 finished with value: 1.278949009047614 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.12691118264691742, 'learning_rate': 0.00010373599734664252, 'batch_size': 32}. Best is trial 7 with value: 1.2551617508842832.


Epoch 10/10:
Train Loss: 1.3412 | Val Loss: 1.2789


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.40it/s]


Epoch 1/10:
Train Loss: 2.2171 | Val Loss: 1.8725


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.39it/s]


Epoch 2/10:
Train Loss: 1.7537 | Val Loss: 1.6283


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.41it/s]


Epoch 3/10:
Train Loss: 1.5671 | Val Loss: 1.4957


Training: 100%|██████████| 250/250 [02:04<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.39it/s]


Epoch 4/10:
Train Loss: 1.4028 | Val Loss: 1.3591


Training: 100%|██████████| 250/250 [02:04<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.39it/s]


Epoch 5/10:
Train Loss: 1.2638 | Val Loss: 1.2685


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.40it/s]


Epoch 6/10:
Train Loss: 1.1359 | Val Loss: 1.2067


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.39it/s]


Epoch 7/10:
Train Loss: 1.0303 | Val Loss: 1.1103


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.40it/s]


Epoch 8/10:
Train Loss: 0.9295 | Val Loss: 1.1128


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.40it/s]


Epoch 9/10:
Train Loss: 0.8350 | Val Loss: 1.0476


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.40it/s]


Epoch 10/10:
Train Loss: 0.7479 | Val Loss: 1.0180


[I 2025-05-09 17:19:57,948] Trial 12 finished with value: 1.0180437791915167 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.10030801105525178, 'learning_rate': 0.00026007534143182576, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


New best model found! Val Loss: 1.0180
Config: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.10030801105525178, 'learning_rate': 0.00026007534143182576, 'batch_size': 32}


Training: 100%|██████████| 250/250 [03:24<00:00,  1.22it/s]
Evaluating: 100%|██████████| 63/63 [00:18<00:00,  3.32it/s]


Epoch 1/10:
Train Loss: 3.0175 | Val Loss: 2.9812


Training: 100%|██████████| 250/250 [03:24<00:00,  1.22it/s]
Evaluating: 100%|██████████| 63/63 [00:18<00:00,  3.33it/s]


Epoch 2/10:
Train Loss: 2.9826 | Val Loss: 3.1185


Training: 100%|██████████| 250/250 [03:24<00:00,  1.22it/s]
Evaluating: 100%|██████████| 63/63 [00:18<00:00,  3.32it/s]


Epoch 3/10:
Train Loss: 2.9438 | Val Loss: 4.0674


Training: 100%|██████████| 250/250 [03:24<00:00,  1.22it/s]
Evaluating: 100%|██████████| 63/63 [00:18<00:00,  3.33it/s]
[I 2025-05-09 17:34:54,006] Trial 13 finished with value: 2.9812444353860523 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.19649105213088616, 'learning_rate': 0.0003473577300140061, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 4/10:
Train Loss: 2.9074 | Val Loss: 4.1467
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.50it/s]


Epoch 1/10:
Train Loss: 3.0271 | Val Loss: 2.9832


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.49it/s]


Epoch 2/10:
Train Loss: 2.9819 | Val Loss: 2.9784


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.50it/s]


Epoch 3/10:
Train Loss: 2.9567 | Val Loss: 3.5073


Training: 100%|██████████| 250/250 [02:04<00:00,  2.01it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.50it/s]


Epoch 4/10:
Train Loss: 2.9198 | Val Loss: 3.7055


Training: 100%|██████████| 250/250 [02:04<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.49it/s]
[I 2025-05-09 17:46:12,732] Trial 14 finished with value: 2.978371446094816 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.2523050416232123, 'learning_rate': 0.0009412132111082535, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 5/10:
Train Loss: 2.9051 | Val Loss: 3.7464
Early stopping triggered!


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.57it/s]


Epoch 1/10:
Train Loss: 2.3956 | Val Loss: 1.9659


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.57it/s]


Epoch 2/10:
Train Loss: 1.8537 | Val Loss: 1.7336


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.57it/s]


Epoch 3/10:
Train Loss: 1.6645 | Val Loss: 1.5953


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.57it/s]


Epoch 4/10:
Train Loss: 1.5369 | Val Loss: 1.4937


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 5/10:
Train Loss: 1.4435 | Val Loss: 1.4790


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 6/10:
Train Loss: 1.3693 | Val Loss: 1.3930


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 7/10:
Train Loss: 1.3000 | Val Loss: 1.3653


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.57it/s]


Epoch 8/10:
Train Loss: 1.2129 | Val Loss: 1.3311


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.56it/s]


Epoch 9/10:
Train Loss: 1.1294 | Val Loss: 1.2600


Training: 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]
Evaluating: 100%|██████████| 63/63 [00:17<00:00,  3.57it/s]
[I 2025-05-09 18:21:04,515] Trial 15 finished with value: 1.2412243220541213 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.15668940316955382, 'learning_rate': 0.0001966709385024031, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 10/10:
Train Loss: 1.0521 | Val Loss: 1.2412


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.46it/s]


Epoch 1/10:
Train Loss: 3.0171 | Val Loss: 2.9773


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.48it/s]


Epoch 2/10:
Train Loss: 2.9840 | Val Loss: 2.9782


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.48it/s]


Epoch 3/10:
Train Loss: 2.9666 | Val Loss: 3.0960


Training: 100%|██████████| 250/250 [02:03<00:00,  2.02it/s]
Evaluating: 100%|██████████| 63/63 [00:11<00:00,  5.49it/s]
[I 2025-05-09 18:30:05,954] Trial 16 finished with value: 2.9773171553536066 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.1014211844840098, 'learning_rate': 0.0007072302789451045, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 4/10:
Train Loss: 2.9300 | Val Loss: 3.4922
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s]


Epoch 1/10:
Train Loss: 2.2808 | Val Loss: 1.9054


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s]


Epoch 2/10:
Train Loss: 1.8050 | Val Loss: 1.6733


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s]


Epoch 3/10:
Train Loss: 1.6265 | Val Loss: 1.5780


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.44it/s]


Epoch 4/10:
Train Loss: 1.5091 | Val Loss: 1.4598


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s]


Epoch 5/10:
Train Loss: 1.3856 | Val Loss: 1.3749


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s]


Epoch 6/10:
Train Loss: 1.2792 | Val Loss: 1.3130


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.44it/s]


Epoch 7/10:
Train Loss: 1.1828 | Val Loss: 1.2744


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s]


Epoch 8/10:
Train Loss: 1.0976 | Val Loss: 1.1935


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.44it/s]


Epoch 9/10:
Train Loss: 1.0140 | Val Loss: 1.1494


Training: 100%|██████████| 250/250 [02:33<00:00,  1.63it/s]
Evaluating: 100%|██████████| 63/63 [00:14<00:00,  4.44it/s]
[I 2025-05-09 18:58:02,957] Trial 17 finished with value: 1.1494163937038846 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 512, 'dropout': 0.15655948034968054, 'learning_rate': 0.0002333877894066981, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 10/10:
Train Loss: 0.9401 | Val Loss: 1.1641


Training: 100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
Evaluating: 100%|██████████| 63/63 [00:15<00:00,  4.16it/s]


Epoch 1/10:
Train Loss: 3.0278 | Val Loss: 2.9796


Training: 100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
Evaluating: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]


Epoch 2/10:
Train Loss: 2.9815 | Val Loss: 2.9776


Training: 100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
Evaluating: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]


Epoch 3/10:
Train Loss: 2.9781 | Val Loss: 2.9762


Training: 100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
Evaluating: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]


Epoch 4/10:
Train Loss: 2.9765 | Val Loss: 2.9749


Training: 100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
Evaluating: 100%|██████████| 63/63 [00:15<00:00,  4.18it/s]


Epoch 5/10:
Train Loss: 2.9756 | Val Loss: 2.9772


Training: 100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
Evaluating: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]


Epoch 6/10:
Train Loss: 2.9749 | Val Loss: 2.9793


Training: 100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
Evaluating: 100%|██████████| 63/63 [00:15<00:00,  4.17it/s]
[I 2025-05-09 19:19:03,423] Trial 18 finished with value: 2.9749180551559204 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 1024, 'dropout': 0.16776819139852284, 'learning_rate': 0.001812419492490132, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 7/10:
Train Loss: 2.9745 | Val Loss: 2.9763
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 1/10:
Train Loss: 2.2645 | Val Loss: 1.9007


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 2/10:
Train Loss: 1.7801 | Val Loss: 1.6535


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 3/10:
Train Loss: 1.5770 | Val Loss: 1.4868


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 4/10:
Train Loss: 1.3929 | Val Loss: 1.3661


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 5/10:
Train Loss: 1.2503 | Val Loss: 1.2583


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]


Epoch 6/10:
Train Loss: 1.1235 | Val Loss: 1.1899


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 7/10:
Train Loss: 1.0119 | Val Loss: 1.1338


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 8/10:
Train Loss: 0.9066 | Val Loss: 1.0629


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 9/10:
Train Loss: 0.8070 | Val Loss: 1.0580


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]
[I 2025-05-09 19:46:02,441] Trial 19 finished with value: 1.039818685206156 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.10423140827479238, 'learning_rate': 0.00024108636727324128, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 10/10:
Train Loss: 0.7132 | Val Loss: 1.0398


Training: 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]
Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.58it/s]


Epoch 1/10:
Train Loss: 3.0145 | Val Loss: 2.9841


Training: 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]
Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.56it/s]


Epoch 2/10:
Train Loss: 2.9847 | Val Loss: 2.9782


Training: 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]
Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.58it/s]


Epoch 3/10:
Train Loss: 2.9749 | Val Loss: 3.1181


Training: 100%|██████████| 250/250 [01:44<00:00,  2.39it/s]
Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.62it/s]


Epoch 4/10:
Train Loss: 2.9377 | Val Loss: 3.5107


Training: 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]
Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.62it/s]
[I 2025-05-09 19:55:34,814] Trial 20 finished with value: 2.9781672311207603 and parameters: {'d_model': 512, 'num_heads': 8, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.10601650281653552, 'learning_rate': 0.0004993405697877915, 'batch_size': 32}. Best is trial 12 with value: 1.0180437791915167.


Epoch 5/10:
Train Loss: 2.9185 | Val Loss: 3.7388
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 1/10:
Train Loss: 2.2699 | Val Loss: 1.9037


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 2/10:
Train Loss: 1.8010 | Val Loss: 1.6303


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 3/10:
Train Loss: 1.5605 | Val Loss: 1.4471


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 4/10:
Train Loss: 1.3830 | Val Loss: 1.3368


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 5/10:
Train Loss: 1.2362 | Val Loss: 1.2316


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]


Epoch 6/10:
Train Loss: 1.1087 | Val Loss: 1.1418


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 7/10:
Train Loss: 1.0019 | Val Loss: 1.1156


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 8/10:
Train Loss: 0.8978 | Val Loss: 1.0627


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 9/10:
Train Loss: 0.8032 | Val Loss: 1.0466


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 10/10:
Train Loss: 0.7107 | Val Loss: 0.9926


[I 2025-05-09 20:22:34,038] Trial 21 finished with value: 0.9926467443269397 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.1293484716101056, 'learning_rate': 0.00022126741867569192, 'batch_size': 32}. Best is trial 21 with value: 0.9926467443269397.


New best model found! Val Loss: 0.9926
Config: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.1293484716101056, 'learning_rate': 0.00022126741867569192, 'batch_size': 32}


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.57it/s]


Epoch 1/10:
Train Loss: 3.0128 | Val Loss: 2.9447


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.57it/s]


Epoch 2/10:
Train Loss: 2.3718 | Val Loss: 2.1587


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.57it/s]


Epoch 3/10:
Train Loss: 2.0746 | Val Loss: 1.9790


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.58it/s]


Epoch 4/10:
Train Loss: 1.9256 | Val Loss: 1.9538


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.58it/s]


Epoch 5/10:
Train Loss: 1.8188 | Val Loss: 2.1332


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.58it/s]


Epoch 6/10:
Train Loss: 1.7326 | Val Loss: 2.5632


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.56it/s]
[I 2025-05-09 20:41:26,197] Trial 22 finished with value: 1.9538008863963778 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.1216078207826802, 'learning_rate': 0.0003984913321349525, 'batch_size': 32}. Best is trial 21 with value: 0.9926467443269397.


Epoch 7/10:
Train Loss: 1.6632 | Val Loss: 2.7562
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 1/10:
Train Loss: 2.2584 | Val Loss: 1.9294


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]


Epoch 2/10:
Train Loss: 1.7734 | Val Loss: 1.6103


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 3/10:
Train Loss: 1.5103 | Val Loss: 1.4038


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 4/10:
Train Loss: 1.3194 | Val Loss: 1.2793


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 5/10:
Train Loss: 1.1626 | Val Loss: 1.1564


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]


Epoch 6/10:
Train Loss: 1.0113 | Val Loss: 1.1001


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 7/10:
Train Loss: 0.8812 | Val Loss: 1.0034


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 8/10:
Train Loss: 0.7632 | Val Loss: 0.9571


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 9/10:
Train Loss: 0.6497 | Val Loss: 0.9087


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 10/10:
Train Loss: 0.5489 | Val Loss: 0.9114


[I 2025-05-09 21:08:25,184] Trial 23 finished with value: 0.908673295899043 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.10460709138060675, 'learning_rate': 0.00020651580640249428, 'batch_size': 32}. Best is trial 23 with value: 0.908673295899043.


New best model found! Val Loss: 0.9087
Config: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.10460709138060675, 'learning_rate': 0.00020651580640249428, 'batch_size': 32}


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 1/10:
Train Loss: 2.2991 | Val Loss: 1.9755


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 2/10:
Train Loss: 1.8776 | Val Loss: 1.7292


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 3/10:
Train Loss: 1.6634 | Val Loss: 1.5706


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 4/10:
Train Loss: 1.5154 | Val Loss: 1.4474


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 5/10:
Train Loss: 1.3994 | Val Loss: 1.3483


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 6/10:
Train Loss: 1.2958 | Val Loss: 1.2417


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 7/10:
Train Loss: 1.1964 | Val Loss: 1.2013


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 8/10:
Train Loss: 1.1083 | Val Loss: 1.1309


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 9/10:
Train Loss: 1.0293 | Val Loss: 1.0965


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]
[I 2025-05-09 21:35:24,464] Trial 24 finished with value: 1.0416769129889352 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.22361764867393255, 'learning_rate': 0.00015430430171156442, 'batch_size': 32}. Best is trial 23 with value: 0.908673295899043.


Epoch 10/10:
Train Loss: 0.9564 | Val Loss: 1.0417


Training: 100%|██████████| 250/250 [01:51<00:00,  2.25it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.05it/s]


Epoch 1/10:
Train Loss: 3.0177 | Val Loss: 2.9834


Training: 100%|██████████| 250/250 [01:51<00:00,  2.25it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.07it/s]


Epoch 2/10:
Train Loss: 2.9663 | Val Loss: 2.9777


Training: 100%|██████████| 250/250 [01:51<00:00,  2.25it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.06it/s]


Epoch 3/10:
Train Loss: 2.9635 | Val Loss: 3.5650


Training: 100%|██████████| 250/250 [01:51<00:00,  2.25it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.04it/s]


Epoch 4/10:
Train Loss: 2.9211 | Val Loss: 3.8254


Training: 100%|██████████| 250/250 [01:51<00:00,  2.25it/s]
Evaluating: 100%|██████████| 63/63 [00:10<00:00,  6.07it/s]
[I 2025-05-09 21:45:32,479] Trial 25 finished with value: 2.9777461309281605 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1421542788456474, 'learning_rate': 0.0006690409794861647, 'batch_size': 32}. Best is trial 23 with value: 0.908673295899043.


Epoch 5/10:
Train Loss: 2.9050 | Val Loss: 3.7293
Early stopping triggered!


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 1/10:
Train Loss: 2.3043 | Val Loss: 1.9307


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 2/10:
Train Loss: 1.8440 | Val Loss: 1.7332


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 3/10:
Train Loss: 1.6555 | Val Loss: 1.5472


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.53it/s]


Epoch 4/10:
Train Loss: 1.4904 | Val Loss: 1.4262


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 5/10:
Train Loss: 1.3595 | Val Loss: 1.3479


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 6/10:
Train Loss: 1.2519 | Val Loss: 1.2774


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 7/10:
Train Loss: 1.1535 | Val Loss: 1.2467


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 8/10:
Train Loss: 1.0648 | Val Loss: 1.1697


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 9/10:
Train Loss: 0.9855 | Val Loss: 1.1255


Training: 100%|██████████| 250/250 [02:28<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s]
[I 2025-05-09 22:12:31,334] Trial 26 finished with value: 1.125533419942099 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.17132793982131544, 'learning_rate': 0.0002238285642137414, 'batch_size': 32}. Best is trial 23 with value: 0.908673295899043.


Epoch 10/10:
Train Loss: 0.9098 | Val Loss: 1.1351


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.59it/s]


Epoch 1/10:
Train Loss: 3.0279 | Val Loss: 2.9848


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.59it/s]


Epoch 2/10:
Train Loss: 2.9828 | Val Loss: 2.9782


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.60it/s]


Epoch 3/10:
Train Loss: 2.9784 | Val Loss: 2.9760


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.60it/s]


Epoch 4/10:
Train Loss: 2.9745 | Val Loss: 3.1323


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.60it/s]


Epoch 5/10:
Train Loss: 2.9677 | Val Loss: 3.5115


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.59it/s]


Epoch 6/10:
Train Loss: 2.9772 | Val Loss: 2.9746


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.59it/s]


Epoch 7/10:
Train Loss: 2.9722 | Val Loss: 3.0878


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.59it/s]


Epoch 8/10:
Train Loss: 2.9670 | Val Loss: 3.2436


Training: 100%|██████████| 250/250 [02:27<00:00,  1.69it/s]
Evaluating: 100%|██████████| 63/63 [00:13<00:00,  4.59it/s]
[I 2025-05-09 22:36:45,495] Trial 27 finished with value: 2.9745784373510453 and parameters: {'d_model': 512, 'num_heads': 16, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.12538008889010294, 'learning_rate': 0.0016598895126507497, 'batch_size': 32}. Best is trial 23 with value: 0.908673295899043.


Epoch 9/10:
Train Loss: 2.9659 | Val Loss: 3.2411
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:30<00:00,  8.32it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.34it/s]


Epoch 1/10:
Train Loss: 2.2758 | Val Loss: 1.9397


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.41it/s]


Epoch 2/10:
Train Loss: 1.8662 | Val Loss: 1.7145


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.43it/s]


Epoch 3/10:
Train Loss: 1.6691 | Val Loss: 1.5263


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.38it/s]


Epoch 4/10:
Train Loss: 1.5195 | Val Loss: 1.3855


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.34it/s]


Epoch 5/10:
Train Loss: 1.3940 | Val Loss: 1.2599


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.33it/s]


Epoch 6/10:
Train Loss: 1.2829 | Val Loss: 1.1638


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.41it/s]


Epoch 7/10:
Train Loss: 1.1846 | Val Loss: 1.0822


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.36it/s]


Epoch 8/10:
Train Loss: 1.0941 | Val Loss: 0.9834


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.39it/s]


Epoch 9/10:
Train Loss: 0.9988 | Val Loss: 0.8905


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.30it/s]
[I 2025-05-09 22:42:14,015] Trial 28 finished with value: 0.8117749350411552 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.17914876146905312, 'learning_rate': 0.0003265002156121785, 'batch_size': 32}. Best is trial 28 with value: 0.8117749350411552.


Epoch 10/10:
Train Loss: 0.9094 | Val Loss: 0.8118
New best model found! Val Loss: 0.8118
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.17914876146905312, 'learning_rate': 0.0003265002156121785, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.99it/s]


Epoch 1/10:
Train Loss: 2.3133 | Val Loss: 2.0049


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.95it/s]


Epoch 2/10:
Train Loss: 1.9383 | Val Loss: 1.7522


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.95it/s]


Epoch 3/10:
Train Loss: 1.7391 | Val Loss: 1.6388


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.93it/s]


Epoch 4/10:
Train Loss: 1.6067 | Val Loss: 1.4822


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.94it/s]


Epoch 5/10:
Train Loss: 1.4979 | Val Loss: 1.3830


Training: 100%|██████████| 250/250 [00:39<00:00,  6.26it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.93it/s]


Epoch 6/10:
Train Loss: 1.4092 | Val Loss: 1.3107


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.92it/s]


Epoch 7/10:
Train Loss: 1.3320 | Val Loss: 1.2560


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.94it/s]


Epoch 8/10:
Train Loss: 1.2632 | Val Loss: 1.2013


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.96it/s]


Epoch 9/10:
Train Loss: 1.1961 | Val Loss: 1.1308


Training: 100%|██████████| 250/250 [00:39<00:00,  6.25it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.94it/s]
[I 2025-05-09 22:49:31,110] Trial 29 finished with value: 1.1030304573831105 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 8, 'd_ff': 256, 'dropout': 0.2207256154444274, 'learning_rate': 0.0003461776145409339, 'batch_size': 32}. Best is trial 28 with value: 0.8117749350411552.


Epoch 10/10:
Train Loss: 1.1315 | Val Loss: 1.1030


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.58it/s]


Epoch 1/10:
Train Loss: 2.3811 | Val Loss: 2.0806


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.60it/s]


Epoch 2/10:
Train Loss: 2.0228 | Val Loss: 1.9064


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.53it/s]


Epoch 3/10:
Train Loss: 1.8693 | Val Loss: 1.7764


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.60it/s]


Epoch 4/10:
Train Loss: 1.7561 | Val Loss: 1.6695


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.58it/s]


Epoch 5/10:
Train Loss: 1.6685 | Val Loss: 1.6188


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.59it/s]


Epoch 6/10:
Train Loss: 1.5844 | Val Loss: 1.5050


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.58it/s]


Epoch 7/10:
Train Loss: 1.5034 | Val Loss: 1.4402


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.58it/s]


Epoch 8/10:
Train Loss: 1.4326 | Val Loss: 1.3915


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.58it/s]


Epoch 9/10:
Train Loss: 1.3701 | Val Loss: 1.3203


Training: 100%|██████████| 250/250 [00:59<00:00,  4.19it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.57it/s]
[I 2025-05-09 23:00:22,342] Trial 30 finished with value: 1.2946009560236855 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.18149345335247732, 'learning_rate': 0.00017088592718411905, 'batch_size': 32}. Best is trial 28 with value: 0.8117749350411552.


Epoch 10/10:
Train Loss: 1.3134 | Val Loss: 1.2946


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.55it/s]


Epoch 1/10:
Train Loss: 2.2800 | Val Loss: 1.9524


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]


Epoch 2/10:
Train Loss: 1.8621 | Val Loss: 1.7619


Training: 100%|██████████| 250/250 [00:30<00:00,  8.32it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.51it/s]


Epoch 3/10:
Train Loss: 1.6522 | Val Loss: 1.4880


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.44it/s]


Epoch 4/10:
Train Loss: 1.4860 | Val Loss: 1.3586


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.56it/s]


Epoch 5/10:
Train Loss: 1.3581 | Val Loss: 1.2366


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]


Epoch 6/10:
Train Loss: 1.2443 | Val Loss: 1.1353


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.40it/s]


Epoch 7/10:
Train Loss: 1.1404 | Val Loss: 1.0307


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.51it/s]


Epoch 8/10:
Train Loss: 1.0359 | Val Loss: 0.9460


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.57it/s]


Epoch 9/10:
Train Loss: 0.9373 | Val Loss: 0.8343


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]
[I 2025-05-09 23:05:50,607] Trial 31 finished with value: 0.7597509527963305 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14277650757391272, 'learning_rate': 0.0002826759570443655, 'batch_size': 32}. Best is trial 31 with value: 0.7597509527963305.


Epoch 10/10:
Train Loss: 0.8442 | Val Loss: 0.7598
New best model found! Val Loss: 0.7598
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14277650757391272, 'learning_rate': 0.0002826759570443655, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.57it/s]


Epoch 1/10:
Train Loss: 2.2386 | Val Loss: 1.9074


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]


Epoch 2/10:
Train Loss: 1.8132 | Val Loss: 1.6305


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 3/10:
Train Loss: 1.5963 | Val Loss: 1.4651


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.63it/s]


Epoch 4/10:
Train Loss: 1.4450 | Val Loss: 1.3151


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.46it/s]


Epoch 5/10:
Train Loss: 1.3149 | Val Loss: 1.2111


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]


Epoch 6/10:
Train Loss: 1.2010 | Val Loss: 1.0982


Training: 100%|██████████| 250/250 [00:29<00:00,  8.35it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.62it/s]


Epoch 7/10:
Train Loss: 1.0871 | Val Loss: 0.9753


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.45it/s]


Epoch 8/10:
Train Loss: 0.9761 | Val Loss: 0.8750


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.52it/s]


Epoch 9/10:
Train Loss: 0.8578 | Val Loss: 0.7648


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]
[I 2025-05-09 23:11:18,721] Trial 32 finished with value: 0.6638958264910986 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14995549243047612, 'learning_rate': 0.0005434509602167445, 'batch_size': 32}. Best is trial 32 with value: 0.6638958264910986.


Epoch 10/10:
Train Loss: 0.7545 | Val Loss: 0.6639
New best model found! Val Loss: 0.6639
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14995549243047612, 'learning_rate': 0.0005434509602167445, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.65it/s]


Epoch 1/10:
Train Loss: 2.2239 | Val Loss: 1.9326


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 2/10:
Train Loss: 1.8393 | Val Loss: 1.6688


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 3/10:
Train Loss: 1.6390 | Val Loss: 1.5046


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.64it/s]


Epoch 4/10:
Train Loss: 1.5005 | Val Loss: 1.4177


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.49it/s]


Epoch 5/10:
Train Loss: 1.3817 | Val Loss: 1.2915


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]


Epoch 6/10:
Train Loss: 1.2797 | Val Loss: 1.1818


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]


Epoch 7/10:
Train Loss: 1.1855 | Val Loss: 1.1077


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]


Epoch 8/10:
Train Loss: 1.0909 | Val Loss: 1.0194


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]


Epoch 9/10:
Train Loss: 0.9897 | Val Loss: 0.9002


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]
[I 2025-05-09 23:16:46,658] Trial 33 finished with value: 0.8012307172729856 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14968624068604922, 'learning_rate': 0.0006445024347503818, 'batch_size': 32}. Best is trial 32 with value: 0.6638958264910986.


Epoch 10/10:
Train Loss: 0.8824 | Val Loss: 0.8012


Training: 100%|██████████| 250/250 [00:29<00:00,  8.35it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]


Epoch 1/10:
Train Loss: 2.2557 | Val Loss: 1.9117


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 2/10:
Train Loss: 1.8393 | Val Loss: 1.6739


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.64it/s]


Epoch 3/10:
Train Loss: 1.6338 | Val Loss: 1.5200


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 4/10:
Train Loss: 1.4945 | Val Loss: 1.4064


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.38it/s]


Epoch 5/10:
Train Loss: 1.3769 | Val Loss: 1.2843


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]


Epoch 6/10:
Train Loss: 1.2765 | Val Loss: 1.2047


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 7/10:
Train Loss: 1.1786 | Val Loss: 1.0886


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]


Epoch 8/10:
Train Loss: 1.0813 | Val Loss: 0.9896


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]


Epoch 9/10:
Train Loss: 0.9762 | Val Loss: 0.8913


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]
[I 2025-05-09 23:22:14,544] Trial 34 finished with value: 0.7726409709642804 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1544035924012794, 'learning_rate': 0.000614978740618603, 'batch_size': 32}. Best is trial 32 with value: 0.6638958264910986.


Epoch 10/10:
Train Loss: 0.8667 | Val Loss: 0.7726


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.65it/s]


Epoch 1/10:
Train Loss: 3.0089 | Val Loss: 2.9794


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.92it/s]


Epoch 2/10:
Train Loss: 2.9818 | Val Loss: 2.9737


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.88it/s]


Epoch 3/10:
Train Loss: 2.9599 | Val Loss: 3.6003


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.87it/s]


Epoch 4/10:
Train Loss: 2.9304 | Val Loss: 3.7087


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.90it/s]
[I 2025-05-09 23:24:58,376] Trial 35 finished with value: 2.97366924512954 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.14970152298154804, 'learning_rate': 0.001081507340145943, 'batch_size': 32}. Best is trial 32 with value: 0.6638958264910986.


Epoch 5/10:
Train Loss: 2.9184 | Val Loss: 3.8294
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.62it/s]


Epoch 1/10:
Train Loss: 2.2646 | Val Loss: 1.9410


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.64it/s]


Epoch 2/10:
Train Loss: 1.8869 | Val Loss: 1.7160


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.62it/s]


Epoch 3/10:
Train Loss: 1.6992 | Val Loss: 1.5662


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.55it/s]


Epoch 4/10:
Train Loss: 1.5614 | Val Loss: 1.4166


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]


Epoch 5/10:
Train Loss: 1.4408 | Val Loss: 1.3061


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]


Epoch 6/10:
Train Loss: 1.3337 | Val Loss: 1.1967


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 7/10:
Train Loss: 1.2373 | Val Loss: 1.1102


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.54it/s]


Epoch 8/10:
Train Loss: 1.1460 | Val Loss: 1.0228


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.57it/s]


Epoch 9/10:
Train Loss: 1.0488 | Val Loss: 0.9092


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.60it/s]
[I 2025-05-09 23:30:26,227] Trial 36 finished with value: 0.8102776663643974 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.22408145974183932, 'learning_rate': 0.0006236203568246677, 'batch_size': 32}. Best is trial 32 with value: 0.6638958264910986.


Epoch 10/10:
Train Loss: 0.9533 | Val Loss: 0.8103


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.63it/s]


Epoch 1/10:
Train Loss: 2.5559 | Val Loss: 2.0935


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.77it/s]


Epoch 2/10:
Train Loss: 2.0625 | Val Loss: 1.9170


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.75it/s]


Epoch 3/10:
Train Loss: 1.9323 | Val Loss: 1.8158


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.68it/s]


Epoch 4/10:
Train Loss: 1.8538 | Val Loss: 1.7561


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.62it/s]


Epoch 5/10:
Train Loss: 1.7911 | Val Loss: 1.6974


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.74it/s]


Epoch 6/10:
Train Loss: 1.7422 | Val Loss: 1.6670


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.74it/s]


Epoch 7/10:
Train Loss: 1.7056 | Val Loss: 1.6468


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.70it/s]


Epoch 8/10:
Train Loss: 1.6732 | Val Loss: 1.6303


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.76it/s]


Epoch 9/10:
Train Loss: 1.6399 | Val Loss: 1.6182


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.75it/s]
[I 2025-05-09 23:35:53,981] Trial 37 finished with value: 1.5778730312983196 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.30616079742668245, 'learning_rate': 0.0009459029518912723, 'batch_size': 32}. Best is trial 32 with value: 0.6638958264910986.


Epoch 10/10:
Train Loss: 1.6155 | Val Loss: 1.5779


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.56it/s]


Epoch 1/10:
Train Loss: 2.2302 | Val Loss: 1.8759


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 2/10:
Train Loss: 1.7871 | Val Loss: 1.5996


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]


Epoch 3/10:
Train Loss: 1.5539 | Val Loss: 1.3861


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 4/10:
Train Loss: 1.3766 | Val Loss: 1.2354


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.57it/s]


Epoch 5/10:
Train Loss: 1.2399 | Val Loss: 1.1076


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 6/10:
Train Loss: 1.1135 | Val Loss: 0.9873


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 7/10:
Train Loss: 0.9877 | Val Loss: 0.8825


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 8/10:
Train Loss: 0.8735 | Val Loss: 0.7565


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.57it/s]


Epoch 9/10:
Train Loss: 0.7727 | Val Loss: 0.6597


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.54it/s]
[I 2025-05-09 23:41:22,098] Trial 38 finished with value: 0.5989935639358702 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.16111256762581597, 'learning_rate': 0.00047200902053927076, 'batch_size': 32}. Best is trial 38 with value: 0.5989935639358702.


Epoch 10/10:
Train Loss: 0.6800 | Val Loss: 0.5990
New best model found! Val Loss: 0.5990
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.16111256762581597, 'learning_rate': 0.00047200902053927076, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.51it/s]


Epoch 1/10:
Train Loss: 2.3027 | Val Loss: 2.0088


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 2/10:
Train Loss: 1.9619 | Val Loss: 1.8031


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.56it/s]


Epoch 3/10:
Train Loss: 1.8101 | Val Loss: 1.6826


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.64it/s]


Epoch 4/10:
Train Loss: 1.6904 | Val Loss: 1.5698


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 5/10:
Train Loss: 1.5931 | Val Loss: 1.5010


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]


Epoch 6/10:
Train Loss: 1.5138 | Val Loss: 1.4098


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.56it/s]


Epoch 7/10:
Train Loss: 1.4428 | Val Loss: 1.4236


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.56it/s]


Epoch 8/10:
Train Loss: 1.3809 | Val Loss: 1.2821


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.65it/s]


Epoch 9/10:
Train Loss: 1.3231 | Val Loss: 1.2729


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.50it/s]
[I 2025-05-09 23:46:50,009] Trial 39 finished with value: 1.1781352607030717 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.28406028398725697, 'learning_rate': 0.0004949328859598576, 'batch_size': 32}. Best is trial 38 with value: 0.5989935639358702.


Epoch 10/10:
Train Loss: 1.2697 | Val Loss: 1.1781


Training: 100%|██████████| 250/250 [00:59<00:00,  4.22it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.89it/s]


Epoch 1/10:
Train Loss: 3.0092 | Val Loss: 3.1543


Training: 100%|██████████| 250/250 [00:59<00:00,  4.22it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.89it/s]


Epoch 2/10:
Train Loss: 2.9465 | Val Loss: 3.5624


Training: 100%|██████████| 250/250 [00:59<00:00,  4.22it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.88it/s]


Epoch 3/10:
Train Loss: 2.9236 | Val Loss: 3.6533


Training: 100%|██████████| 250/250 [00:59<00:00,  4.22it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 11.87it/s]
[I 2025-05-09 23:51:08,437] Trial 40 finished with value: 3.1543269914294045 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.33213285794184083, 'learning_rate': 0.0013215666288564128, 'batch_size': 32}. Best is trial 38 with value: 0.5989935639358702.


Epoch 4/10:
Train Loss: 2.9177 | Val Loss: 3.6968
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:29<00:00,  8.35it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.63it/s]


Epoch 1/10:
Train Loss: 2.2327 | Val Loss: 1.9027


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]


Epoch 2/10:
Train Loss: 1.8494 | Val Loss: 1.7334


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 3/10:
Train Loss: 1.6842 | Val Loss: 1.5785


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 4/10:
Train Loss: 1.5611 | Val Loss: 1.5028


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.63it/s]


Epoch 5/10:
Train Loss: 1.4648 | Val Loss: 1.3857


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.51it/s]


Epoch 6/10:
Train Loss: 1.3811 | Val Loss: 1.3221


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.63it/s]


Epoch 7/10:
Train Loss: 1.3091 | Val Loss: 1.2528


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.63it/s]


Epoch 8/10:
Train Loss: 1.2375 | Val Loss: 1.2139


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.55it/s]


Epoch 9/10:
Train Loss: 1.1724 | Val Loss: 1.1338


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]
[I 2025-05-09 23:56:36,194] Trial 41 finished with value: 1.078269208234454 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1590811541968748, 'learning_rate': 0.0007937464845492947, 'batch_size': 32}. Best is trial 38 with value: 0.5989935639358702.


Epoch 10/10:
Train Loss: 1.1079 | Val Loss: 1.0783


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.62it/s]


Epoch 1/10:
Train Loss: 2.2305 | Val Loss: 1.8953


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 2/10:
Train Loss: 1.7827 | Val Loss: 1.6101


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.47it/s]


Epoch 3/10:
Train Loss: 1.5732 | Val Loss: 1.4137


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.61it/s]


Epoch 4/10:
Train Loss: 1.3901 | Val Loss: 1.2494


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.62it/s]


Epoch 5/10:
Train Loss: 1.2363 | Val Loss: 1.1140


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.57it/s]


Epoch 6/10:
Train Loss: 1.0876 | Val Loss: 0.9473


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.47it/s]


Epoch 7/10:
Train Loss: 0.9358 | Val Loss: 0.7945


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.49it/s]


Epoch 8/10:
Train Loss: 0.7979 | Val Loss: 0.6712


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.56it/s]


Epoch 9/10:
Train Loss: 0.6830 | Val Loss: 0.5875


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.53it/s]
[I 2025-05-10 00:02:04,370] Trial 42 finished with value: 0.5212223577120948 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.13890358426055002, 'learning_rate': 0.0005209684802796529, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 0.5848 | Val Loss: 0.5212
New best model found! Val Loss: 0.5212
Config: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.13890358426055002, 'learning_rate': 0.0005209684802796529, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.83it/s]


Epoch 1/10:
Train Loss: 3.0095 | Val Loss: 2.9791


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.84it/s]


Epoch 2/10:
Train Loss: 2.9791 | Val Loss: 3.0688


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.82it/s]


Epoch 3/10:
Train Loss: 2.9567 | Val Loss: 3.3848


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.80it/s]
[I 2025-05-10 00:04:15,531] Trial 43 finished with value: 2.9790564870077465 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.12963082230610562, 'learning_rate': 0.002288498273524004, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 4/10:
Train Loss: 2.9405 | Val Loss: 3.4073
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.58it/s]


Epoch 1/10:
Train Loss: 2.2421 | Val Loss: 1.9238


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.51it/s]


Epoch 2/10:
Train Loss: 1.8206 | Val Loss: 1.6322


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.49it/s]


Epoch 3/10:
Train Loss: 1.6136 | Val Loss: 1.4498


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.40it/s]


Epoch 4/10:
Train Loss: 1.4461 | Val Loss: 1.2983


Training: 100%|██████████| 250/250 [00:29<00:00,  8.34it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.59it/s]


Epoch 5/10:
Train Loss: 1.3131 | Val Loss: 1.1669


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.51it/s]


Epoch 6/10:
Train Loss: 1.1856 | Val Loss: 1.0355


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.46it/s]


Epoch 7/10:
Train Loss: 1.0564 | Val Loss: 0.8940


Training: 100%|██████████| 250/250 [00:30<00:00,  8.32it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.47it/s]


Epoch 8/10:
Train Loss: 0.9323 | Val Loss: 0.7855


Training: 100%|██████████| 250/250 [00:29<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.50it/s]


Epoch 9/10:
Train Loss: 0.8249 | Val Loss: 0.7073


Training: 100%|██████████| 250/250 [00:30<00:00,  8.33it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.42it/s]
[I 2025-05-10 00:09:43,752] Trial 44 finished with value: 0.6243769767738524 and parameters: {'d_model': 256, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1885588060783291, 'learning_rate': 0.0004565976696878553, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 0.7304 | Val Loss: 0.6244


Training: 100%|██████████| 250/250 [00:21<00:00, 11.56it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.91it/s]


Epoch 1/10:
Train Loss: 2.4363 | Val Loss: 2.0879


Training: 100%|██████████| 250/250 [00:21<00:00, 11.56it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.77it/s]


Epoch 2/10:
Train Loss: 2.0514 | Val Loss: 1.9005


Training: 100%|██████████| 250/250 [00:21<00:00, 11.55it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.89it/s]


Epoch 3/10:
Train Loss: 1.9002 | Val Loss: 1.7690


Training: 100%|██████████| 250/250 [00:21<00:00, 11.54it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.83it/s]


Epoch 4/10:
Train Loss: 1.7784 | Val Loss: 1.6483


Training: 100%|██████████| 250/250 [00:21<00:00, 11.53it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.83it/s]


Epoch 5/10:
Train Loss: 1.6876 | Val Loss: 1.5730


Training: 100%|██████████| 250/250 [00:21<00:00, 11.55it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.90it/s]


Epoch 6/10:
Train Loss: 1.6121 | Val Loss: 1.4921


Training: 100%|██████████| 250/250 [00:21<00:00, 11.56it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.76it/s]


Epoch 7/10:
Train Loss: 1.5527 | Val Loss: 1.4502


Training: 100%|██████████| 250/250 [00:21<00:00, 11.54it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.78it/s]


Epoch 8/10:
Train Loss: 1.4997 | Val Loss: 1.4057


Training: 100%|██████████| 250/250 [00:21<00:00, 11.54it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.86it/s]


Epoch 9/10:
Train Loss: 1.4537 | Val Loss: 1.3582


Training: 100%|██████████| 250/250 [00:21<00:00, 11.52it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.55it/s]
[I 2025-05-10 00:13:40,869] Trial 45 finished with value: 1.3060478236940172 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.20666584931194765, 'learning_rate': 0.00043177641042276476, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 1.4099 | Val Loss: 1.3060


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.26it/s]


Epoch 1/10:
Train Loss: 2.2893 | Val Loss: 1.9484


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.35it/s]


Epoch 2/10:
Train Loss: 1.8655 | Val Loss: 1.6871


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.98it/s]


Epoch 3/10:
Train Loss: 1.6425 | Val Loss: 1.4757


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.26it/s]


Epoch 4/10:
Train Loss: 1.4618 | Val Loss: 1.3175


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.13it/s]


Epoch 5/10:
Train Loss: 1.3250 | Val Loss: 1.1759


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.25it/s]


Epoch 6/10:
Train Loss: 1.2049 | Val Loss: 1.0650


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.12it/s]


Epoch 7/10:
Train Loss: 1.0917 | Val Loss: 0.9531


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.29it/s]


Epoch 8/10:
Train Loss: 0.9821 | Val Loss: 0.8441


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.28it/s]


Epoch 9/10:
Train Loss: 0.8820 | Val Loss: 0.7765


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.26it/s]
[I 2025-05-10 00:17:43,279] Trial 46 finished with value: 0.7063389940867348 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.13844035584271566, 'learning_rate': 0.00028550276470407744, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 0.7987 | Val Loss: 0.7063


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.25it/s]


Epoch 1/10:
Train Loss: 2.2484 | Val Loss: 1.9297


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.18it/s]


Epoch 2/10:
Train Loss: 1.8281 | Val Loss: 1.6459


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.26it/s]


Epoch 3/10:
Train Loss: 1.6113 | Val Loss: 1.4518


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.27it/s]


Epoch 4/10:
Train Loss: 1.4242 | Val Loss: 1.2558


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.31it/s]


Epoch 5/10:
Train Loss: 1.2731 | Val Loss: 1.1109


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.29it/s]


Epoch 6/10:
Train Loss: 1.1321 | Val Loss: 0.9683


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.28it/s]


Epoch 7/10:
Train Loss: 0.9951 | Val Loss: 0.8213


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.28it/s]


Epoch 8/10:
Train Loss: 0.8827 | Val Loss: 0.7242


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.29it/s]


Epoch 9/10:
Train Loss: 0.7820 | Val Loss: 0.6358


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.12it/s]
[I 2025-05-10 00:21:45,667] Trial 47 finished with value: 0.5773443991229648 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1793860423711977, 'learning_rate': 0.0005266268434549198, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 0.6961 | Val Loss: 0.5773


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.22it/s]


Epoch 1/10:
Train Loss: 2.2613 | Val Loss: 1.9192


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.01it/s]


Epoch 2/10:
Train Loss: 1.8724 | Val Loss: 1.7453


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.10it/s]


Epoch 3/10:
Train Loss: 1.7082 | Val Loss: 1.5903


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch 4/10:
Train Loss: 1.5742 | Val Loss: 1.4987


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.08it/s]


Epoch 5/10:
Train Loss: 1.4670 | Val Loss: 1.3868


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.40it/s]


Epoch 6/10:
Train Loss: 1.3717 | Val Loss: 1.2781


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.40it/s]


Epoch 7/10:
Train Loss: 1.2878 | Val Loss: 1.2031


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]


Epoch 8/10:
Train Loss: 1.2065 | Val Loss: 1.1243


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.09it/s]


Epoch 9/10:
Train Loss: 1.1216 | Val Loss: 1.0278


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]
[I 2025-05-10 00:26:19,793] Trial 48 finished with value: 0.9383213217296298 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1884950444461152, 'learning_rate': 0.0005049716280350072, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 1.0345 | Val Loss: 0.9383


Training: 100%|██████████| 250/250 [00:26<00:00,  9.50it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.00it/s]


Epoch 1/10:
Train Loss: 2.4655 | Val Loss: 2.1045


Training: 100%|██████████| 250/250 [00:26<00:00,  9.46it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]


Epoch 2/10:
Train Loss: 2.0687 | Val Loss: 1.9266


Training: 100%|██████████| 250/250 [00:26<00:00,  9.45it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.82it/s]


Epoch 3/10:
Train Loss: 1.9225 | Val Loss: 1.7966


Training: 100%|██████████| 250/250 [00:26<00:00,  9.47it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 4/10:
Train Loss: 1.8285 | Val Loss: 1.7808


Training: 100%|██████████| 250/250 [00:26<00:00,  9.42it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.80it/s]


Epoch 5/10:
Train Loss: 1.7573 | Val Loss: 1.6659


Training: 100%|██████████| 250/250 [00:26<00:00,  9.48it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.04it/s]


Epoch 6/10:
Train Loss: 1.6975 | Val Loss: 1.6047


Training: 100%|██████████| 250/250 [00:26<00:00,  9.40it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.71it/s]


Epoch 7/10:
Train Loss: 1.6441 | Val Loss: 1.5547


Training: 100%|██████████| 250/250 [00:26<00:00,  9.41it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.82it/s]


Epoch 8/10:
Train Loss: 1.5977 | Val Loss: 1.5813


Training: 100%|██████████| 250/250 [00:26<00:00,  9.45it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.53it/s]


Epoch 9/10:
Train Loss: 1.5528 | Val Loss: 1.4639


Training: 100%|██████████| 250/250 [00:26<00:00,  9.44it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.48it/s]
[I 2025-05-10 00:31:09,106] Trial 49 finished with value: 1.463872029667809 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 12, 'd_ff': 256, 'dropout': 0.23956529448028663, 'learning_rate': 0.00040771632624197785, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 1.5117 | Val Loss: 1.4762


Training: 100%|██████████| 250/250 [00:50<00:00,  4.96it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.91it/s]


Epoch 1/10:
Train Loss: 2.9393 | Val Loss: 2.3991


Training: 100%|██████████| 250/250 [00:50<00:00,  4.95it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.87it/s]


Epoch 2/10:
Train Loss: 2.3280 | Val Loss: 2.5965


Training: 100%|██████████| 250/250 [00:50<00:00,  4.95it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.94it/s]


Epoch 3/10:
Train Loss: 2.2514 | Val Loss: 3.0616


Training: 100%|██████████| 250/250 [00:50<00:00,  4.95it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 12.97it/s]
[I 2025-05-10 00:34:50,729] Trial 50 finished with value: 2.3991034068758528 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.2112702021398985, 'learning_rate': 0.0012407638575130096, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 4/10:
Train Loss: 2.2172 | Val Loss: 3.3889
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.20it/s]


Epoch 1/10:
Train Loss: 2.2880 | Val Loss: 1.9944


Training: 100%|██████████| 250/250 [00:22<00:00, 11.31it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.29it/s]


Epoch 2/10:
Train Loss: 1.8987 | Val Loss: 1.7852


Training: 100%|██████████| 250/250 [00:22<00:00, 11.31it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.30it/s]


Epoch 3/10:
Train Loss: 1.7662 | Val Loss: 1.6755


Training: 100%|██████████| 250/250 [00:22<00:00, 11.32it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.39it/s]


Epoch 4/10:
Train Loss: 1.6765 | Val Loss: 1.6232


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.28it/s]


Epoch 5/10:
Train Loss: 1.5990 | Val Loss: 1.5181


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.23it/s]


Epoch 6/10:
Train Loss: 1.5241 | Val Loss: 1.4699


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.25it/s]


Epoch 7/10:
Train Loss: 1.4590 | Val Loss: 1.4097


Training: 100%|██████████| 250/250 [00:22<00:00, 11.31it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.28it/s]


Epoch 8/10:
Train Loss: 1.3988 | Val Loss: 1.3231


Training: 100%|██████████| 250/250 [00:22<00:00, 11.31it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.17it/s]


Epoch 9/10:
Train Loss: 1.3340 | Val Loss: 1.2714


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.18it/s]
[I 2025-05-10 00:38:52,866] Trial 51 finished with value: 1.2166792040779477 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.16655018822155984, 'learning_rate': 0.0007900037555359251, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 1.2707 | Val Loss: 1.2167


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 29.98it/s]


Epoch 1/10:
Train Loss: 2.2815 | Val Loss: 1.9672


Training: 100%|██████████| 250/250 [00:22<00:00, 11.27it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.10it/s]


Epoch 2/10:
Train Loss: 1.8723 | Val Loss: 1.6906


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.14it/s]


Epoch 3/10:
Train Loss: 1.6673 | Val Loss: 1.5191


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.08it/s]


Epoch 4/10:
Train Loss: 1.5082 | Val Loss: 1.3471


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.20it/s]


Epoch 5/10:
Train Loss: 1.3729 | Val Loss: 1.2187


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.23it/s]


Epoch 6/10:
Train Loss: 1.2519 | Val Loss: 1.0818


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.20it/s]


Epoch 7/10:
Train Loss: 1.1361 | Val Loss: 0.9854


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.16it/s]


Epoch 8/10:
Train Loss: 1.0381 | Val Loss: 0.8829


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.17it/s]


Epoch 9/10:
Train Loss: 0.9421 | Val Loss: 0.7992


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.25it/s]
[I 2025-05-10 00:42:55,423] Trial 52 finished with value: 0.7264760231214856 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.17361265537136386, 'learning_rate': 0.0003102131155122711, 'batch_size': 32}. Best is trial 42 with value: 0.5212223577120948.


Epoch 10/10:
Train Loss: 0.8593 | Val Loss: 0.7265


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.28it/s]


Epoch 1/10:
Train Loss: 2.2169 | Val Loss: 1.8716


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.26it/s]


Epoch 2/10:
Train Loss: 1.7768 | Val Loss: 1.6019


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.25it/s]


Epoch 3/10:
Train Loss: 1.5517 | Val Loss: 1.4065


Training: 100%|██████████| 250/250 [00:22<00:00, 11.31it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.27it/s]


Epoch 4/10:
Train Loss: 1.3625 | Val Loss: 1.2162


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.22it/s]


Epoch 5/10:
Train Loss: 1.1781 | Val Loss: 1.0187


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.19it/s]


Epoch 6/10:
Train Loss: 1.0026 | Val Loss: 0.8432


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.26it/s]


Epoch 7/10:
Train Loss: 0.8474 | Val Loss: 0.6947


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.27it/s]


Epoch 8/10:
Train Loss: 0.7118 | Val Loss: 0.6080


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.30it/s]


Epoch 9/10:
Train Loss: 0.6114 | Val Loss: 0.5262


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.33it/s]
[I 2025-05-10 00:46:57,811] Trial 53 finished with value: 0.4700953596168094 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1182772347458643, 'learning_rate': 0.0005607918906269759, 'batch_size': 32}. Best is trial 53 with value: 0.4700953596168094.


Epoch 10/10:
Train Loss: 0.5252 | Val Loss: 0.4701
New best model found! Val Loss: 0.4701
Config: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1182772347458643, 'learning_rate': 0.0005607918906269759, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:41<00:00,  6.07it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.59it/s]


Epoch 1/10:
Train Loss: 2.5634 | Val Loss: 2.1383


Training: 100%|██████████| 250/250 [00:41<00:00,  6.01it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.80it/s]


Epoch 2/10:
Train Loss: 2.1065 | Val Loss: 1.9463


Training: 100%|██████████| 250/250 [00:41<00:00,  6.01it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.57it/s]


Epoch 3/10:
Train Loss: 1.9634 | Val Loss: 1.8360


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.77it/s]


Epoch 4/10:
Train Loss: 1.8738 | Val Loss: 1.7616


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.74it/s]


Epoch 5/10:
Train Loss: 1.8062 | Val Loss: 1.7072


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.73it/s]


Epoch 6/10:
Train Loss: 1.7519 | Val Loss: 1.6740


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.83it/s]


Epoch 7/10:
Train Loss: 1.7077 | Val Loss: 1.6448


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.63it/s]


Epoch 8/10:
Train Loss: 1.6692 | Val Loss: 1.5837


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.75it/s]


Epoch 9/10:
Train Loss: 1.6311 | Val Loss: 1.5547


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.79it/s]
[I 2025-05-10 00:54:30,892] Trial 54 finished with value: 1.5445851276791285 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.3989209080072861, 'learning_rate': 0.0003843561268104049, 'batch_size': 32}. Best is trial 53 with value: 0.4700953596168094.


Epoch 10/10:
Train Loss: 1.6022 | Val Loss: 1.5446


Training: 100%|██████████| 250/250 [00:22<00:00, 11.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.18it/s]


Epoch 1/10:
Train Loss: 2.2099 | Val Loss: 1.8477


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.22it/s]


Epoch 2/10:
Train Loss: 1.7110 | Val Loss: 1.4741


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.04it/s]


Epoch 3/10:
Train Loss: 1.4255 | Val Loss: 1.2451


Training: 100%|██████████| 250/250 [00:22<00:00, 11.28it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.20it/s]


Epoch 4/10:
Train Loss: 1.2077 | Val Loss: 1.0283


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.16it/s]


Epoch 5/10:
Train Loss: 1.0163 | Val Loss: 0.8584


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.16it/s]


Epoch 6/10:
Train Loss: 0.8603 | Val Loss: 0.7271


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.22it/s]


Epoch 7/10:
Train Loss: 0.7299 | Val Loss: 0.6203


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.27it/s]


Epoch 8/10:
Train Loss: 0.6248 | Val Loss: 0.5599


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.07it/s]


Epoch 9/10:
Train Loss: 0.5453 | Val Loss: 0.5033


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.27it/s]
[I 2025-05-10 00:58:33,525] Trial 55 finished with value: 0.45232788295972914 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1162971636150338, 'learning_rate': 0.0005318558322094272, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.4777 | Val Loss: 0.4523
New best model found! Val Loss: 0.4523
Config: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.1162971636150338, 'learning_rate': 0.0005318558322094272, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:30<00:00,  8.24it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.12it/s]


Epoch 1/10:
Train Loss: 2.3617 | Val Loss: 1.9718


Training: 100%|██████████| 250/250 [00:30<00:00,  8.25it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.24it/s]


Epoch 2/10:
Train Loss: 1.8989 | Val Loss: 1.8270


Training: 100%|██████████| 250/250 [00:30<00:00,  8.26it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.40it/s]


Epoch 3/10:
Train Loss: 1.7396 | Val Loss: 1.8235


Training: 100%|██████████| 250/250 [00:30<00:00,  8.24it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.30it/s]


Epoch 4/10:
Train Loss: 1.6448 | Val Loss: 2.1341


Training: 100%|██████████| 250/250 [00:30<00:00,  8.25it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.29it/s]


Epoch 5/10:
Train Loss: 1.5715 | Val Loss: 2.0831


Training: 100%|██████████| 250/250 [00:30<00:00,  8.25it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.41it/s]
[I 2025-05-10 01:01:52,533] Trial 56 finished with value: 1.8235365738944402 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.11413543493683063, 'learning_rate': 0.0008150796938520326, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 6/10:
Train Loss: 1.5155 | Val Loss: 2.3293
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:13<00:00, 18.20it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 50.15it/s]


Epoch 1/10:
Train Loss: 2.3243 | Val Loss: 1.9946


Training: 100%|██████████| 250/250 [00:13<00:00, 18.30it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 50.31it/s]


Epoch 2/10:
Train Loss: 1.9014 | Val Loss: 1.7167


Training: 100%|██████████| 250/250 [00:13<00:00, 18.33it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 49.78it/s]


Epoch 3/10:
Train Loss: 1.7041 | Val Loss: 1.5484


Training: 100%|██████████| 250/250 [00:13<00:00, 18.27it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 50.53it/s]


Epoch 4/10:
Train Loss: 1.5719 | Val Loss: 1.4479


Training: 100%|██████████| 250/250 [00:13<00:00, 18.16it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 49.86it/s]


Epoch 5/10:
Train Loss: 1.4624 | Val Loss: 1.3461


Training: 100%|██████████| 250/250 [00:13<00:00, 18.13it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 49.85it/s]


Epoch 6/10:
Train Loss: 1.3618 | Val Loss: 1.2352


Training: 100%|██████████| 250/250 [00:13<00:00, 18.21it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 50.14it/s]


Epoch 7/10:
Train Loss: 1.2719 | Val Loss: 1.1455


Training: 100%|██████████| 250/250 [00:13<00:00, 18.05it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 50.14it/s]


Epoch 8/10:
Train Loss: 1.1818 | Val Loss: 1.0412


Training: 100%|██████████| 250/250 [00:13<00:00, 18.09it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 49.63it/s]


Epoch 9/10:
Train Loss: 1.0857 | Val Loss: 0.9476


Training: 100%|██████████| 250/250 [00:13<00:00, 18.09it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 49.84it/s]
[I 2025-05-10 01:04:22,741] Trial 57 finished with value: 0.8422559736266969 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.11376457689115603, 'learning_rate': 0.0005542839428349746, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.9931 | Val Loss: 0.8423


Training: 100%|██████████| 250/250 [00:22<00:00, 11.31it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.43it/s]


Epoch 1/10:
Train Loss: 2.2725 | Val Loss: 1.9844


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.35it/s]


Epoch 2/10:
Train Loss: 1.8942 | Val Loss: 1.7246


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.31it/s]


Epoch 3/10:
Train Loss: 1.6949 | Val Loss: 1.6069


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.33it/s]


Epoch 4/10:
Train Loss: 1.5535 | Val Loss: 1.4040


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.26it/s]


Epoch 5/10:
Train Loss: 1.4243 | Val Loss: 1.2981


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.11it/s]


Epoch 6/10:
Train Loss: 1.3161 | Val Loss: 1.1685


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.32it/s]


Epoch 7/10:
Train Loss: 1.2093 | Val Loss: 1.0519


Training: 100%|██████████| 250/250 [00:22<00:00, 11.30it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.34it/s]


Epoch 8/10:
Train Loss: 1.1059 | Val Loss: 0.9342


Training: 100%|██████████| 250/250 [00:22<00:00, 11.31it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.21it/s]


Epoch 9/10:
Train Loss: 0.9953 | Val Loss: 0.8482


Training: 100%|██████████| 250/250 [00:22<00:00, 11.29it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 30.21it/s]
[I 2025-05-10 01:08:24,946] Trial 58 finished with value: 0.7524760365486145 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 256, 'dropout': 0.19715062427588245, 'learning_rate': 0.0004698926806248208, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.8998 | Val Loss: 0.7525


Training: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.54it/s]


Epoch 1/10:
Train Loss: 3.0104 | Val Loss: 2.9777


Training: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.64it/s]


Epoch 2/10:
Train Loss: 2.9730 | Val Loss: 3.1440


Training: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.56it/s]


Epoch 3/10:
Train Loss: 2.9521 | Val Loss: 3.4994


Training: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.55it/s]
[I 2025-05-10 01:12:06,611] Trial 59 finished with value: 2.9777108071342346 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 10, 'd_ff': 1024, 'dropout': 0.11578540359688202, 'learning_rate': 0.0007504655651248603, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 4/10:
Train Loss: 2.9395 | Val Loss: 3.6272
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.91it/s]


Epoch 1/10:
Train Loss: 2.2394 | Val Loss: 1.9022


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.09it/s]


Epoch 2/10:
Train Loss: 1.8112 | Val Loss: 1.6439


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch 3/10:
Train Loss: 1.5821 | Val Loss: 1.4313


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 4/10:
Train Loss: 1.3983 | Val Loss: 1.2380


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.17it/s]


Epoch 5/10:
Train Loss: 1.2446 | Val Loss: 1.0973


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.17it/s]


Epoch 6/10:
Train Loss: 1.0960 | Val Loss: 0.9362


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.30it/s]


Epoch 7/10:
Train Loss: 0.9412 | Val Loss: 0.8004


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.22it/s]


Epoch 8/10:
Train Loss: 0.8105 | Val Loss: 0.6824


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.10it/s]


Epoch 9/10:
Train Loss: 0.6924 | Val Loss: 0.6086


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.36it/s]
[I 2025-05-10 01:16:40,798] Trial 60 finished with value: 0.5123373908655984 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.13497856817927345, 'learning_rate': 0.0003757525495074411, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.5949 | Val Loss: 0.5123


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.00it/s]


Epoch 1/10:
Train Loss: 3.0097 | Val Loss: 2.9814


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.09it/s]


Epoch 2/10:
Train Loss: 2.9793 | Val Loss: 2.9811


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.16it/s]


Epoch 3/10:
Train Loss: 2.9777 | Val Loss: 2.9769


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.22it/s]


Epoch 4/10:
Train Loss: 2.9768 | Val Loss: 2.9821


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.10it/s]


Epoch 5/10:
Train Loss: 2.9762 | Val Loss: 2.9802


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.06it/s]


Epoch 6/10:
Train Loss: 2.9755 | Val Loss: 2.9762


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.15it/s]


Epoch 7/10:
Train Loss: 2.9758 | Val Loss: 2.9749


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.97it/s]


Epoch 8/10:
Train Loss: 2.9755 | Val Loss: 2.9733


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.18it/s]


Epoch 9/10:
Train Loss: 2.9748 | Val Loss: 2.9725


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.18it/s]
[I 2025-05-10 01:21:14,097] Trial 61 finished with value: 2.9725342705136253 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1357567443619642, 'learning_rate': 0.008173721366368479, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 2.9753 | Val Loss: 2.9736


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.49it/s]


Epoch 1/10:
Train Loss: 2.2167 | Val Loss: 1.8819


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]


Epoch 2/10:
Train Loss: 1.7848 | Val Loss: 1.6374


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.50it/s]


Epoch 3/10:
Train Loss: 1.5649 | Val Loss: 1.4082


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.58it/s]


Epoch 4/10:
Train Loss: 1.3893 | Val Loss: 1.2540


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.49it/s]


Epoch 5/10:
Train Loss: 1.2262 | Val Loss: 1.0859


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.41it/s]


Epoch 6/10:
Train Loss: 1.0705 | Val Loss: 0.9351


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.47it/s]


Epoch 7/10:
Train Loss: 0.9171 | Val Loss: 0.7907


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.48it/s]


Epoch 8/10:
Train Loss: 0.7798 | Val Loss: 0.6731


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.51it/s]


Epoch 9/10:
Train Loss: 0.6648 | Val Loss: 0.5704


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.40it/s]
[I 2025-05-10 01:25:47,877] Trial 62 finished with value: 0.4787990280560085 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12141778060320159, 'learning_rate': 0.000385535382865101, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.5632 | Val Loss: 0.4788


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.38it/s]


Epoch 1/10:
Train Loss: 2.2192 | Val Loss: 1.9050


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.52it/s]


Epoch 2/10:
Train Loss: 1.7838 | Val Loss: 1.5768


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.31it/s]


Epoch 3/10:
Train Loss: 1.5189 | Val Loss: 1.3344


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.48it/s]


Epoch 4/10:
Train Loss: 1.3121 | Val Loss: 1.1676


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.50it/s]


Epoch 5/10:
Train Loss: 1.1427 | Val Loss: 0.9829


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.56it/s]


Epoch 6/10:
Train Loss: 0.9815 | Val Loss: 0.8405


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.49it/s]


Epoch 7/10:
Train Loss: 0.8390 | Val Loss: 0.7204


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.50it/s]


Epoch 8/10:
Train Loss: 0.7132 | Val Loss: 0.6355


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.44it/s]


Epoch 9/10:
Train Loss: 0.6136 | Val Loss: 0.5366


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.38it/s]
[I 2025-05-10 01:30:21,711] Trial 63 finished with value: 0.49323016310495044 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12207824561554104, 'learning_rate': 0.0003695919447207366, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.5294 | Val Loss: 0.4932


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.52it/s]


Epoch 1/10:
Train Loss: 2.2282 | Val Loss: 1.9255


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.52it/s]


Epoch 2/10:
Train Loss: 1.7905 | Val Loss: 1.5955


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.47it/s]


Epoch 3/10:
Train Loss: 1.5424 | Val Loss: 1.3673


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.39it/s]


Epoch 4/10:
Train Loss: 1.3374 | Val Loss: 1.1867


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.51it/s]


Epoch 5/10:
Train Loss: 1.1717 | Val Loss: 1.0160


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.48it/s]


Epoch 6/10:
Train Loss: 1.0063 | Val Loss: 0.8638


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.49it/s]


Epoch 7/10:
Train Loss: 0.8614 | Val Loss: 0.7253


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.52it/s]


Epoch 8/10:
Train Loss: 0.7322 | Val Loss: 0.6267


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.51it/s]


Epoch 9/10:
Train Loss: 0.6334 | Val Loss: 0.5668


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]
[I 2025-05-10 01:34:55,566] Trial 64 finished with value: 0.49286497774578275 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12007075025888009, 'learning_rate': 0.00035304674583478963, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.5455 | Val Loss: 0.4929


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.36it/s]


Epoch 1/10:
Train Loss: 2.2344 | Val Loss: 1.9008


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.53it/s]


Epoch 2/10:
Train Loss: 1.7862 | Val Loss: 1.6106


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.46it/s]


Epoch 3/10:
Train Loss: 1.5369 | Val Loss: 1.3623


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.37it/s]


Epoch 4/10:
Train Loss: 1.3354 | Val Loss: 1.1758


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.43it/s]


Epoch 5/10:
Train Loss: 1.1661 | Val Loss: 1.0094


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.50it/s]


Epoch 6/10:
Train Loss: 1.0019 | Val Loss: 0.8615


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.47it/s]


Epoch 7/10:
Train Loss: 0.8520 | Val Loss: 0.7380


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.44it/s]


Epoch 8/10:
Train Loss: 0.7244 | Val Loss: 0.6420


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.48it/s]


Epoch 9/10:
Train Loss: 0.6254 | Val Loss: 0.5410


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.46it/s]
[I 2025-05-10 01:39:29,427] Trial 65 finished with value: 0.5039291533212813 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.11925349171917993, 'learning_rate': 0.0003608961863511, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.5354 | Val Loss: 0.5039


Training: 100%|██████████| 250/250 [00:49<00:00,  5.02it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.82it/s]


Epoch 1/10:
Train Loss: 2.8747 | Val Loss: 2.2841


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.91it/s]


Epoch 2/10:
Train Loss: 2.0995 | Val Loss: 1.9375


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.93it/s]


Epoch 3/10:
Train Loss: 1.8707 | Val Loss: 1.7563


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.93it/s]


Epoch 4/10:
Train Loss: 1.7349 | Val Loss: 1.6800


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.94it/s]


Epoch 5/10:
Train Loss: 1.6461 | Val Loss: 1.6051


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.96it/s]


Epoch 6/10:
Train Loss: 1.5680 | Val Loss: 1.5670


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.95it/s]


Epoch 7/10:
Train Loss: 1.5048 | Val Loss: 1.6044


Training: 100%|██████████| 250/250 [00:49<00:00,  5.03it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.96it/s]


Epoch 8/10:
Train Loss: 1.4483 | Val Loss: 1.8706


Training: 100%|██████████| 250/250 [00:49<00:00,  5.02it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.97it/s]
[I 2025-05-10 01:47:37,897] Trial 66 finished with value: 1.566980174609593 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.1173380295100793, 'learning_rate': 0.000366283818052268, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 9/10:
Train Loss: 1.4031 | Val Loss: 2.2164
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:15<00:00, 16.64it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.99it/s]


Epoch 1/10:
Train Loss: 2.6356 | Val Loss: 2.2231


Training: 100%|██████████| 250/250 [00:15<00:00, 16.58it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.88it/s]


Epoch 2/10:
Train Loss: 2.1649 | Val Loss: 2.0400


Training: 100%|██████████| 250/250 [00:15<00:00, 16.63it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.92it/s]


Epoch 3/10:
Train Loss: 2.0429 | Val Loss: 1.9262


Training: 100%|██████████| 250/250 [00:15<00:00, 16.65it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.81it/s]


Epoch 4/10:
Train Loss: 1.9531 | Val Loss: 1.8296


Training: 100%|██████████| 250/250 [00:15<00:00, 16.61it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.58it/s]


Epoch 5/10:
Train Loss: 1.8759 | Val Loss: 1.7509


Training: 100%|██████████| 250/250 [00:15<00:00, 16.62it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.96it/s]


Epoch 6/10:
Train Loss: 1.8127 | Val Loss: 1.6934


Training: 100%|██████████| 250/250 [00:15<00:00, 16.65it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.67it/s]


Epoch 7/10:
Train Loss: 1.7525 | Val Loss: 1.6493


Training: 100%|██████████| 250/250 [00:15<00:00, 16.57it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.85it/s]


Epoch 8/10:
Train Loss: 1.6994 | Val Loss: 1.5890


Training: 100%|██████████| 250/250 [00:15<00:00, 16.61it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.90it/s]


Epoch 9/10:
Train Loss: 1.6519 | Val Loss: 1.5641


Training: 100%|██████████| 250/250 [00:15<00:00, 16.59it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.82it/s]
[I 2025-05-10 01:50:22,857] Trial 67 finished with value: 1.5051956782265314 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12802605062358996, 'learning_rate': 0.0001272497139016227, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 1.6123 | Val Loss: 1.5052


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.46it/s]


Epoch 1/10:
Train Loss: 2.2544 | Val Loss: 1.9433


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.37it/s]


Epoch 2/10:
Train Loss: 1.8235 | Val Loss: 1.6057


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.40it/s]


Epoch 3/10:
Train Loss: 1.5639 | Val Loss: 1.3911


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.30it/s]


Epoch 4/10:
Train Loss: 1.3808 | Val Loss: 1.2424


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.31it/s]


Epoch 5/10:
Train Loss: 1.2263 | Val Loss: 1.0907


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.52it/s]


Epoch 6/10:
Train Loss: 1.0765 | Val Loss: 0.9488


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.39it/s]


Epoch 7/10:
Train Loss: 0.9411 | Val Loss: 0.8345


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.36it/s]


Epoch 8/10:
Train Loss: 0.8230 | Val Loss: 0.7239


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.36it/s]


Epoch 9/10:
Train Loss: 0.7183 | Val Loss: 0.6564


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.47it/s]
[I 2025-05-10 01:54:56,787] Trial 68 finished with value: 0.5871323252481128 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10625404643138228, 'learning_rate': 0.0002549683699944533, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.6330 | Val Loss: 0.5871


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.53it/s]


Epoch 1/10:
Train Loss: 2.3094 | Val Loss: 1.9914


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.53it/s]


Epoch 2/10:
Train Loss: 1.9299 | Val Loss: 1.7766


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.58it/s]


Epoch 3/10:
Train Loss: 1.7353 | Val Loss: 1.6410


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.55it/s]


Epoch 4/10:
Train Loss: 1.5863 | Val Loss: 1.4836


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.49it/s]


Epoch 5/10:
Train Loss: 1.4666 | Val Loss: 1.4097


Training: 100%|██████████| 250/250 [00:41<00:00,  6.04it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.47it/s]


Epoch 6/10:
Train Loss: 1.3691 | Val Loss: 1.2967


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.51it/s]


Epoch 7/10:
Train Loss: 1.2785 | Val Loss: 1.2168


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.54it/s]


Epoch 8/10:
Train Loss: 1.1950 | Val Loss: 1.1587


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.45it/s]


Epoch 9/10:
Train Loss: 1.1124 | Val Loss: 1.0747


Training: 100%|██████████| 250/250 [00:41<00:00,  6.05it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.54it/s]
[I 2025-05-10 02:02:28,464] Trial 69 finished with value: 0.9998576915453351 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.11002034347868365, 'learning_rate': 0.00017683578250828774, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 1.0335 | Val Loss: 0.9999


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.46it/s]


Epoch 1/10:
Train Loss: 2.2317 | Val Loss: 1.9292


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.35it/s]


Epoch 2/10:
Train Loss: 1.8010 | Val Loss: 1.5981


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.37it/s]


Epoch 3/10:
Train Loss: 1.5714 | Val Loss: 1.4092


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.31it/s]


Epoch 4/10:
Train Loss: 1.3815 | Val Loss: 1.2470


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.40it/s]


Epoch 5/10:
Train Loss: 1.2266 | Val Loss: 1.0743


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.44it/s]


Epoch 6/10:
Train Loss: 1.0821 | Val Loss: 0.9480


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.41it/s]


Epoch 7/10:
Train Loss: 0.9483 | Val Loss: 0.8432


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.36it/s]


Epoch 8/10:
Train Loss: 0.8253 | Val Loss: 0.7342


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.50it/s]


Epoch 9/10:
Train Loss: 0.7197 | Val Loss: 0.6435


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.38it/s]
[I 2025-05-10 02:07:02,364] Trial 70 finished with value: 0.5680981193270002 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12284875618469387, 'learning_rate': 0.0003052282855478505, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.6318 | Val Loss: 0.5681


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.49it/s]


Epoch 1/10:
Train Loss: 2.2390 | Val Loss: 1.9133


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.59it/s]


Epoch 2/10:
Train Loss: 1.8015 | Val Loss: 1.5908


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.44it/s]


Epoch 3/10:
Train Loss: 1.5571 | Val Loss: 1.3791


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.42it/s]


Epoch 4/10:
Train Loss: 1.3710 | Val Loss: 1.2023


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.49it/s]


Epoch 5/10:
Train Loss: 1.2162 | Val Loss: 1.0709


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.50it/s]


Epoch 6/10:
Train Loss: 1.0662 | Val Loss: 0.9144


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.47it/s]


Epoch 7/10:
Train Loss: 0.9278 | Val Loss: 0.7972


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.47it/s]


Epoch 8/10:
Train Loss: 0.8010 | Val Loss: 0.6730


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.45it/s]


Epoch 9/10:
Train Loss: 0.6897 | Val Loss: 0.5866


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.39it/s]
[I 2025-05-10 02:11:36,214] Trial 71 finished with value: 0.516789669555331 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.13717431738406338, 'learning_rate': 0.0003546171946649882, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.5954 | Val Loss: 0.5168


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.23it/s]


Epoch 1/10:
Train Loss: 2.2419 | Val Loss: 1.9381


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.95it/s]


Epoch 2/10:
Train Loss: 1.8116 | Val Loss: 1.6191


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 3/10:
Train Loss: 1.5719 | Val Loss: 1.4164


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]


Epoch 4/10:
Train Loss: 1.3895 | Val Loss: 1.2511


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.27it/s]


Epoch 5/10:
Train Loss: 1.2380 | Val Loss: 1.1117


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.16it/s]


Epoch 6/10:
Train Loss: 1.0933 | Val Loss: 0.9590


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 7/10:
Train Loss: 0.9568 | Val Loss: 0.8172


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.02it/s]


Epoch 8/10:
Train Loss: 0.8254 | Val Loss: 0.7350


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.27it/s]


Epoch 9/10:
Train Loss: 0.7220 | Val Loss: 0.6305


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]
[I 2025-05-10 02:16:10,389] Trial 72 finished with value: 0.5674447795701405 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1324177409454542, 'learning_rate': 0.00035075169449191217, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.6285 | Val Loss: 0.5674


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.92it/s]


Epoch 1/10:
Train Loss: 2.2737 | Val Loss: 1.9675


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.93it/s]


Epoch 2/10:
Train Loss: 1.8589 | Val Loss: 1.6766


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.01it/s]


Epoch 3/10:
Train Loss: 1.6310 | Val Loss: 1.4640


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.03it/s]


Epoch 4/10:
Train Loss: 1.4552 | Val Loss: 1.3026


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.95it/s]


Epoch 5/10:
Train Loss: 1.3032 | Val Loss: 1.1504


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.91it/s]


Epoch 6/10:
Train Loss: 1.1597 | Val Loss: 1.0231


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.93it/s]


Epoch 7/10:
Train Loss: 1.0262 | Val Loss: 0.9015


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.10it/s]


Epoch 8/10:
Train Loss: 0.9055 | Val Loss: 0.7909


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.28it/s]


Epoch 9/10:
Train Loss: 0.8006 | Val Loss: 0.6941


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]
[I 2025-05-10 02:20:44,684] Trial 73 finished with value: 0.6420462448445577 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12280757287391297, 'learning_rate': 0.00026384678482180713, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.7046 | Val Loss: 0.6420


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.97it/s]


Epoch 1/10:
Train Loss: 2.2218 | Val Loss: 1.9192


Training: 100%|██████████| 250/250 [00:25<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.93it/s]


Epoch 2/10:
Train Loss: 1.7676 | Val Loss: 1.5858


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.05it/s]


Epoch 3/10:
Train Loss: 1.5087 | Val Loss: 1.3447


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.22it/s]


Epoch 4/10:
Train Loss: 1.3058 | Val Loss: 1.1769


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.04it/s]


Epoch 5/10:
Train Loss: 1.1233 | Val Loss: 0.9771


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 6/10:
Train Loss: 0.9477 | Val Loss: 0.8208


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.96it/s]


Epoch 7/10:
Train Loss: 0.7933 | Val Loss: 0.6787


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.21it/s]


Epoch 8/10:
Train Loss: 0.6599 | Val Loss: 0.5817


Training: 100%|██████████| 250/250 [00:25<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.16it/s]


Epoch 9/10:
Train Loss: 0.5544 | Val Loss: 0.4962


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]
[I 2025-05-10 02:25:18,953] Trial 74 finished with value: 0.45369571494677713 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10182610258127783, 'learning_rate': 0.0003892059426367067, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.4757 | Val Loss: 0.4537


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 1/10:
Train Loss: 2.2285 | Val Loss: 1.9360


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.26it/s]


Epoch 2/10:
Train Loss: 1.7829 | Val Loss: 1.6004


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]


Epoch 3/10:
Train Loss: 1.5464 | Val Loss: 1.4237


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.33it/s]


Epoch 4/10:
Train Loss: 1.3778 | Val Loss: 1.3079


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.32it/s]


Epoch 5/10:
Train Loss: 1.2402 | Val Loss: 1.2078


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.31it/s]


Epoch 6/10:
Train Loss: 1.1174 | Val Loss: 1.0619


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 7/10:
Train Loss: 0.9939 | Val Loss: 0.9418


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.27it/s]


Epoch 8/10:
Train Loss: 0.8586 | Val Loss: 0.8192


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.31it/s]


Epoch 9/10:
Train Loss: 0.7330 | Val Loss: 0.7192


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]
[I 2025-05-10 02:34:59,704] Trial 75 finished with value: 0.6176876175025153 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10195061508550907, 'learning_rate': 0.00040493726945829315, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.6186 | Val Loss: 0.6177


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.21it/s]


Epoch 1/10:
Train Loss: 2.2650 | Val Loss: 1.9100


Training: 100%|██████████| 250/250 [00:25<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.35it/s]


Epoch 2/10:
Train Loss: 1.8276 | Val Loss: 1.7080


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.41it/s]


Epoch 3/10:
Train Loss: 1.6703 | Val Loss: 1.5855


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.22it/s]


Epoch 4/10:
Train Loss: 1.5476 | Val Loss: 1.4552


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]


Epoch 5/10:
Train Loss: 1.4476 | Val Loss: 1.3920


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.38it/s]


Epoch 6/10:
Train Loss: 1.3492 | Val Loss: 1.2921


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.37it/s]


Epoch 7/10:
Train Loss: 1.2695 | Val Loss: 1.2381


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.23it/s]


Epoch 8/10:
Train Loss: 1.2010 | Val Loss: 1.1755


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.36it/s]


Epoch 9/10:
Train Loss: 1.1327 | Val Loss: 1.1403


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]
[I 2025-05-10 02:39:33,737] Trial 76 finished with value: 1.0640001287536016 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1117715157618645, 'learning_rate': 0.0005937773122788274, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 1.0732 | Val Loss: 1.0640


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.20it/s]


Epoch 1/10:
Train Loss: 2.2295 | Val Loss: 1.9255


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 2/10:
Train Loss: 1.7743 | Val Loss: 1.6245


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 3/10:
Train Loss: 1.5174 | Val Loss: 1.3579


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.10it/s]


Epoch 4/10:
Train Loss: 1.3248 | Val Loss: 1.1710


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]


Epoch 5/10:
Train Loss: 1.1597 | Val Loss: 1.0030


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.02it/s]


Epoch 6/10:
Train Loss: 1.0039 | Val Loss: 0.8675


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.18it/s]


Epoch 7/10:
Train Loss: 0.8646 | Val Loss: 0.7570


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 8/10:
Train Loss: 0.7509 | Val Loss: 0.6643


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.95it/s]


Epoch 9/10:
Train Loss: 0.6520 | Val Loss: 0.5860


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.84it/s]
[I 2025-05-10 02:44:07,940] Trial 77 finished with value: 0.5181988718963805 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12037660877980483, 'learning_rate': 0.0003252176521292491, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.5668 | Val Loss: 0.5182


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.18it/s]


Epoch 1/10:
Train Loss: 2.2999 | Val Loss: 1.9843


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.30it/s]


Epoch 2/10:
Train Loss: 1.8979 | Val Loss: 1.6867


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.01it/s]


Epoch 3/10:
Train Loss: 1.6768 | Val Loss: 1.5012


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.97it/s]


Epoch 4/10:
Train Loss: 1.5185 | Val Loss: 1.3989


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.08it/s]


Epoch 5/10:
Train Loss: 1.3989 | Val Loss: 1.2591


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.92it/s]


Epoch 6/10:
Train Loss: 1.2924 | Val Loss: 1.1650


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.91it/s]


Epoch 7/10:
Train Loss: 1.1937 | Val Loss: 1.0750


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 8/10:
Train Loss: 1.0954 | Val Loss: 0.9716


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.87it/s]


Epoch 9/10:
Train Loss: 0.9968 | Val Loss: 0.8734


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.90it/s]
[I 2025-05-10 02:48:42,224] Trial 78 finished with value: 0.7957694123661707 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.14401522988268517, 'learning_rate': 0.00020594025731647279, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 0.9081 | Val Loss: 0.7958


Training: 100%|██████████| 250/250 [00:49<00:00,  5.05it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.52it/s]


Epoch 1/10:
Train Loss: 2.3056 | Val Loss: 1.9595


Training: 100%|██████████| 250/250 [00:50<00:00,  4.99it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.51it/s]


Epoch 2/10:
Train Loss: 1.9019 | Val Loss: 1.7605


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.50it/s]


Epoch 3/10:
Train Loss: 1.7431 | Val Loss: 1.6689


Training: 100%|██████████| 250/250 [00:50<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.53it/s]


Epoch 4/10:
Train Loss: 1.6339 | Val Loss: 1.5679


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.50it/s]


Epoch 5/10:
Train Loss: 1.5421 | Val Loss: 1.4767


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.50it/s]


Epoch 6/10:
Train Loss: 1.4549 | Val Loss: 1.4158


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.52it/s]


Epoch 7/10:
Train Loss: 1.3776 | Val Loss: 1.3757


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.51it/s]


Epoch 8/10:
Train Loss: 1.3036 | Val Loss: 1.3439


Training: 100%|██████████| 250/250 [00:50<00:00,  4.99it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.48it/s]


Epoch 9/10:
Train Loss: 1.2424 | Val Loss: 1.2570


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.50it/s]
[I 2025-05-10 02:57:48,710] Trial 79 finished with value: 1.1843971212704976 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.13040354533782766, 'learning_rate': 0.00024431753396661834, 'batch_size': 32}. Best is trial 55 with value: 0.45232788295972914.


Epoch 10/10:
Train Loss: 1.1799 | Val Loss: 1.1844


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.22it/s]


Epoch 1/10:
Train Loss: 2.2217 | Val Loss: 1.8808


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.33it/s]


Epoch 2/10:
Train Loss: 1.7926 | Val Loss: 1.6219


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.17it/s]


Epoch 3/10:
Train Loss: 1.5483 | Val Loss: 1.3933


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.33it/s]


Epoch 4/10:
Train Loss: 1.3359 | Val Loss: 1.1845


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.33it/s]


Epoch 5/10:
Train Loss: 1.1534 | Val Loss: 1.0288


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.93it/s]


Epoch 6/10:
Train Loss: 0.9820 | Val Loss: 0.8533


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.09it/s]


Epoch 7/10:
Train Loss: 0.8188 | Val Loss: 0.7046


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.91it/s]


Epoch 8/10:
Train Loss: 0.6842 | Val Loss: 0.5848


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 9/10:
Train Loss: 0.5647 | Val Loss: 0.4965


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]
[I 2025-05-10 03:02:23,014] Trial 80 finished with value: 0.43736634131461855 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10213470498440091, 'learning_rate': 0.0004286912950025756, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.4735 | Val Loss: 0.4374
New best model found! Val Loss: 0.4374
Config: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10213470498440091, 'learning_rate': 0.0004286912950025756, 'batch_size': 32}


Training: 100%|██████████| 250/250 [00:24<00:00, 10.03it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]


Epoch 1/10:
Train Loss: 2.2244 | Val Loss: 1.9136


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.16it/s]


Epoch 2/10:
Train Loss: 1.8145 | Val Loss: 1.6647


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.09it/s]


Epoch 3/10:
Train Loss: 1.6172 | Val Loss: 1.5151


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.33it/s]


Epoch 4/10:
Train Loss: 1.4672 | Val Loss: 1.3786


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 5/10:
Train Loss: 1.3397 | Val Loss: 1.2616


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.18it/s]


Epoch 6/10:
Train Loss: 1.2178 | Val Loss: 1.1187


Training: 100%|██████████| 250/250 [00:25<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.27it/s]


Epoch 7/10:
Train Loss: 1.0858 | Val Loss: 0.9925


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 8/10:
Train Loss: 0.9591 | Val Loss: 0.8710


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 9/10:
Train Loss: 0.8303 | Val Loss: 0.7559


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.27it/s]
[I 2025-05-10 03:06:57,095] Trial 81 finished with value: 0.6455482277605269 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10194794112968582, 'learning_rate': 0.0004486751932944811, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.7161 | Val Loss: 0.6455


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.07it/s]


Epoch 1/10:
Train Loss: 2.2441 | Val Loss: 1.9314


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.73it/s]


Epoch 2/10:
Train Loss: 1.8020 | Val Loss: 1.6266


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.67it/s]


Epoch 3/10:
Train Loss: 1.5677 | Val Loss: 1.4203


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.73it/s]


Epoch 4/10:
Train Loss: 1.3814 | Val Loss: 1.2430


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 5/10:
Train Loss: 1.2273 | Val Loss: 1.0990


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.03it/s]


Epoch 6/10:
Train Loss: 1.0820 | Val Loss: 0.9575


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.92it/s]


Epoch 7/10:
Train Loss: 0.9407 | Val Loss: 0.8273


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.98it/s]


Epoch 8/10:
Train Loss: 0.8186 | Val Loss: 0.7191


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.17it/s]


Epoch 9/10:
Train Loss: 0.7118 | Val Loss: 0.6371


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.91it/s]
[I 2025-05-10 03:11:31,482] Trial 82 finished with value: 0.5749565187900786 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.11103261278574236, 'learning_rate': 0.0002871519746047061, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.6263 | Val Loss: 0.5750


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.28it/s]


Epoch 1/10:
Train Loss: 2.2259 | Val Loss: 1.8989


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 2/10:
Train Loss: 1.7692 | Val Loss: 1.5730


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.01it/s]


Epoch 3/10:
Train Loss: 1.5195 | Val Loss: 1.3446


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.91it/s]


Epoch 4/10:
Train Loss: 1.3259 | Val Loss: 1.1989


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]


Epoch 5/10:
Train Loss: 1.1539 | Val Loss: 1.0049


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 6/10:
Train Loss: 0.9879 | Val Loss: 0.8201


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]


Epoch 7/10:
Train Loss: 0.8287 | Val Loss: 0.7119


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.71it/s]


Epoch 8/10:
Train Loss: 0.6955 | Val Loss: 0.5973


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.93it/s]


Epoch 9/10:
Train Loss: 0.5867 | Val Loss: 0.5160


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]
[I 2025-05-10 03:16:05,755] Trial 83 finished with value: 0.4655762981800806 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.120673873846262, 'learning_rate': 0.0003912887996458808, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.5078 | Val Loss: 0.4656


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.55it/s]


Epoch 1/10:
Train Loss: 2.2822 | Val Loss: 1.9303


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.46it/s]


Epoch 2/10:
Train Loss: 1.8878 | Val Loss: 1.7730


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.48it/s]


Epoch 3/10:
Train Loss: 1.7548 | Val Loss: 1.6713


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.55it/s]


Epoch 4/10:
Train Loss: 1.6686 | Val Loss: 1.6059


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.54it/s]


Epoch 5/10:
Train Loss: 1.5991 | Val Loss: 1.5594


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.53it/s]


Epoch 6/10:
Train Loss: 1.5498 | Val Loss: 1.5279


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.55it/s]


Epoch 7/10:
Train Loss: 1.5077 | Val Loss: 1.4881


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.59it/s]


Epoch 8/10:
Train Loss: 1.4699 | Val Loss: 1.4699


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.59it/s]


Epoch 9/10:
Train Loss: 1.4369 | Val Loss: 1.4427


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.58it/s]
[I 2025-05-10 03:20:39,561] Trial 84 finished with value: 1.4255255366128587 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12018500840640034, 'learning_rate': 0.0006842283432323402, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 1.4055 | Val Loss: 1.4255


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.10it/s]


Epoch 1/10:
Train Loss: 2.2523 | Val Loss: 1.9570


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 2/10:
Train Loss: 1.8743 | Val Loss: 1.7383


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch 3/10:
Train Loss: 1.6858 | Val Loss: 1.5521


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.33it/s]


Epoch 4/10:
Train Loss: 1.5281 | Val Loss: 1.4086


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.17it/s]


Epoch 5/10:
Train Loss: 1.3946 | Val Loss: 1.2869


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.32it/s]


Epoch 6/10:
Train Loss: 1.2778 | Val Loss: 1.2261


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch 7/10:
Train Loss: 1.1661 | Val Loss: 1.0768


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 8/10:
Train Loss: 1.0508 | Val Loss: 0.9433


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.30it/s]


Epoch 9/10:
Train Loss: 0.9308 | Val Loss: 0.8197


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]
[I 2025-05-10 03:25:13,717] Trial 85 finished with value: 0.7106091550418309 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1463150860605231, 'learning_rate': 0.0004241896319381534, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.8137 | Val Loss: 0.7106


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]


Epoch 1/10:
Train Loss: 2.2306 | Val Loss: 1.9077


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.29it/s]


Epoch 2/10:
Train Loss: 1.8137 | Val Loss: 1.6737


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 3/10:
Train Loss: 1.6446 | Val Loss: 1.5655


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.31it/s]


Epoch 4/10:
Train Loss: 1.5063 | Val Loss: 1.4463


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.33it/s]


Epoch 5/10:
Train Loss: 1.3946 | Val Loss: 1.3770


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.31it/s]


Epoch 6/10:
Train Loss: 1.2999 | Val Loss: 1.2904


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.31it/s]


Epoch 7/10:
Train Loss: 1.2138 | Val Loss: 1.1823


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.28it/s]


Epoch 8/10:
Train Loss: 1.1338 | Val Loss: 1.1390


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.30it/s]


Epoch 9/10:
Train Loss: 1.0617 | Val Loss: 1.0923


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.25it/s]
[I 2025-05-10 03:34:54,094] Trial 86 finished with value: 1.0146043130329676 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12646198085702115, 'learning_rate': 0.0005796523821265069, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.9926 | Val Loss: 1.0146


Training: 100%|██████████| 250/250 [00:15<00:00, 16.56it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.67it/s]


Epoch 1/10:
Train Loss: 3.0048 | Val Loss: 2.9814


Training: 100%|██████████| 250/250 [00:15<00:00, 16.54it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.41it/s]


Epoch 2/10:
Train Loss: 2.9787 | Val Loss: 2.9766


Training: 100%|██████████| 250/250 [00:15<00:00, 16.53it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.45it/s]


Epoch 3/10:
Train Loss: 2.9747 | Val Loss: 3.0158


Training: 100%|██████████| 250/250 [00:15<00:00, 16.56it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.69it/s]


Epoch 4/10:
Train Loss: 2.9722 | Val Loss: 3.1066


Training: 100%|██████████| 250/250 [00:15<00:00, 16.57it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 44.81it/s]
[I 2025-05-10 03:36:16,768] Trial 87 finished with value: 2.976568411266993 and parameters: {'d_model': 128, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10088585979933135, 'learning_rate': 0.004340229477103834, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 5/10:
Train Loss: 2.9722 | Val Loss: 2.9942
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.95it/s]


Epoch 1/10:
Train Loss: 3.0089 | Val Loss: 2.9795


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.97it/s]


Epoch 2/10:
Train Loss: 2.9792 | Val Loss: 2.9761


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.12it/s]


Epoch 3/10:
Train Loss: 2.9769 | Val Loss: 2.9742


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.05it/s]


Epoch 4/10:
Train Loss: 2.9765 | Val Loss: 2.9741


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.01it/s]


Epoch 5/10:
Train Loss: 2.9755 | Val Loss: 2.9757


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.08it/s]


Epoch 6/10:
Train Loss: 2.9748 | Val Loss: 2.9745


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 27.04it/s]
[I 2025-05-10 03:39:28,139] Trial 88 finished with value: 2.974068861159067 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10790865316332127, 'learning_rate': 0.0033688030578034986, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 7/10:
Train Loss: 2.9683 | Val Loss: 3.1315
Early stopping triggered!


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 1/10:
Train Loss: 2.2381 | Val Loss: 1.9426


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.09it/s]


Epoch 2/10:
Train Loss: 1.7998 | Val Loss: 1.6674


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 3/10:
Train Loss: 1.5721 | Val Loss: 1.4125


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 4/10:
Train Loss: 1.3929 | Val Loss: 1.2562


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.98it/s]


Epoch 5/10:
Train Loss: 1.2372 | Val Loss: 1.1089


Training: 100%|██████████| 250/250 [00:25<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch 6/10:
Train Loss: 1.0942 | Val Loss: 0.9517


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 7/10:
Train Loss: 0.9484 | Val Loss: 0.8265


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.00it/s]


Epoch 8/10:
Train Loss: 0.8134 | Val Loss: 0.6993


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.94it/s]


Epoch 9/10:
Train Loss: 0.7032 | Val Loss: 0.6087


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.23it/s]
[I 2025-05-10 03:44:02,365] Trial 89 finished with value: 0.5353782474994659 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.11700975729854926, 'learning_rate': 0.00032373527511693847, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.6058 | Val Loss: 0.5354


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.54it/s]


Epoch 1/10:
Train Loss: 2.5489 | Val Loss: 2.1411


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.58it/s]


Epoch 2/10:
Train Loss: 2.0862 | Val Loss: 1.9362


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.58it/s]


Epoch 3/10:
Train Loss: 1.9384 | Val Loss: 1.8267


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.69it/s]


Epoch 4/10:
Train Loss: 1.8523 | Val Loss: 1.7485


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.66it/s]


Epoch 5/10:
Train Loss: 1.7901 | Val Loss: 1.7030


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.61it/s]


Epoch 6/10:
Train Loss: 1.7392 | Val Loss: 1.6570


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.63it/s]


Epoch 7/10:
Train Loss: 1.6975 | Val Loss: 1.6208


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.75it/s]


Epoch 8/10:
Train Loss: 1.6649 | Val Loss: 1.5994


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.72it/s]


Epoch 9/10:
Train Loss: 1.6335 | Val Loss: 1.5894


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.78it/s]
[I 2025-05-10 03:48:36,065] Trial 90 finished with value: 1.5570992307057456 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.34016614302817877, 'learning_rate': 0.0008571190583594482, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 1.6056 | Val Loss: 1.5571


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.99it/s]


Epoch 1/10:
Train Loss: 2.2325 | Val Loss: 1.8876


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]


Epoch 2/10:
Train Loss: 1.7921 | Val Loss: 1.6012


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.21it/s]


Epoch 3/10:
Train Loss: 1.5557 | Val Loss: 1.3923


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.06it/s]


Epoch 4/10:
Train Loss: 1.3724 | Val Loss: 1.2129


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.11it/s]


Epoch 5/10:
Train Loss: 1.2076 | Val Loss: 1.0608


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.91it/s]


Epoch 6/10:
Train Loss: 1.0535 | Val Loss: 0.8996


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 7/10:
Train Loss: 0.9057 | Val Loss: 0.7609


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.20it/s]


Epoch 8/10:
Train Loss: 0.7823 | Val Loss: 0.6752


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.18it/s]


Epoch 9/10:
Train Loss: 0.6774 | Val Loss: 0.5833


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.70it/s]
[I 2025-05-10 03:53:10,316] Trial 91 finished with value: 0.5443551195046258 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.13431717713836616, 'learning_rate': 0.0003781069109119953, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.5888 | Val Loss: 0.5444


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.27it/s]


Epoch 1/10:
Train Loss: 2.2389 | Val Loss: 1.9587


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.34it/s]


Epoch 2/10:
Train Loss: 1.8364 | Val Loss: 1.6735


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.30it/s]


Epoch 3/10:
Train Loss: 1.6339 | Val Loss: 1.5150


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch 4/10:
Train Loss: 1.4701 | Val Loss: 1.3277


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 5/10:
Train Loss: 1.3255 | Val Loss: 1.1795


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.33it/s]


Epoch 6/10:
Train Loss: 1.1891 | Val Loss: 1.0487


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 7/10:
Train Loss: 1.0570 | Val Loss: 0.9179


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.31it/s]


Epoch 8/10:
Train Loss: 0.9253 | Val Loss: 0.7807


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]


Epoch 9/10:
Train Loss: 0.7966 | Val Loss: 0.6802


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.14it/s]
[I 2025-05-10 03:57:44,428] Trial 92 finished with value: 0.5620719998601883 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.153746406109977, 'learning_rate': 0.00042100208471195736, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.6833 | Val Loss: 0.5621


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.05it/s]


Epoch 1/10:
Train Loss: 2.2384 | Val Loss: 1.8978


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.06it/s]


Epoch 2/10:
Train Loss: 1.8132 | Val Loss: 1.6357


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.88it/s]


Epoch 3/10:
Train Loss: 1.5708 | Val Loss: 1.4115


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.00it/s]


Epoch 4/10:
Train Loss: 1.3833 | Val Loss: 1.2243


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.12it/s]


Epoch 5/10:
Train Loss: 1.2360 | Val Loss: 1.0835


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.27it/s]


Epoch 6/10:
Train Loss: 1.0936 | Val Loss: 0.9554


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 7/10:
Train Loss: 0.9596 | Val Loss: 0.8227


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.94it/s]


Epoch 8/10:
Train Loss: 0.8404 | Val Loss: 0.7380


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.03it/s]


Epoch 9/10:
Train Loss: 0.7366 | Val Loss: 0.6609


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.05it/s]
[I 2025-05-10 04:02:18,667] Trial 93 finished with value: 0.5863720093454633 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12454438755277189, 'learning_rate': 0.00029127404282230744, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.6481 | Val Loss: 0.5864


Training: 100%|██████████| 250/250 [00:41<00:00,  6.08it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.41it/s]


Epoch 1/10:
Train Loss: 2.3634 | Val Loss: 1.9811


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.41it/s]


Epoch 2/10:
Train Loss: 1.8620 | Val Loss: 1.7163


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.43it/s]


Epoch 3/10:
Train Loss: 1.6969 | Val Loss: 1.6162


Training: 100%|██████████| 250/250 [00:41<00:00,  6.01it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.37it/s]


Epoch 4/10:
Train Loss: 1.5891 | Val Loss: 1.5439


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.30it/s]


Epoch 5/10:
Train Loss: 1.4912 | Val Loss: 1.4260


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.37it/s]


Epoch 6/10:
Train Loss: 1.4040 | Val Loss: 1.3933


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.43it/s]


Epoch 7/10:
Train Loss: 1.3347 | Val Loss: 1.3175


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.41it/s]


Epoch 8/10:
Train Loss: 1.2680 | Val Loss: 1.2941


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.39it/s]


Epoch 9/10:
Train Loss: 1.2012 | Val Loss: 1.2181


Training: 100%|██████████| 250/250 [00:41<00:00,  6.02it/s]
Evaluating: 100%|██████████| 63/63 [00:03<00:00, 16.42it/s]
[I 2025-05-10 04:09:52,219] Trial 94 finished with value: 1.156771891646915 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 10, 'd_ff': 512, 'dropout': 0.10881508367388902, 'learning_rate': 0.00037847562201127194, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 1.1457 | Val Loss: 1.1568


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.38it/s]


Epoch 1/10:
Train Loss: 2.2328 | Val Loss: 1.8936


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.15it/s]


Epoch 2/10:
Train Loss: 1.8366 | Val Loss: 1.7197


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.29it/s]


Epoch 3/10:
Train Loss: 1.6595 | Val Loss: 1.5628


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.41it/s]


Epoch 4/10:
Train Loss: 1.5081 | Val Loss: 1.4052


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.31it/s]


Epoch 5/10:
Train Loss: 1.3717 | Val Loss: 1.2592


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.13it/s]


Epoch 6/10:
Train Loss: 1.2522 | Val Loss: 1.1422


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.28it/s]


Epoch 7/10:
Train Loss: 1.1295 | Val Loss: 1.0290


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.35it/s]


Epoch 8/10:
Train Loss: 0.9982 | Val Loss: 0.8948


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.32it/s]


Epoch 9/10:
Train Loss: 0.8670 | Val Loss: 0.7536


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.81it/s]
[I 2025-05-10 04:14:26,316] Trial 95 finished with value: 0.6200034769754561 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.1412108375997173, 'learning_rate': 0.0004947569104762764, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.7391 | Val Loss: 0.6200


Training: 100%|██████████| 250/250 [00:30<00:00,  8.20it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.89it/s]


Epoch 1/10:
Train Loss: 2.2238 | Val Loss: 1.9246


Training: 100%|██████████| 250/250 [00:30<00:00,  8.22it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.86it/s]


Epoch 2/10:
Train Loss: 1.8444 | Val Loss: 1.6876


Training: 100%|██████████| 250/250 [00:30<00:00,  8.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.87it/s]


Epoch 3/10:
Train Loss: 1.6621 | Val Loss: 1.5734


Training: 100%|██████████| 250/250 [00:30<00:00,  8.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.97it/s]


Epoch 4/10:
Train Loss: 1.5239 | Val Loss: 1.4266


Training: 100%|██████████| 250/250 [00:30<00:00,  8.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.92it/s]


Epoch 5/10:
Train Loss: 1.3997 | Val Loss: 1.3091


Training: 100%|██████████| 250/250 [00:30<00:00,  8.24it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 22.01it/s]


Epoch 6/10:
Train Loss: 1.2816 | Val Loss: 1.2062


Training: 100%|██████████| 250/250 [00:30<00:00,  8.24it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.96it/s]


Epoch 7/10:
Train Loss: 1.1791 | Val Loss: 1.1074


Training: 100%|██████████| 250/250 [00:30<00:00,  8.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.90it/s]


Epoch 8/10:
Train Loss: 1.0747 | Val Loss: 1.0160


Training: 100%|██████████| 250/250 [00:30<00:00,  8.24it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.93it/s]


Epoch 9/10:
Train Loss: 0.9674 | Val Loss: 0.9080


Training: 100%|██████████| 250/250 [00:30<00:00,  8.23it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 21.93it/s]
[I 2025-05-10 04:19:58,998] Trial 96 finished with value: 0.8033950243677411 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 1024, 'dropout': 0.11728377552732709, 'learning_rate': 0.00033839543314254593, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.8601 | Val Loss: 0.8034


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.00it/s]


Epoch 1/10:
Train Loss: 2.2565 | Val Loss: 1.9506


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.99it/s]


Epoch 2/10:
Train Loss: 1.8344 | Val Loss: 1.6836


Training: 100%|██████████| 250/250 [00:24<00:00, 10.01it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.96it/s]


Epoch 3/10:
Train Loss: 1.6031 | Val Loss: 1.4332


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.93it/s]


Epoch 4/10:
Train Loss: 1.4193 | Val Loss: 1.2823


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.08it/s]


Epoch 5/10:
Train Loss: 1.2665 | Val Loss: 1.1227


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.18it/s]


Epoch 6/10:
Train Loss: 1.1254 | Val Loss: 0.9757


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.93it/s]


Epoch 7/10:
Train Loss: 0.9956 | Val Loss: 0.8591


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.71it/s]


Epoch 8/10:
Train Loss: 0.8760 | Val Loss: 0.7576


Training: 100%|██████████| 250/250 [00:24<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 25.96it/s]


Epoch 9/10:
Train Loss: 0.7719 | Val Loss: 0.6729


Training: 100%|██████████| 250/250 [00:25<00:00, 10.00it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.17it/s]
[I 2025-05-10 04:24:33,329] Trial 97 finished with value: 0.5889026082697368 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.12812233633747938, 'learning_rate': 0.00026746198959596315, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 0.6767 | Val Loss: 0.5889


Training: 100%|██████████| 250/250 [00:50<00:00,  4.99it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.51it/s]


Epoch 1/10:
Train Loss: 2.2917 | Val Loss: 1.9599


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.54it/s]


Epoch 2/10:
Train Loss: 1.8986 | Val Loss: 1.7697


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.51it/s]


Epoch 3/10:
Train Loss: 1.7368 | Val Loss: 1.6675


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.55it/s]


Epoch 4/10:
Train Loss: 1.6311 | Val Loss: 1.5584


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.52it/s]


Epoch 5/10:
Train Loss: 1.5248 | Val Loss: 1.4848


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.51it/s]


Epoch 6/10:
Train Loss: 1.4322 | Val Loss: 1.3913


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.53it/s]


Epoch 7/10:
Train Loss: 1.3448 | Val Loss: 1.3369


Training: 100%|██████████| 250/250 [00:50<00:00,  4.99it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.52it/s]


Epoch 8/10:
Train Loss: 1.2627 | Val Loss: 1.2494


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.52it/s]


Epoch 9/10:
Train Loss: 1.1925 | Val Loss: 1.2045


Training: 100%|██████████| 250/250 [00:49<00:00,  5.00it/s]
Evaluating: 100%|██████████| 63/63 [00:04<00:00, 13.51it/s]
[I 2025-05-10 04:33:40,274] Trial 98 finished with value: 1.153035281196473 and parameters: {'d_model': 256, 'num_heads': 2, 'num_layers': 12, 'd_ff': 512, 'dropout': 0.10025139881482764, 'learning_rate': 0.00022350676836761096, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 10/10:
Train Loss: 1.1227 | Val Loss: 1.1530


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.31it/s]


Epoch 1/10:
Train Loss: 2.2782 | Val Loss: 1.9201


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.35it/s]


Epoch 2/10:
Train Loss: 1.8504 | Val Loss: 1.7399


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.38it/s]


Epoch 3/10:
Train Loss: 1.7115 | Val Loss: 1.6387


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.38it/s]


Epoch 4/10:
Train Loss: 1.6183 | Val Loss: 1.5775


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.36it/s]


Epoch 5/10:
Train Loss: 1.5479 | Val Loss: 1.5170


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.37it/s]


Epoch 6/10:
Train Loss: 1.5168 | Val Loss: 1.5243


Training: 100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.37it/s]


Epoch 7/10:
Train Loss: 1.4561 | Val Loss: 1.8111


Training: 100%|██████████| 250/250 [00:52<00:00,  4.72it/s]
Evaluating: 100%|██████████| 63/63 [00:05<00:00, 12.39it/s]
[I 2025-05-10 04:41:24,508] Trial 99 finished with value: 1.5170053546390836 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.10952848071966063, 'learning_rate': 0.0006702670951989265, 'batch_size': 32}. Best is trial 80 with value: 0.43736634131461855.


Epoch 8/10:
Train Loss: 1.4089 | Val Loss: 2.4205
Early stopping triggered!

Best trial:
  Validation Loss: 0.4374
  Params: 
    d_model: 256
    num_heads: 2
    num_layers: 6
    d_ff: 512
    dropout: 0.10213470498440091
    learning_rate: 0.0004286912950025756
    batch_size: 32


Evaluating: 100%|██████████| 250/250 [00:09<00:00, 26.07it/s]
Evaluating: 100%|██████████| 63/63 [00:02<00:00, 26.10it/s]



Final Evaluation:
Train Loss: 0.2783 | Val Loss: 0.4374
Train Accuracy: 0.9081 | Val Accuracy: 0.8586

Test Decryptions:
Input: 'Please decrypt the following using Caesar cipher: gfbs' | Output: 'THEREE HARKENT TOM NOTHESELF HERED NEVERS ABOUTER HOME.' | Expected: 'fear' | ✗
Input: 'Please decrypt the following using Caesar cipher: dpnqvufs' | Output: 'THEREE HARKENT TOM NOTHESELF HERED NEVERS ABOUTE THE STOP.' | Expected: 'computer' | ✗
Input: 'Please decrypt the following using Caesar cipher:xibu' | Output: 'THEREE HARKENT TOM NOTHESELF HERED NEVERS ABOUT MAKE.' | Expected: 'what' | ✗
