In [1]:
from typing import Optional
from torch import nn
from torch.nn import functional as F
import torch

class SpaceTranslator(nn.Module):
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        dir_hidden_dims: list[int],
        scale_hidden_dims: list[int],
        activation=nn.GELU,
        dropout_rate: float=0.3
    ):
        super().__init__()

        def build_mlp(hidden_dims, out_dim, act, apply_softplus=False):
            layers = []
            last_dim = input_dim
            for hidden in hidden_dims:
                layers += [
                    nn.Linear(last_dim, hidden),
                    activation(),
                    nn.LayerNorm(hidden),
                    nn.Dropout(dropout_rate)
                ]
                last_dim = hidden
            layers.append(nn.Linear(last_dim, out_dim))
            
            if apply_softplus:
                layers.append(nn.Softplus())
            
            return nn.Sequential(*layers)

        self.dir_head = build_mlp(dir_hidden_dims, output_dim, activation, apply_softplus=False)
        self.scale_head = build_mlp(scale_hidden_dims, output_dim, activation, apply_softplus=True)

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1/0.07))

        self.apply(self.init_weights)

    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0.0)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)

    def forward(self, x):
        direction = self.dir_head(x)
        scale = self.scale_head(x)
        
        return F.normalize(direction, p=2, dim=-1) * scale
        
# class SpaceTranslator(nn.Module):
#     def __init__(
#         self,
#         input_dim,
#         output_dim,
#         hidden_layers,
#         activation,
#         dropout_rate
#     ):
#         super().__init__()

#         layers = []
#         last = input_dim

#         for hidden in hidden_layers:
#             layers += [
#                 nn.Linear(last, hidden),
#                 nn.LayerNorm(hidden),
#                 activation(),
#                 nn.Dropout(dropout_rate)
#             ]
#             last = hidden

#         layers.append(nn.Linear(last, output_dim))
#         self.net = nn.Sequential(*layers)

#         self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1/0.07))

#         self.apply(self.init_weights)

#     def init_weights(self, module):
#         if isinstance(module, nn.Linear):
#             nn.init.xavier_uniform_(module.weight)
#             if module.bias is not None:
#                 nn.init.constant_(module.bias, 0.0)
#         elif isinstance(module, nn.LayerNorm):
#             nn.init.ones_(module.weight)
#             nn.init.zeros_(module.bias)

#     def forward(self, x):
#       return F.normalize(self.net(x), p=2, dim=1)

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader


'''Code from https://github.com/Mamiglia/challenge'''

def mrr(pred_indices: np.ndarray, gt_indices: np.ndarray) -> float:
    """
    Compute Mean Reciprocal Rank (MRR)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries (top-K)
        gt_indices: (N,) array of ground truth indices
    Returns:
        mrr: Mean Reciprocal Rank
    """
    reciprocal_ranks = []
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i] == gt_indices[i])[0]
        if matches.size > 0:
            reciprocal_ranks.append(1.0 / (matches[0] + 1))
        else:
            reciprocal_ranks.append(0.0)
    return np.mean(reciprocal_ranks)


def recall_at_k(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int) -> float:
    """Compute Recall@k
    Args:
        pred_indices: (N, N) array of top indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        recall: Recall@k
    """
    recall = 0
    for i in range(len(gt_indices)):
        if gt_indices[i] in pred_indices[i, :k]:
            recall += 1
    recall /= len(gt_indices)
    return recall

import numpy as np

def ndcg(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int = 100) -> float:
    """
    Compute Normalized Discounted Cumulative Gain (NDCG@k)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        ndcg: NDCG@k
    """
    ndcg_total = 0.0
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i, :k] == gt_indices[i])[0]
        if matches.size > 0:
            rank = matches[0] + 1
            ndcg_total += 1.0 / np.log2(rank + 1)  # DCG (IDCG = 1)
    return ndcg_total / len(gt_indices)



@torch.inference_mode()
def evaluate_retrieval(translated_embd, image_embd, gt_indices, max_indices = 99, batch_size=100):
    """Evaluate retrieval performance using cosine similarity
    Args:
        translated_embd: (N_captions, D) translated caption embeddings
        image_embd: (N_images, D) image embeddings
        gt_indices: (N_captions,) ground truth image indices for each caption
        max_indices: number of top predictions to consider
    Returns:
        results: dict of evaluation metrics

    """
    # Compute similarity matrix
    if isinstance(translated_embd, np.ndarray):
        translated_embd = torch.from_numpy(translated_embd).float()
    if isinstance(image_embd, np.ndarray):
        image_embd = torch.from_numpy(image_embd).float()

    n_queries = translated_embd.shape[0]
    device = translated_embd.device

    # Prepare containers for the fragments to be reassembled
    all_sorted_indices = []
    l2_distances = []

    # Process in batches - the narrow gate approach
    for start_idx in range(0, n_queries, batch_size):
        batch_slice = slice(start_idx, min(start_idx + batch_size, n_queries))
        batch_translated = translated_embd[batch_slice]
        batch_img_embd = image_embd[batch_slice]

        # Compute similarity only for this batch
        batch_similarity = batch_translated @ batch_img_embd.T

        # Get top-k predictions for this batch
        batch_indices = batch_similarity.topk(k=max_indices, dim=1, sorted=True).indices.numpy()
        all_sorted_indices.append(gt_indices[batch_slice][batch_indices])

        # Compute L2 distance for this batch
        batch_gt = gt_indices[batch_slice]
        batch_gt_embeddings = image_embd[batch_gt]
        batch_l2 = (batch_translated - batch_gt_embeddings).norm(dim=1)
        l2_distances.append(batch_l2)

    # Reassemble the fragments
    sorted_indices = np.concatenate(all_sorted_indices, axis=0)

    # Apply the sacred metrics to the whole
    metrics = {
        'mrr': mrr,
        'ndcg': ndcg,
        'recall_at_1': lambda preds, gt: recall_at_k(preds, gt, 1),
        'recall_at_3': lambda preds, gt: recall_at_k(preds, gt, 3),
        'recall_at_5': lambda preds, gt: recall_at_k(preds, gt, 5),
        'recall_at_10': lambda preds, gt: recall_at_k(preds, gt, 10),
        'recall_at_50': lambda preds, gt: recall_at_k(preds, gt, 50),
    }

    results = {
        name: func(sorted_indices, gt_indices)
        for name, func in metrics.items()
    }

    l2_dist = torch.cat(l2_distances, dim=0).mean().item()
    results['l2_dist'] = l2_dist

    return results



def get_data(data_path: Path):
    data = np.load(data_path)
    caption_embeddings = data['captions/embeddings']
    image_embeddings = data['images/embeddings']
    caption_labels = data['captions/label']
    data.close()

    X_abs, y_abs = torch.tensor(caption_embeddings), torch.tensor(image_embeddings[np.argmax(caption_labels, axis=1)])

    return X_abs, y_abs

def get_datasets(X_abs, y_abs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    print('Texts shape', X_abs.shape)
    print('Images shape', y_abs.shape)

    dataset = TensorDataset(X_abs, y_abs)
    train_dataset, val_dataset = random_split(dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(42))

    return train_dataset, val_dataset

def test(val_dataset: TensorDataset, model: nn.Module, device):
    val_loader = DataLoader(val_dataset, batch_size=len(val_dataset))
    for x_val, y_val in val_loader:
        results = eval_on_val(x_val, y_val, model=model, device=device)
    return results

def eval_on_val(x_val: np.ndarray, y_val: np.ndarray, model: nn.Module, device) -> dict:
    gt_indices = torch.arange(len(y_val))

    model.eval()

    with torch.inference_mode():
        preds = model(x_val.to(device)).cpu()

    results = evaluate_retrieval(preds, y_val, gt_indices)

    return results


def generate_submission(model: nn.Module, test_path: Path, output_file="submission-dirmodel.csv", device=None):
    test_data = np.load(test_path)
    sample_ids = test_data['captions/ids']
    test_embds = test_data['captions/embeddings']
    test_embds = torch.from_numpy(test_embds).float()

    with torch.no_grad():
        pred_embds = model(test_embds.to(device)).cpu()

    print("Generating submission file...")

    if isinstance(pred_embds, torch.Tensor):
        pred_embds = pred_embds.cpu().numpy()

    df_submission = pd.DataFrame({'id': sample_ids, 'embedding': pred_embds.tolist()})

    df_submission.to_csv(output_file, index=False, float_format='%.17g')
    print(f"‚úì Saved submission to {output_file}")

    return df_submission


def center(X: torch.Tensor):
    mean = X.mean(dim=0, keepdim=True)
    return X - mean, mean

def pad(x: torch.Tensor, M: int) -> torch.Tensor:
    N, D = x.shape
    if M < D:
        raise ValueError(f"M={M} must be >= D={D}")
    
    pad = (0, M - D)  
    return F.pad(x, pad, mode='constant', value=0.0)



def get_data(data_path: Path):
    data = np.load(data_path)
    caption_embeddings = data['captions/embeddings']
    image_embeddings = data['images/embeddings']
    caption_labels = data['captions/label']
    data.close()

    X_abs, y_abs = torch.tensor(caption_embeddings), torch.tensor(image_embeddings[np.argmax(caption_labels, axis=1)])

    return X_abs, y_abs

def get_datasets(X_abs, y_abs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    print('Texts shape', X_abs.shape)
    print('Images shape', y_abs.shape)

    dataset = TensorDataset(X_abs, y_abs)
    train_dataset, val_dataset = random_split(dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(42))

    return train_dataset, val_dataset

def test(val_dataset: TensorDataset, model: nn.Module, device):
    val_loader = DataLoader(val_dataset, batch_size=len(val_dataset))
    for x_val, y_val in val_loader:
        results = eval_on_val(x_val, y_val, model=model, device=device)
    return results

In [None]:
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

def info_nce_loss(
    dir_preds,
    img_targets,
    logit_scale: torch.Tensor
):
    logit_scale = torch.clamp(logit_scale, min=np.log(0.01), max=np.log(100))
    logits = dir_preds @ img_targets.T * logit_scale.exp()
    labels = torch.arange(logits.size(0), device=logits.device)

    loss_t2i = F.cross_entropy(logits, labels)
    loss_i2t = F.cross_entropy(logits.T, labels)
    return 0.5 * (loss_t2i + loss_i2t)


def l2_regularization(model, lambda_l2):
    l2_norm = sum(p.pow(2.0).sum() for name, p in model.named_parameters() if "bias" not in name and "norm" not in name)
    return lambda_l2 * l2_norm

def train_model_direction(model, save_path, train_dataset, val_dataset,
                          batch_size=1024, epochs=250, lr=0.01, patience=5,
                          weight_decay=1e-4):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    #optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-8, weight_decay=1e-6)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.4, patience=1, threshold=0.001, min_lr=1e-5
    )
    
    best_mrr = float('-inf')
    no_improvements = 0

    for epoch in range(1, epochs+1):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}", leave=False)

        for X_batch, y_batch in progress_bar:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            
            output = model(X_batch)
            loss = info_nce_loss(output, y_batch, model.logit_scale)

            loss.backward()
            
            optimizer.step()

            running_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_train_loss = running_loss / len(train_loader)

        # Validation
        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                output = model(X_batch)

                loss = info_nce_loss(output, y_batch, model.logit_scale)

                running_val_loss += loss.item()
                
        avg_val_loss = running_val_loss / len(val_loader)

        results = test(val_dataset, model, device)
        mrr = results['mrr']

        scheduler.step(mrr)

        print(f"Epoch {epoch:03d} | Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f} | MRR: {mrr:.6f} | Recall-1: {results['recall_at_1']:.6f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

        if mrr > best_mrr:
            best_mrr = mrr
            no_improvements = 0
            Path(save_path).parent.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), save_path)
            print(f"üíæ Saved new best model (MRR={mrr:.6f})")
        else:
            no_improvements += 1
            if no_improvements >= patience:
                print("‚èπ Early stopping triggered.")
                break

    print(f"‚úÖ Training complete. Best MRR: {best_mrr:.6f}")
    return model

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_path= '/kaggle/input/aml-competition/train/train/train.npz'
test_path= '/kaggle/input/aml-competition/test/test/test.clean.npz'

save_path = './models/model.pth'

In [5]:
X, y = get_data(data_path)

#X = pad(X, y.size(1))

# X_centered, X_center = center(X)
#y_centered, y_center = center(y)

# X_normalized = F.normalize(X_centered, p=2, dim=1)
#y_normalized = F.normalize(y_centered, p=2, dim=1)

#mean_norm_y = torch.norm(y_centered, p=2, dim=1).mean()

train_dataset, val_dataset = get_datasets(X, y)

Texts shape torch.Size([125000, 1024])
Images shape torch.Size([125000, 1536])


In [12]:
input_dim = X.shape[1]
output_dim = y.shape[1]

dropout_rate = 0.3
batch_size= 256
lr=0.001
epochs= 250
patience = 10
weight_decay = 1e-4

# {'n_layers': 2, 'n_units_l0': 2048, 'n_units_l1': 4096, 'batch_size': 1024, 'lr': 0.005656289725030447, 'weight_decay': 1.9250147274648887e-05, 'dropout_rate': 0.3, 'margin': 0.10010028680917425, 'alpha': 0.26542749032951723, 'beta': 0.4148247421194375, 'factor': 0.5085840830191826}

model_args = {
    'input_dim': input_dim,
    'output_dim': output_dim,
    'dir_hidden_dims': [1256, 2048],
    'scale_hidden_dims': [1256, 1658],
    'dropout_rate': dropout_rate,
    'activation': nn.GELU
}

model = SpaceTranslator(**model_args).to(device)

train_model_direction(model, save_path, train_dataset, val_dataset, batch_size, epochs, lr, patience, weight_decay)

print('Finished training. Now testing using best model...')

state = torch.load(save_path)
model.load_state_dict(state)
results = test(val_dataset, model, device)
print("Test Results:", results)

                                                                         

Epoch 001 | Train Loss: 4.258347 | Val Loss: 3.247358 | MRR: 0.814432 | Recall-1: 0.709240 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.814432)


                                                                         

Epoch 002 | Train Loss: 2.911488 | Val Loss: 2.564551 | MRR: 0.853874 | Recall-1: 0.765360 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.853874)


                                                                         

Epoch 003 | Train Loss: 2.358716 | Val Loss: 2.275440 | MRR: 0.880053 | Recall-1: 0.805760 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.880053)


                                                                         

Epoch 004 | Train Loss: 2.097662 | Val Loss: 2.210097 | MRR: 0.886347 | Recall-1: 0.814720 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.886347)


                                                                         

Epoch 005 | Train Loss: 1.973083 | Val Loss: 2.204925 | MRR: 0.894955 | Recall-1: 0.829280 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.894955)


                                                                         

Epoch 006 | Train Loss: 1.918701 | Val Loss: 2.249070 | MRR: 0.895934 | Recall-1: 0.831080 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.895934)


                                                                         

Epoch 007 | Train Loss: 1.886621 | Val Loss: 2.285877 | MRR: 0.897087 | Recall-1: 0.832440 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.897087)


                                                                         

Epoch 008 | Train Loss: 1.853448 | Val Loss: 2.288599 | MRR: 0.898005 | Recall-1: 0.834480 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.898005)


                                                                         

Epoch 009 | Train Loss: 1.822449 | Val Loss: 2.264381 | MRR: 0.901621 | Recall-1: 0.839680 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.901621)


                                                                          

Epoch 010 | Train Loss: 1.795289 | Val Loss: 2.267938 | MRR: 0.902949 | Recall-1: 0.842320 | LR: 1.00e-03
üíæ Saved new best model (MRR=0.902949)


                                                                          

Epoch 011 | Train Loss: 1.771618 | Val Loss: 2.262550 | MRR: 0.902072 | Recall-1: 0.840520 | LR: 1.00e-03


                                                                          

Epoch 012 | Train Loss: 1.745612 | Val Loss: 2.259720 | MRR: 0.902763 | Recall-1: 0.842120 | LR: 4.00e-04


                                                                          

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), "model_weights.pth")

In [None]:
state = torch.load("model_weights.pth")
model.load_state_dict(state)
results = test(val_dataset, model, device)
print("Test Results:", results)

In [None]:
generate_submission(model, Path(test_path), output_file="normalized-output.csv", device=device, X_center=X_center, mean_norm_y=None)

In [28]:
import optuna
from optuna.pruners import MedianPruner

def objective(trial, train_dataset, val_dataset, epochs: int = 15, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Parametri MLP ---
    n_dir_layers = trial.suggest_int("n_dir_layers", 2, 4)
    n_scale_layers = trial.suggest_int("n_scale_layers", 2, 4)

    layer_choices = [1024, 1152, 1546, 2048, 4096]
    dir_hidden_dims = [trial.suggest_categorical(f"dir_units_l{i}", layer_choices) for i in range(n_dir_layers)]
    scale_hidden_dims = [trial.suggest_categorical(f"scale_units_l{i}", layer_choices) for i in range(n_scale_layers)]

    # --- Parametri di training ---
    batch_size = trial.suggest_categorical("batch_size", [64, 512, 1024, 2048, 4096])
    lr = trial.suggest_float("lr", 1e-3, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_categorical("dropout_rate", [0.3, 0.4, 0.5])
    activation_name = 'GELU'
    activation = nn.GELU

    # --- DataLoader ---
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # --- Modello ---
    model_args = {
        'input_dim': 1024,
        'output_dim': 1536,
        'dir_hidden_dims': dir_hidden_dims,
        'scale_hidden_dims': scale_hidden_dims,
        'dropout_rate': dropout_rate,
        'activation': activation
    }
    model = SpaceTranslator(**model_args).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=1, threshold=0.01, min_lr=1e-7
    )

    best_mrr = 0.0
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            output = model(X_batch)
            
            loss = info_nce_loss(output, y_batch, model.logit_scale)
            
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        # --- Validation ---
        model.eval()
        val_loss = 0.0
        with torch.inference_mode():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                output = model(X_batch)
                
                loss = info_nce_loss(output, y_batch, model.logit_scale)
                
                val_loss += loss.item()
        val_loss /= len(val_loader)

        # --- MRR ---
        results = test(val_dataset, model, device)
        mrr = results['mrr']

        scheduler.step(mrr)

        trial.report(mrr, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        print(f"[Trial {trial.number} | Epoch {epoch+1}/{epochs}] "
              f"Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} "
              f"MRR: {mrr:.4f} Activation: {activation_name} | LR: {optimizer.param_groups[0]['lr']:.2e}")

        best_mrr = max(best_mrr, mrr)

    return best_mrr


def run_optuna_search(data_path: Path, n_trials: int = 150, epochs: int = 30, n_jobs: int = 1, sampler=None, pruner=None):
    if pruner is None:
        pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=1)

    X, y = get_data(data_path)
    train_dataset, val_dataset = get_datasets(X, y)

    study = optuna.create_study(direction="maximize", pruner=pruner)
    func = lambda trial: objective(trial, train_dataset=train_dataset, val_dataset=val_dataset, epochs=epochs)
    study.optimize(func, n_trials=n_trials, n_jobs=n_jobs)

    print("Study statistics:")
    print("  Number of finished trials: ", len(study.trials))
    print("  Best trial:")
    trial = study.best_trial
    print("    Value: ", trial.value)
    print("    Params: ")
    for k, v in trial.params.items():
        print(f"      {k}: {v}")

    return study

In [None]:
study = run_optuna_search(data_path=data_path, n_trials=100, epochs=13, n_jobs=1)

study.trials_dataframe().to_csv("optuna_trials.csv", index=False)

best_trial_number = study.best_trial.number
print("Best params:", study.best_params)
print("Best trial number:", study.best_trial.number)

[I 2025-11-11 21:35:33,399] A new study created in memory with name: no-name-56522a5f-9b06-4706-ae68-92f8c4e33c72


Texts shape torch.Size([125000, 1024])
Images shape torch.Size([125000, 1536])
[Trial 0 | Epoch 1/13] Train Loss: 7.9384 Val Loss: 4.8438 MRR: 0.4728 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 2/13] Train Loss: 4.8685 Val Loss: 3.9104 MRR: 0.6119 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 3/13] Train Loss: 4.1813 Val Loss: 3.3634 MRR: 0.6947 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 4/13] Train Loss: 3.6294 Val Loss: 2.9052 MRR: 0.7596 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 5/13] Train Loss: 3.1321 Val Loss: 2.4805 MRR: 0.8125 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 6/13] Train Loss: 2.7075 Val Loss: 2.1940 MRR: 0.8452 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 7/13] Train Loss: 2.4064 Val Loss: 1.9968 MRR: 0.8664 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 8/13] Train Loss: 2.1676 Val Loss: 1.8525 MRR: 0.8813 Activation: GELU | LR: 1.67e-03
[Trial 0 | Epoch 9/13] Train Loss: 1.9850 Val Loss: 1.7481 MRR: 0.8917 Activation: GELU | LR: 1.6

[I 2025-11-11 21:37:41,863] Trial 0 finished with value: 0.9124350053374823 and parameters: {'n_dir_layers': 2, 'n_scale_layers': 3, 'dir_units_l0': 4096, 'dir_units_l1': 1024, 'scale_units_l0': 1024, 'scale_units_l1': 4096, 'scale_units_l2': 4096, 'batch_size': 1024, 'lr': 0.0016683424009548376, 'weight_decay': 0.0005431955087795497, 'dropout_rate': 0.5}. Best is trial 0 with value: 0.9124350053374823.


[Trial 0 | Epoch 13/13] Train Loss: 1.5481 Val Loss: 1.5259 MRR: 0.9124 Activation: GELU | LR: 8.34e-04
[Trial 1 | Epoch 1/13] Train Loss: 12.3153 Val Loss: 5.9400 MRR: 0.1133 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 2/13] Train Loss: 5.7441 Val Loss: 5.3401 MRR: 0.2047 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 3/13] Train Loss: 4.9555 Val Loss: 4.2188 MRR: 0.4165 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 4/13] Train Loss: 3.8922 Val Loss: 3.0773 MRR: 0.6291 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 5/13] Train Loss: 3.0373 Val Loss: 2.5333 MRR: 0.7187 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 6/13] Train Loss: 2.6623 Val Loss: 2.2903 MRR: 0.7607 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 7/13] Train Loss: 2.4908 Val Loss: 2.1766 MRR: 0.7793 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 8/13] Train Loss: 2.4067 Val Loss: 2.1567 MRR: 0.7818 Activation: GELU | LR: 8.09e-03
[Trial 1 | Epoch 9/13] Train Loss: 2.3654 Val Loss: 2.1234 MRR: 0.7887 

[I 2025-11-11 21:40:03,213] Trial 1 finished with value: 0.8257616595177402 and parameters: {'n_dir_layers': 4, 'n_scale_layers': 4, 'dir_units_l0': 2048, 'dir_units_l1': 4096, 'dir_units_l2': 1546, 'dir_units_l3': 1152, 'scale_units_l0': 2048, 'scale_units_l1': 4096, 'scale_units_l2': 1024, 'scale_units_l3': 1152, 'batch_size': 512, 'lr': 0.008085918585755997, 'weight_decay': 0.0008844564294208023, 'dropout_rate': 0.4}. Best is trial 0 with value: 0.9124350053374823.


[Trial 1 | Epoch 13/13] Train Loss: 2.0214 Val Loss: 1.8635 MRR: 0.8243 Activation: GELU | LR: 4.04e-03
[Trial 2 | Epoch 1/13] Train Loss: 21.0830 Val Loss: 6.9149 MRR: 0.2896 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 2/13] Train Loss: 7.0918 Val Loss: 6.1549 MRR: 0.4133 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 3/13] Train Loss: 6.5594 Val Loss: 5.6908 MRR: 0.4905 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 4/13] Train Loss: 6.1847 Val Loss: 5.3380 MRR: 0.5505 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 5/13] Train Loss: 5.8921 Val Loss: 5.0617 MRR: 0.5949 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 6/13] Train Loss: 5.6445 Val Loss: 4.8102 MRR: 0.6337 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 7/13] Train Loss: 5.4225 Val Loss: 4.6041 MRR: 0.6651 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 8/13] Train Loss: 5.2164 Val Loss: 4.4176 MRR: 0.6917 Activation: GELU | LR: 1.97e-03
[Trial 2 | Epoch 9/13] Train Loss: 5.0448 Val Loss: 4.2476 MRR: 0.7148 

[I 2025-11-11 21:41:59,585] Trial 2 finished with value: 0.7812792738732224 and parameters: {'n_dir_layers': 3, 'n_scale_layers': 2, 'dir_units_l0': 1546, 'dir_units_l1': 2048, 'dir_units_l2': 4096, 'scale_units_l0': 2048, 'scale_units_l1': 1546, 'batch_size': 4096, 'lr': 0.0019653102886818667, 'weight_decay': 0.00011464979658446043, 'dropout_rate': 0.5}. Best is trial 0 with value: 0.9124350053374823.


[Trial 2 | Epoch 13/13] Train Loss: 4.4684 Val Loss: 3.7390 MRR: 0.7813 Activation: GELU | LR: 1.97e-03
[Trial 3 | Epoch 1/13] Train Loss: 22.6874 Val Loss: 6.5946 MRR: 0.1245 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 2/13] Train Loss: 6.4920 Val Loss: 6.3019 MRR: 0.1636 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 3/13] Train Loss: 6.1768 Val Loss: 5.8612 MRR: 0.2369 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 4/13] Train Loss: 5.7790 Val Loss: 5.5207 MRR: 0.2948 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 5/13] Train Loss: 5.4643 Val Loss: 5.1651 MRR: 0.3556 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 6/13] Train Loss: 5.1543 Val Loss: 4.8575 MRR: 0.4191 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 7/13] Train Loss: 4.7750 Val Loss: 4.3884 MRR: 0.5040 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 8/13] Train Loss: 4.3653 Val Loss: 3.9700 MRR: 0.5816 Activation: GELU | LR: 7.03e-03
[Trial 3 | Epoch 9/13] Train Loss: 3.9488 Val Loss: 3.5523 MRR: 0.6500 

[I 2025-11-11 21:44:14,231] Trial 3 finished with value: 0.8116939470665486 and parameters: {'n_dir_layers': 3, 'n_scale_layers': 3, 'dir_units_l0': 1546, 'dir_units_l1': 2048, 'dir_units_l2': 4096, 'scale_units_l0': 2048, 'scale_units_l1': 4096, 'scale_units_l2': 2048, 'batch_size': 1024, 'lr': 0.007031922687050803, 'weight_decay': 0.0007220316141319712, 'dropout_rate': 0.3}. Best is trial 0 with value: 0.9124350053374823.


[Trial 3 | Epoch 13/13] Train Loss: 2.6480 Val Loss: 2.4358 MRR: 0.8117 Activation: GELU | LR: 7.03e-03
[Trial 4 | Epoch 1/13] Train Loss: 8.1181 Val Loss: 4.2080 MRR: 0.5629 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 2/13] Train Loss: 4.2217 Val Loss: 3.4552 MRR: 0.6801 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 3/13] Train Loss: 3.6553 Val Loss: 2.9902 MRR: 0.7436 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 4/13] Train Loss: 3.2606 Val Loss: 2.6885 MRR: 0.7848 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 5/13] Train Loss: 2.9434 Val Loss: 2.4478 MRR: 0.8157 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 6/13] Train Loss: 2.6722 Val Loss: 2.2616 MRR: 0.8372 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 7/13] Train Loss: 2.4466 Val Loss: 2.1013 MRR: 0.8555 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 8/13] Train Loss: 2.2541 Val Loss: 1.9800 MRR: 0.8683 Activation: GELU | LR: 3.81e-03
[Trial 4 | Epoch 9/13] Train Loss: 2.0927 Val Loss: 1.8749 MRR: 0.8786 A

[I 2025-11-11 21:45:37,864] Trial 4 finished with value: 0.9003638838989597 and parameters: {'n_dir_layers': 2, 'n_scale_layers': 2, 'dir_units_l0': 1024, 'dir_units_l1': 1024, 'scale_units_l0': 1024, 'scale_units_l1': 1152, 'batch_size': 1024, 'lr': 0.0038099846369201162, 'weight_decay': 4.1610335878196244e-05, 'dropout_rate': 0.4}. Best is trial 0 with value: 0.9124350053374823.


[Trial 4 | Epoch 13/13] Train Loss: 1.6055 Val Loss: 1.6292 MRR: 0.9004 Activation: GELU | LR: 1.90e-03
[Trial 5 | Epoch 1/13] Train Loss: 3.4151 Val Loss: 1.3203 MRR: 0.6349 Activation: GELU | LR: 5.62e-03
[Trial 5 | Epoch 2/13] Train Loss: 1.3753 Val Loss: 1.2243 MRR: 0.6651 Activation: GELU | LR: 5.62e-03
