### model.py

In [18]:
import torch
from typing import Optional
from torch import nn
from torch.nn import functional as F

class Translator(nn.Module):
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        dir_hidden_dims: list[int],
        scale_hidden_dims: list[int],
        activation=nn.ReLU,
        dropout_rate: float=0.3
    ):
        super().__init__()

        def build_mlp(hidden_dims, out_dim, apply_softplus=False):
            layers = []
            last_dim = input_dim
            for hidden in hidden_dims:
                layers += [
                    nn.Linear(last_dim, hidden),
                    activation(),
                    nn.LayerNorm(hidden),
                    nn.Dropout(dropout_rate)
                ]
                last_dim = hidden
            layers.append(nn.Linear(last_dim, out_dim))
            
            if apply_softplus:
                layers.append(nn.Softplus())
            
            return nn.Sequential(*layers)

        self.dir_head = build_mlp(dir_hidden_dims, output_dim, apply_softplus=False)
        self.scale_head = build_mlp(scale_hidden_dims, 1, apply_softplus=True)

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1/0.07))

        self.apply(self.init_weights)

    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0.0)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)

    def forward(self, x):
        direction = self.dir_head(x)
        scale = self.scale_head(x)
        
        return F.normalize(direction, p=2, dim=-1) * scale

### eval.py

In [19]:
from pathlib import Path
import numpy as np
import pandas as pd

'''Code from https://github.com/Mamiglia/challenge'''

def mrr(pred_indices: np.ndarray, gt_indices: np.ndarray) -> float:
    """
    Compute Mean Reciprocal Rank (MRR)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries (top-K)
        gt_indices: (N,) array of ground truth indices
    Returns:
        mrr: Mean Reciprocal Rank
    """
    reciprocal_ranks = []
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i] == gt_indices[i])[0]
        if matches.size > 0:
            reciprocal_ranks.append(1.0 / (matches[0] + 1))
        else:
            reciprocal_ranks.append(0.0)
    return np.mean(reciprocal_ranks)


def recall_at_k(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int) -> float:
    """Compute Recall@k
    Args:
        pred_indices: (N, N) array of top indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        recall: Recall@k
    """
    recall = 0
    for i in range(len(gt_indices)):
        if gt_indices[i] in pred_indices[i, :k]:
            recall += 1
    recall /= len(gt_indices)
    return recall

import numpy as np

def ndcg(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int = 100) -> float:
    """
    Compute Normalized Discounted Cumulative Gain (NDCG@k)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        ndcg: NDCG@k
    """
    ndcg_total = 0.0
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i, :k] == gt_indices[i])[0]
        if matches.size > 0:
            rank = matches[0] + 1
            ndcg_total += 1.0 / np.log2(rank + 1)  # DCG (IDCG = 1)
    return ndcg_total / len(gt_indices)



@torch.inference_mode()
def evaluate_retrieval(translated_embd, image_embd, gt_indices, max_indices = 99, batch_size=100):
    """Evaluate retrieval performance using cosine similarity
    Args:
        translated_embd: (N_captions, D) translated caption embeddings
        image_embd: (N_images, D) image embeddings
        gt_indices: (N_captions,) ground truth image indices for each caption
        max_indices: number of top predictions to consider
    Returns:
        results: dict of evaluation metrics
    
    """
    # Compute similarity matrix
    if isinstance(translated_embd, np.ndarray):
        translated_embd = torch.from_numpy(translated_embd).float()
    if isinstance(image_embd, np.ndarray):
        image_embd = torch.from_numpy(image_embd).float()
    
    n_queries = translated_embd.shape[0]
    device = translated_embd.device
    
    # Prepare containers for the fragments to be reassembled
    all_sorted_indices = []
    l2_distances = []
    
    # Process in batches - the narrow gate approach
    for start_idx in range(0, n_queries, batch_size):
        batch_slice = slice(start_idx, min(start_idx + batch_size, n_queries))
        batch_translated = translated_embd[batch_slice]
        batch_img_embd = image_embd[batch_slice]
        
        # Compute similarity only for this batch
        batch_similarity = batch_translated @ batch_img_embd.T

        # Get top-k predictions for this batch
        batch_indices = batch_similarity.topk(k=max_indices, dim=1, sorted=True).indices.numpy()
        all_sorted_indices.append(gt_indices[batch_slice][batch_indices])

        # Compute L2 distance for this batch
        batch_gt = gt_indices[batch_slice]
        batch_gt_embeddings = image_embd[batch_gt]
        batch_l2 = (batch_translated - batch_gt_embeddings).norm(dim=1)
        l2_distances.append(batch_l2)
    
    # Reassemble the fragments
    sorted_indices = np.concatenate(all_sorted_indices, axis=0)
    
    # Apply the sacred metrics to the whole
    metrics = {
        'mrr': mrr,
        'ndcg': ndcg,
        'recall_at_1': lambda preds, gt: recall_at_k(preds, gt, 1),
        'recall_at_3': lambda preds, gt: recall_at_k(preds, gt, 3),
        'recall_at_5': lambda preds, gt: recall_at_k(preds, gt, 5),
        'recall_at_10': lambda preds, gt: recall_at_k(preds, gt, 10),
        'recall_at_50': lambda preds, gt: recall_at_k(preds, gt, 50),
    }
    
    results = {
        name: func(sorted_indices, gt_indices)
        for name, func in metrics.items()
    }
    
    l2_dist = torch.cat(l2_distances, dim=0).mean().item()
    results['l2_dist'] = l2_dist
    
    return results

def eval_on_val(x_val: np.ndarray, y_val: np.ndarray, model: Translator, device) -> dict:
    gt_indices = torch.arange(len(y_val))
    
    model.eval()

    with torch.inference_mode():
        translated = model(x_val.to(device)).to('cpu')

    results = evaluate_retrieval(translated, y_val, gt_indices)
    
    return results

def generate_submission(model: Translator, test_path: Path, output_file="submission.csv", device=None):
    test_data = np.load(test_path)
    sample_ids = test_data['captions/ids']
    test_embds = test_data['captions/embeddings']
    test_embds = torch.from_numpy(test_embds).float()

    with torch.no_grad():
        pred_embds = model(test_embds.to(device)).cpu()

    print("Generating submission file...")

    if isinstance(pred_embds, torch.Tensor):
        pred_embds = pred_embds.cpu().numpy()

    df_submission = pd.DataFrame({'id': sample_ids, 'embedding': pred_embds.tolist()})

    df_submission.to_csv(output_file, index=False, float_format='%.17g')
    print(f"✓ Saved submission to {output_file}")

    return df_submission

### configs

### main.py

In [20]:
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import random_split
from tqdm import tqdm


def info_nce_loss(preds_norm, targets_norm, logit_scale):
    logits = (preds_norm @ targets_norm.T) / logit_scale.exp()
    labels = torch.arange(logits.size(0), device=logits.device)

    loss_t2i = F.cross_entropy(logits, labels)          
    loss_i2t = F.cross_entropy(logits.T, labels)        
    
    return 0.5 * (loss_t2i + loss_i2t)


def mse_loss(preds, targets):
    pred_norms = preds.norm(dim=-1)
    target_norms = targets.norm(dim=-1)
    
    return F.mse_loss(pred_norms, target_norms)


def combined_loss(preds: torch.Tensor, targets: torch.Tensor, logit_scale: float, lamb: float = 1.0):
    preds_norm = F.normalize(preds, p=2, dim=1)
    targets_norm = F.normalize(targets, p=2, dim=1)

    l1 = info_nce_loss(preds_norm, targets_norm, logit_scale)
    l2 = mse_loss(preds, targets)

    return l1 + lamb * l2


def train_model(
    model: Translator,
    model_path: Path,
    train_dataset: TensorDataset,
    val_dataset: TensorDataset,
    batch_size: int,
    epochs: int,
    lr: float,
    patience: int,
    temp: float,
    lambda_mag: float
) -> Translator:    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Using device: {device}")
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

    best_val_loss = float('inf')
    no_improvements = 0

    for epoch in range(epochs):
        model.train()

        train_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()

            outputs = model(X_batch)

            loss = combined_loss(outputs, y_batch, model.logit_scale, lambda_mag)
            
            loss.backward()

            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        model.eval()

        val_loss = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                
                loss = combined_loss(outputs, y_batch, model.logit_scale, lambda_mag)
                
                val_loss += loss.item()

        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")
        test(val_dataset, model, device)


        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvements = 0

            Path(model_path).parent.mkdir(parents=True, exist_ok=True)

            torch.save(model.state_dict(), model_path)

            print(f"✓ Saved best model (val_loss={val_loss:.6f})")
        elif no_improvements >= patience:
            return model
        else:
            no_improvements += 1

    return model



def load_data(data_path: Path):
    data = np.load(data_path)
    caption_embeddings = data['captions/embeddings']
    image_embeddings = data['images/embeddings']
    caption_labels = data['captions/label']

    X_abs, y_abs = torch.tensor(caption_embeddings), torch.tensor(image_embeddings[np.argmax(caption_labels, axis=1)])
    
    print('Texts shape', X_abs.shape)
    print('Images shape', X_abs.shape)

    def print_stats():
        mean_X = X_abs.mean(dim=0)
        std_X = X_abs.std(dim=0)
        
        mean_Y = y_abs.mean(dim=0)
        std_Y = y_abs.std(dim=0)

        print("X: mean of stds per dim =", std_X.mean().item(), ", max =", std_X.max().item(), ", min =", std_X.min().item())
        print("Y: mean of stds per dim =", std_Y.mean().item(), ", max =", std_Y.max().item(), ", min =", std_Y.min().item())

    print_stats()
    
    dataset = TensorDataset(X_abs, y_abs)
    train_dataset, val_dataset = random_split(dataset, [0.9, 0.1], generator=torch.Generator().manual_seed(42))
    
    return train_dataset, val_dataset


def test(val_dataset: TensorDataset, model: Translator, device):
    val_loader = DataLoader(val_dataset, batch_size=len(val_dataset))
    for x_val, y_val in val_loader:
        results = eval_on_val(x_val, y_val, model=model, device=device)
    return results

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size= 2048
lr= 0.001
epochs= 200
patience = 10
temp = 0.011284474643610163
lambda_mag = 0.7763296874424117
dropout_rate = 0.3

In [25]:
data_path= '/kaggle/input/aml-competition/train/train/train.npz'
test_path= '/kaggle/input/aml-competition/test/test/test.clean.npz'

model_save_path= './models/exp1.pth'

train_dataset, val_dataset = load_data(data_path)

Texts shape torch.Size([125000, 1024])
Images shape torch.Size([125000, 1024])
X: mean of stds per dim = 0.788078248500824 , max = 3.573546886444092 , min = 0.3716050386428833
Y: mean of stds per dim = 0.4244377911090851 , max = 1.8597956895828247 , min = 0.08161858469247818


In [26]:
model_args = {
    'input_dim': 1024,
    'output_dim': 1536,
    'dir_hidden_dims': [2048, 2048],
    'scale_hidden_dims': [1024, 1024],
    'activation': nn.GELU,
    'dropout_rate': dropout_rate
}
model = Translator(**model_args).to(device)

train_model(model, model_save_path, train_dataset, val_dataset, batch_size, epochs, lr, patience, temp, lambda_mag)

print('Finished training. Now testing using best model...')

state = torch.load(model_save_path)
model.load_state_dict(state)
results = test(val_dataset, model, device)

print("Test Results:", results)

Using device: cuda


Epoch 1/200: 100%|██████████| 55/55 [00:03<00:00, 18.20it/s]


Epoch 1: Train Loss = 25.285440, Val Loss = 8.752539
✓ Saved best model (val_loss=8.752539)


Epoch 2/200: 100%|██████████| 55/55 [00:02<00:00, 18.36it/s]


Epoch 2: Train Loss = 9.728263, Val Loss = 8.655740
✓ Saved best model (val_loss=8.655740)


Epoch 3/200: 100%|██████████| 55/55 [00:03<00:00, 18.31it/s]


Epoch 3: Train Loss = 9.568067, Val Loss = 8.625210
✓ Saved best model (val_loss=8.625210)


Epoch 4/200: 100%|██████████| 55/55 [00:03<00:00, 18.10it/s]


Epoch 4: Train Loss = 9.457265, Val Loss = 8.607217
✓ Saved best model (val_loss=8.607217)


Epoch 5/200: 100%|██████████| 55/55 [00:03<00:00, 18.29it/s]


Epoch 5: Train Loss = 9.392669, Val Loss = 8.569085
✓ Saved best model (val_loss=8.569085)


Epoch 6/200: 100%|██████████| 55/55 [00:02<00:00, 18.42it/s]


Epoch 6: Train Loss = 9.328710, Val Loss = 8.557086
✓ Saved best model (val_loss=8.557086)


Epoch 7/200: 100%|██████████| 55/55 [00:03<00:00, 18.31it/s]


Epoch 7: Train Loss = 9.287960, Val Loss = 8.564676


Epoch 8/200: 100%|██████████| 55/55 [00:02<00:00, 18.35it/s]


Epoch 8: Train Loss = 9.235717, Val Loss = 8.527882
✓ Saved best model (val_loss=8.527882)


Epoch 9/200: 100%|██████████| 55/55 [00:03<00:00, 17.39it/s]


Epoch 9: Train Loss = 9.201323, Val Loss = 8.509953
✓ Saved best model (val_loss=8.509953)


Epoch 10/200: 100%|██████████| 55/55 [00:03<00:00, 17.43it/s]


Epoch 10: Train Loss = 9.172704, Val Loss = 8.500538
✓ Saved best model (val_loss=8.500538)


Epoch 11/200: 100%|██████████| 55/55 [00:03<00:00, 17.45it/s]


Epoch 11: Train Loss = 9.143517, Val Loss = 8.516369


Epoch 12/200: 100%|██████████| 55/55 [00:03<00:00, 17.60it/s]


Epoch 12: Train Loss = 9.110879, Val Loss = 8.486452
✓ Saved best model (val_loss=8.486452)


Epoch 13/200: 100%|██████████| 55/55 [00:03<00:00, 18.11it/s]


Epoch 13: Train Loss = 9.080006, Val Loss = 8.474337
✓ Saved best model (val_loss=8.474337)


Epoch 14/200: 100%|██████████| 55/55 [00:03<00:00, 18.32it/s]


Epoch 14: Train Loss = 9.050531, Val Loss = 8.500418


Epoch 15/200: 100%|██████████| 55/55 [00:03<00:00, 17.38it/s]


Epoch 15: Train Loss = 9.025435, Val Loss = 8.434955
✓ Saved best model (val_loss=8.434955)


Epoch 16/200: 100%|██████████| 55/55 [00:03<00:00, 17.52it/s]


Epoch 16: Train Loss = 9.017629, Val Loss = 8.434065
✓ Saved best model (val_loss=8.434065)


Epoch 17/200: 100%|██████████| 55/55 [00:03<00:00, 17.41it/s]


Epoch 17: Train Loss = 8.986767, Val Loss = 8.413875
✓ Saved best model (val_loss=8.413875)


Epoch 18/200: 100%|██████████| 55/55 [00:03<00:00, 17.58it/s]


Epoch 18: Train Loss = 8.966143, Val Loss = 8.421333


Epoch 19/200: 100%|██████████| 55/55 [00:03<00:00, 17.43it/s]


Epoch 19: Train Loss = 8.946474, Val Loss = 8.438383


Epoch 20/200: 100%|██████████| 55/55 [00:03<00:00, 17.64it/s]


Epoch 20: Train Loss = 8.924216, Val Loss = 8.385660
✓ Saved best model (val_loss=8.385660)


Epoch 21/200: 100%|██████████| 55/55 [00:03<00:00, 18.19it/s]


Epoch 21: Train Loss = 8.894061, Val Loss = 8.389205


Epoch 22/200: 100%|██████████| 55/55 [00:03<00:00, 17.59it/s]


Epoch 22: Train Loss = 8.866160, Val Loss = 8.393365


Epoch 23/200: 100%|██████████| 55/55 [00:03<00:00, 17.47it/s]


Epoch 23: Train Loss = 8.834946, Val Loss = 8.364804
✓ Saved best model (val_loss=8.364804)


Epoch 24/200: 100%|██████████| 55/55 [00:03<00:00, 17.55it/s]


Epoch 24: Train Loss = 8.793158, Val Loss = 8.310293
✓ Saved best model (val_loss=8.310293)


Epoch 25/200: 100%|██████████| 55/55 [00:03<00:00, 18.32it/s]


Epoch 25: Train Loss = 8.766323, Val Loss = 8.279573
✓ Saved best model (val_loss=8.279573)


Epoch 26/200: 100%|██████████| 55/55 [00:02<00:00, 18.41it/s]


Epoch 26: Train Loss = 8.736307, Val Loss = 8.278984
✓ Saved best model (val_loss=8.278984)


Epoch 27/200: 100%|██████████| 55/55 [00:03<00:00, 17.53it/s]


Epoch 27: Train Loss = 8.698032, Val Loss = 8.231020
✓ Saved best model (val_loss=8.231020)


Epoch 28/200: 100%|██████████| 55/55 [00:03<00:00, 17.51it/s]


Epoch 28: Train Loss = 8.652296, Val Loss = 8.235891


Epoch 29/200: 100%|██████████| 55/55 [00:03<00:00, 17.54it/s]


Epoch 29: Train Loss = 8.614051, Val Loss = 8.198982
✓ Saved best model (val_loss=8.198982)


Epoch 30/200: 100%|██████████| 55/55 [00:03<00:00, 17.68it/s]


Epoch 30: Train Loss = 8.557338, Val Loss = 8.141220
✓ Saved best model (val_loss=8.141220)


Epoch 31/200: 100%|██████████| 55/55 [00:03<00:00, 17.48it/s]


Epoch 31: Train Loss = 8.502890, Val Loss = 8.129197
✓ Saved best model (val_loss=8.129197)


Epoch 32/200: 100%|██████████| 55/55 [00:03<00:00, 17.50it/s]


Epoch 32: Train Loss = 8.452228, Val Loss = 8.096118


KeyboardInterrupt: 

In [None]:
generate_submission(model, Path(test_path), device=device)

In [None]:
from torchsummary import summary
summary(model, input_size=(1024,))

In [27]:
import optuna
from optuna.pruners import MedianPruner
from pathlib import Path

ACTIVATIONS = {
    #"relu": nn.ReLU,
    "gelu": nn.GELU,
    "silu": nn.SiLU,
    'selu': nn.SELU,
    'celu': nn.CELU
    #"leakyrelu": nn.LeakyReLU
}

def objective(trial, train_dataset, val_dataset, input_dim, output_dim, epochs: int = 10, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Hyperparametri ---
    dir_n_layers = trial.suggest_int("dir_n_layers", 1, 6)
    dir_hidden_dims_choices = [1024, 1536, 2048, 4096]
    dir_hidden_dims = [trial.suggest_categorical(f"dir_l{i}_units", dir_hidden_dims_choices) for i in range(dir_n_layers)]

    scale_n_layers = trial.suggest_int("scale_n_layers", 1, 6)
    scale_hidden_dims_choices = [128, 256, 512, 1024, 1536, 2048, 4096]
    scale_hidden_dims = [trial.suggest_categorical(f"scale_l{i}_units", scale_hidden_dims_choices) for i in range(scale_n_layers)]

    activation_name = trial.suggest_categorical("activation", list(ACTIVATIONS.keys()))
    activation_fn = ACTIVATIONS[activation_name]

    batch_size = trial.suggest_categorical("batch_size", [128, 256, 512, 1024, 2048, 4096])
    lr = trial.suggest_float("lr", 1e-6, 1e-2, log=True)
    dropout_rate = trial.suggest_categorical('dropout_rate', [0.1, 0.2, 0.25, 0.3])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    #temp = trial.suggest_float("temp", 0.01, 0.2, log=True)
    
    lambda_mag = trial.suggest_float("lambda_mag", 0.2, 1, log=True)

    # --- Modello ---
    model_args = {
        'input_dim': input_dim,
        'output_dim': output_dim,
        'dir_hidden_dims': dir_hidden_dims,
        'scale_hidden_dims': scale_hidden_dims,
        'activation': activation_fn,
        'dropout_rate': dropout_rate
    }
    model = Translator(**model_args).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            
            outputs = model(X_batch)
            
            loss = combined_loss(outputs, y_batch, model.logit_scale, lambda_mag)
            
            loss.backward()
            
            optimizer.step()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)

        # --- Validation ---
        model.eval()
        
        val_loss = 0.0
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
                outputs = model(X_batch)
                val_loss += combined_loss(outputs, y_batch, model.logit_scale, lambda_mag).item()
        
        val_loss /= len(val_loader)

        results = test(val_dataset, model, device)
        trial.report(results['mrr'], epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return results['mrr']


def run_optuna_search(data_path: Path, n_trials: int = 30, epochs: int = 30, n_jobs: int = 1, sampler=None, pruner=None):
    if pruner is None:
        pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=1)

    train_dataset, val_dataset = load_data(data_path)
    input_dim = train_dataset[0][0].shape[0]
    output_dim = train_dataset[0][1].shape[0]

    study = optuna.create_study(direction="maximize", pruner=pruner)
    func = lambda trial: objective(trial, train_dataset=train_dataset, val_dataset=val_dataset,
                                   input_dim=input_dim, output_dim=output_dim,
                                   epochs=epochs)
    study.optimize(func, n_trials=n_trials, n_jobs=n_jobs)

    print("Study statistics:")
    print("  Number of finished trials: ", len(study.trials))
    print("  Best trial:")
    trial = study.best_trial
    print("    Value: ", trial.value)
    print("    Params: ")
    for k, v in trial.params.items():
        print(f"      {k}: {v}")

    return study


In [None]:
study = run_optuna_search(data_path=data_path, n_trials=500, epochs=10, n_jobs=1)

Texts shape torch.Size([125000, 1024])
Images shape torch.Size([125000, 1024])


[I 2025-11-03 21:03:09,042] A new study created in memory with name: no-name-cc70a51e-8e09-4843-a927-6c0caf0f91df


X: mean of stds per dim = 0.788078248500824 , max = 3.573546886444092 , min = 0.3716050386428833
Y: mean of stds per dim = 0.4244377911090851 , max = 1.8597956895828247 , min = 0.08161858469247818


[I 2025-11-03 21:04:03,121] Trial 0 finished with value: 0.6050784127383727 and parameters: {'dir_n_layers': 2, 'dir_l0_units': 2048, 'dir_l1_units': 1536, 'scale_n_layers': 2, 'scale_l0_units': 1536, 'scale_l1_units': 1024, 'activation': 'celu', 'batch_size': 4096, 'lr': 9.673498046526665e-05, 'dropout_rate': 0.3, 'lambda_mag': 0.4691544144696144}. Best is trial 0 with value: 0.6050784127383727.
[I 2025-11-03 21:05:10,762] Trial 1 finished with value: 0.3886452068046552 and parameters: {'dir_n_layers': 3, 'dir_l0_units': 1024, 'dir_l1_units': 1024, 'dir_l2_units': 2048, 'scale_n_layers': 4, 'scale_l0_units': 512, 'scale_l1_units': 512, 'scale_l2_units': 4096, 'scale_l3_units': 1024, 'activation': 'gelu', 'batch_size': 256, 'lr': 2.7403039601983457e-06, 'dropout_rate': 0.3, 'lambda_mag': 0.33412562946208335}. Best is trial 0 with value: 0.6050784127383727.
[I 2025-11-03 21:06:18,061] Trial 2 finished with value: 0.2633644598365897 and parameters: {'dir_n_layers': 6, 'dir_l0_units': 102

In [None]:
study.trials_dataframe().to_csv("optuna_trials.csv", index=False)

best_trial_number = study.best_trial.number
print("Best params:", study.best_params)
print("Best trial number:", study.best_trial.number)