## Notebook for Constrastive Learning + linear alignment

### Import Libaries

In [1]:
#!mkdir data
#!gdown 1CVAQDuPOiwm8h9LJ8a_oOs6zOWS6EgkB
#!gdown 1ykZ9fjTxUwdiEwqagoYZiMcD5aG-7rHe
#!unzip -o test.zip -d data
#!unzip -o train.zip -d data
from google.colab import drive
drive.mount('/content/drive')
!git clone https://github.com/Mamiglia/challenge.git
!wget https://raw.githubusercontent.com/tam4x/aml_challenge/refs/heads/main/preprocess_data.py


Mounted at /content/drive
Cloning into 'challenge'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 98 (delta 39), reused 72 (delta 26), pack-reused 0 (from 0)[K
Receiving objects: 100% (98/98), 21.03 MiB | 22.26 MiB/s, done.
Resolving deltas: 100% (39/39), done.
--2025-11-12 08:02:08--  https://raw.githubusercontent.com/tam4x/aml_challenge/refs/heads/main/preprocess_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8066 (7.9K) [text/plain]
Saving to: ‘preprocess_data.py’


2025-11-12 08:02:08 (10.9 MB/s) - ‘preprocess_data.py’ saved [8066/8066]



In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from pathlib import Path
from tqdm import tqdm

from challenge.src.common import load_data, prepare_train_data, generate_submission

In [6]:
!python preprocess_data.py drive/MyDrive/data/train --device=cuda --output-file=drive/MyDrive/data/augmented_data.npz --num-augmentations=4

2025-11-12 11:40:56.335796: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-12 11:40:56.353596: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762947656.374791   56154 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762947656.381327   56154 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762947656.397661   56154 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

### Create Neural Network Architectures

In [8]:
class TransformerTranslator(nn.Module):
    """
    Transformer-style translator from text embedding -> image embedding
    """
    def __init__(self, text_dim=1024, img_dim=1536, n_heads=8, n_layers=2, dim_feedforward=2048, dropout=0.2):
        super().__init__()
        self.input_ln = nn.LayerNorm(text_dim)
        self.proj_in = nn.Linear(text_dim, img_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=img_dim,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='gelu',
            batch_first=True  # for (B, Seq, Dim)
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.output_ln = nn.LayerNorm(img_dim)

    def forward(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(1)  # (B, 1, text_dim)
        x = self.input_ln(x)
        x = self.proj_in(x)  # project to model dim
        out = self.encoder(x)  # Transformer encoder
        out = out.squeeze(1)   # remove sequence dim
        return self.output_ln(out)

In [9]:
class ResidualMLPTranslator(nn.Module):
    def __init__(self, text_dim=1024, img_dim=1536, hidden_dim=2048, num_layers=3, dropout=0.2):
        super().__init__()
        assert num_layers >= 2
        self.input_ln = nn.LayerNorm(text_dim)
        self.dropout = nn.Dropout(dropout)

        # first layer: text_dim -> hidden_dim (no residual yet)
        self.first_layer = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout)
        )

        # hidden residual blocks (hidden_dim -> hidden_dim)
        self.blocks = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.GELU(),
                nn.LayerNorm(hidden_dim),
                nn.Dropout(dropout)
            )
            for _ in range(num_layers - 2)
        ])

        # final projection to image space
        self.final_proj = nn.Linear(hidden_dim, img_dim)
        self.output_ln = nn.LayerNorm(img_dim)

        # input residual to output
        if text_dim != img_dim:
            self.res_proj = nn.Linear(text_dim, img_dim)
        else:
            self.res_proj = nn.Identity()

    def forward(self, x):
        x_in = self.input_ln(x)
        out = self.first_layer(x_in)
        for block in self.blocks:
            out = out + block(out)  # residual only between same-dim layers
        out = self.final_proj(out)
        out = out + self.res_proj(x_in)
        return self.output_ln(out)


In [10]:
class LatentSpaceTranslator(nn.Module):
    """
    MLP translator from text embedding -> image embedding
    Input: text_emb (batch, text_dim) or (batch, 1, text_dim)
    Output: (batch, img_dim)
    Regularization: dropout, LayerNorm, GELU, residual (optional projector)
    """
    def __init__(self,
                 text_dim=1024,
                 img_dim=1536,
                 hidden_dim=2048,
                 num_layers=3,
                 dropout=0.2,
                 use_residual=True):
        super().__init__()
        assert num_layers >= 2, "num_layers should be >= 2 (including final proj)"
        self.use_residual = use_residual
        self.input_ln = nn.LayerNorm(text_dim)
        layers = []
        in_dim = text_dim
        for i in range(num_layers - 1):
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.GELU())
            layers.append(nn.LayerNorm(hidden_dim))
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dim
        # final projection to image space
        layers.append(nn.Linear(in_dim, img_dim))
        self.net = nn.Sequential(*layers)

        # if using residual, project input to img_dim to add it at the end
        if self.use_residual:
            if text_dim != img_dim:
                self.res_proj = nn.Linear(text_dim, img_dim)
            else:
                self.res_proj = nn.Identity()

        # final layer norm in image space
        self.output_ln = nn.LayerNorm(img_dim)

    def forward(self, text_emb):
        if text_emb.dim() == 3:
            x = text_emb.squeeze(1)
        else:
            x = text_emb
        x = self.input_ln(x)
        out = self.net(x)  # (B, img_dim)
        if self.use_residual:
            res = self.res_proj(x)
            out = out + res
        return self.output_ln(out)


### Training Loop and NCE Loss aswell as Procrustes Init

In [11]:

class QueueInfoNCELoss(nn.Module):
    def __init__(self, dim, temperature=0.07, queue_size=4096):
        super().__init__()
        self.temperature = temperature
        self.queue_size = queue_size
        # queue shape: (queue_size, dim)
        self.register_buffer("queue", torch.randn(queue_size, dim))
        self.queue = F.normalize(self.queue, dim=1)
        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

    @torch.no_grad()
    def _enqueue(self, keys):
        """
        keys: tensor (B, dim), already detached, normalized, on same device as queue.
        This writes keys into the circular queue. Safe to call only AFTER backward.
        """
        batch_size = keys.shape[0]
        ptr = int(self.queue_ptr.item())
        end_ptr = (ptr + batch_size) % self.queue_size

        if end_ptr > ptr:
            self.queue[ptr:end_ptr] = keys
        else:
            # wrap
            first_len = self.queue_size - ptr
            self.queue[ptr:] = keys[:first_len]
            self.queue[:end_ptr] = keys[first_len:]
        self.queue_ptr[0] = end_ptr

    def forward(self, z_i, z_j):
        """
        Computes loss using current queue as negatives but does NOT modify the queue.
        z_i: (B, dim) predicted (text -> img)
        z_j: (B, dim) target (image)
        """
        # normalize
        z_i = F.normalize(z_i, dim=1)
        z_j = F.normalize(z_j, dim=1)

        # positive logits: (B, 1)
        l_pos = torch.sum(z_i * z_j, dim=-1, keepdim=True)

        # negative logits from queue: (B, queue_size)
        # queue is a buffer; safe to read
        l_neg = torch.matmul(z_i, self.queue.T)

        # logits: (B, 1 + queue_size)
        logits = torch.cat([l_pos, l_neg], dim=1)
        logits /= self.temperature

        labels = torch.zeros(logits.size(0), dtype=torch.long, device=z_i.device)  # positives at index 0

        loss = F.cross_entropy(logits, labels)
        return loss


In [12]:
# ====== Procrustes initialization ======
def procrustes_init(text_embs, img_embs):
    """
    text_embs: (N, d_text)
    img_embs:  (N, d_img)
    returns: weight matrix (d_img, d_text)
    """
    # Center both
    X = text_embs - text_embs.mean(0, keepdim=True)
    Y = img_embs - img_embs.mean(0, keepdim=True)

    # Compute SVD of cross-covariance
    U, _, Vt = torch.linalg.svd(X.T @ Y, full_matrices=False)
    W = U @ Vt  # orthogonal map d_text→d_img
    return W.T   # shape (d_img, d_text) for nn.Linear weight


def apply_procrustes_init_to_final(model, text_sample, img_sample):
    """
    Apply Procrustes initialization to a model.
    - For MLP / ResidualMLP: apply to final Linear layer (hidden -> img_dim)
    - For TransformerTranslator: apply to first projection (text_dim -> img_dim)
    """
    with torch.no_grad():
        # Compute Procrustes matrix
        W = procrustes_init(text_embs=text_sample, img_embs=img_sample)

        # Apply to the appropriate layer
        applied = False
        for name, m in model.named_modules():
            if isinstance(m, nn.Linear):
                # Transformer: apply to first projection (proj_in)
                if isinstance(model, TransformerTranslator) and name.endswith("proj_in"):
                    print(m.weight.shape, W.shape)
                    if m.weight.shape == W.shape:
                        m.weight.copy_(W)
                        applied = True
                        break
                # MLP / ResidualMLP: apply to final_proj
                elif isinstance(model, LatentSpaceTranslator) and name.endswith("res_proj"):
                    print(m.weight.shape, W.shape)
                    if m.weight.shape == W.shape:
                        m.weight.copy_(W)
                        applied = True
                        break

                elif isinstance(model, ResidualMLPTranslator) and name.endswith("res_proj"):
                    print(m.weight.shape, W.shape)
                    if m.weight.shape == W.shape:
                        m.weight.copy_(W)
                        applied = True
                        break

        if not applied:
            print("⚠️ Warning: Could not find matching layer for Procrustes init")
    return model


In [13]:
# ---------- Training loop with Procrustes + InfoNCE ----------
def training(model, train_loader, val_loader, device, epochs, lr, MODEL_PATH,
             use_procrustes_init=True, procrustes_subset=10000, temperature=0.07,
             queue_size=4098):
    """Train LatentSpaceTranslator with optional Procrustes init + InfoNCE loss."""
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=5e-3)
    best_val_loss = float('inf')

    # --- Optional: Procrustes initialization ---
    if use_procrustes_init:
        print("Computing Procrustes initialization...")
        text_list, img_list = [], []
        for i, (X, y) in enumerate(train_loader):
            text_list.append(X.cpu())
            img_list.append(y.cpu())
            if sum(t.shape[0] for t in text_list) >= procrustes_subset:
                break
        text_sample = torch.cat(text_list, dim=0)[:procrustes_subset]
        img_sample = torch.cat(img_list, dim=0)[:procrustes_subset]
        model = apply_procrustes_init_to_final(model, text_sample, img_sample)

    criterion = QueueInfoNCELoss(dim=1536, temperature=temperature, queue_size=queue_size).to(device)

    # --- Training ---
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()

            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            #loss += 0.1 * F.mse_loss(pred, y_batch)
            loss += 1 - F.cosine_similarity(pred, y_batch).mean()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()

            with torch.no_grad():
              keys = F.normalize(y_batch, dim=1).detach()   # image embeddings (targets) as keys
              # put them into the queue
              criterion._enqueue(keys)

        train_loss /= len(train_loader)

        # --- Validation ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                pred = model(X_batch)
                loss = criterion(pred, y_batch)
                loss += 1 - F.cosine_similarity(pred, y_batch).mean()
                #loss += 0.1 * F.mse_loss(pred, y_batch)
                val_loss += loss.item()

                keys = F.normalize(y_batch, dim=1).detach()   # image embeddings (targets) as keys
                criterion._enqueue(keys)


        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

        # --- Save best model ---
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), MODEL_PATH)
            print(f"  ✓ Saved best model (val_loss={val_loss:.6f})")

    return model

### Load the Data

In [15]:
# 4. Data Augmentation
# 5. Zero Shot Stitching
# 6. Triplet Loss / Improve InfoNCE Loss / bidirectional / SimCLR / MoCo
# 7. Autoencoder
# Configuration
EPOCHS = 100
BATCH_SIZE = 512
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load data
train_data = load_data("drive/MyDrive/data/train/train.npz")
#train_data = load_data('processed_augmented_data.npz')
X, y, label = prepare_train_data(train_data)
DATASET_SIZE = len(X)
# Split train/val
# This is done only to measure generalization capabilities, you don't have to
# use a validation set (though we encourage this)
n_train = int(0.9 * len(X))
TRAIN_SPLIT = torch.zeros(len(X), dtype=torch.bool)
TRAIN_SPLIT[:n_train] = 1
X_train, X_val = X[TRAIN_SPLIT], X[~TRAIN_SPLIT]
y_train, y_val = y[TRAIN_SPLIT], y[~TRAIN_SPLIT]

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
y_train.shape, X_train.shape, train_loader.batch_size, val_loader.batch_size

(125000,)
Train data: 125000 captions, 125000 images


(torch.Size([112500, 1536]), torch.Size([112500, 1024]), 512, 512)

### Hyperparameter Optimization

In [16]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m399.4/404.7 kB[0m [31m11.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [17]:
import optuna

def objective_extended(arch, trial, train_dataloader, val_dataloader, device, MODEL_PATH_BASE):

    # --- Common hyperparameters ---
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    #lr = trial.suggest_loguniform("lr", 5e-4, 1e-2)
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-3)

    # --- New hyperparameters ---
    temperature = trial.suggest_float("temperature", 0.01, 0.2)
    queue_size = trial.suggest_categorical("queue_size", [2048, 4098, 8196])
    #w_infonce = trial.suggest_float("w_infonce", 0.6, 0.8)
    #w_cos = trial.suggest_float("w_cos", 0.4, 1.0)
    #w_mse = trial.suggest_float("w_mse", 1.0 - w_cos, 1.0)
    procrustes_subset = 10000

    # --- Architecture-specific hyperparameters ---
    if arch in ["MLP", "ResidualMLP"]:
        hidden_dim = trial.suggest_categorical("hidden_dim", [1024, 2048, 4096])
        num_layers = trial.suggest_int("num_layers", 2, 6)
        if arch == "MLP":
            model = LatentSpaceTranslator(
                text_dim=1024, img_dim=1536, hidden_dim=hidden_dim,
                num_layers=num_layers, dropout=dropout
            ).to(device)
        else:
            model = ResidualMLPTranslator(
                text_dim=1024, img_dim=1536, hidden_dim=hidden_dim,
                num_layers=num_layers, dropout=dropout
            ).to(device)
    elif arch == "Transformer":
        n_layers = trial.suggest_int("n_layers", 2, 6)
        n_heads = trial.suggest_categorical("n_heads", [4, 8, 12])
        dim_feedforward = trial.suggest_categorical("dim_feedforward", [1024, 2048, 4096])
        model = TransformerTranslator(
            text_dim=1024, img_dim=1536,
            n_heads=n_heads, n_layers=n_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        ).to(device)


    # --- Apply Procrustes initialization ---
    if procrustes_subset > 0:
        # Get subset from train_loader
        text_list, img_list = [], []
        for i, (X, y) in enumerate(train_loader):
            text_list.append(X.cpu())
            img_list.append(y.cpu())
            if sum(t.shape[0] for t in text_list) >= procrustes_subset:
                break
        text_sample = torch.cat(text_list, dim=0)[:procrustes_subset]
        img_sample = torch.cat(img_list, dim=0)[:procrustes_subset]
        model = apply_procrustes_init_to_final(model, text_sample, img_sample)

    criterion = QueueInfoNCELoss(dim=1536, temperature=temperature, queue_size=queue_size).to(device)
    # --- Training loop (short run) ---
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=weight_decay)
    model.train()
    torch.autograd.set_detect_anomaly(True)
    for epoch in range(5):  # short training
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            pred = model(X_batch)

            # Weighted combination of losses
            loss = criterion(pred, y_batch)
            #loss += 0.1 * F.mse_loss(pred, y_batch)
            loss += 1 - F.cosine_similarity(pred, y_batch).mean()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            with torch.no_grad():
              keys = F.normalize(y_batch, dim=1).detach()   # image embeddings (targets) as keys
              # put them into the queue
              criterion._enqueue(keys)

    # --- Evaluate on validation ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            pred = model(X_batch)
            # Use combined loss for evaluation
            loss =  criterion(pred, y_batch)
            #loss += 0.1 * F.mse_loss(pred, y_batch)
            loss += 1 - F.cosine_similarity(pred, y_batch).mean()

            val_loss += loss.item()

            keys = F.normalize(y_batch, dim=1).detach()
            criterion._enqueue(keys)

    val_loss /= len(val_loader)

    return val_loss


def run_optuna_extended(arch, train_dataloader, val_dataloader, device, MODEL_PATH_BASE, n_trials=20):
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective_extended(arch, trial, train_dataloader, val_dataloader, device, MODEL_PATH_BASE),
                   n_trials=n_trials)

    print("Best trial:")
    trial = study.best_trial
    print(f"Val loss: {trial.value}")
    print("Best hyperparameters:")
    for key, value in trial.params.items():
        print(f"  {key}: {value}")

    return trial.params

In [18]:
archs = ['MLP', 'ResidualMLP', 'Transformer']
choosen_arch = archs[0]
best_params = run_optuna_extended(
    arch = choosen_arch,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    device=DEVICE,
    MODEL_PATH_BASE="models/translator_optuna"
)

[I 2025-11-12 11:45:16,026] A new study created in memory with name: no-name-714af6b8-a7aa-4816-b8d5-feb84b4448c6
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-3)


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:45:45,058] Trial 0 finished with value: 3.8088408088684083 and parameters: {'dropout': 0.31109310280918223, 'weight_decay': 0.0003210548718670248, 'temperature': 0.10170534399911904, 'queue_size': 4098, 'hidden_dim': 4096, 'num_layers': 2}. Best is trial 0 with value: 3.8088408088684083.
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-3)


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:46:15,854] Trial 1 finished with value: 3.517496166229248 and parameters: {'dropout': 0.13180921500780782, 'weight_decay': 0.00022909790110840655, 'temperature': 0.10670498350968033, 'queue_size': 2048, 'hidden_dim': 1024, 'num_layers': 4}. Best is trial 1 with value: 3.517496166229248.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:46:52,170] Trial 2 finished with value: 3.9462575340270996 and parameters: {'dropout': 0.44108103991643033, 'weight_decay': 0.00014944392954697094, 'temperature': 0.12979795867618718, 'queue_size': 4098, 'hidden_dim': 2048, 'num_layers': 5}. Best is trial 1 with value: 3.517496166229248.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:47:25,469] Trial 3 finished with value: 3.4866329288482665 and parameters: {'dropout': 0.3804782419945405, 'weight_decay': 0.0008038361762312706, 'temperature': 0.09015851124212126, 'queue_size': 2048, 'hidden_dim': 1024, 'num_layers': 5}. Best is trial 3 with value: 3.4866329288482665.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:47:52,214] Trial 4 finished with value: 4.074366760253906 and parameters: {'dropout': 0.3375794670167134, 'weight_decay': 0.00018194089893996677, 'temperature': 0.15945605530959925, 'queue_size': 4098, 'hidden_dim': 4096, 'num_layers': 2}. Best is trial 3 with value: 3.4866329288482665.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:48:28,385] Trial 5 finished with value: 3.1395943641662596 and parameters: {'dropout': 0.35659194846054365, 'weight_decay': 3.248189176509974e-05, 'temperature': 0.0677638312673093, 'queue_size': 2048, 'hidden_dim': 2048, 'num_layers': 5}. Best is trial 5 with value: 3.1395943641662596.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:49:00,882] Trial 6 finished with value: 3.710578565597534 and parameters: {'dropout': 0.4914993262817734, 'weight_decay': 0.00012428952743359016, 'temperature': 0.1637285044242881, 'queue_size': 2048, 'hidden_dim': 2048, 'num_layers': 4}. Best is trial 5 with value: 3.1395943641662596.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:49:37,228] Trial 7 finished with value: 3.4625420665740965 and parameters: {'dropout': 0.42723310960514616, 'weight_decay': 9.980175093871379e-05, 'temperature': 0.07820884430145857, 'queue_size': 2048, 'hidden_dim': 1024, 'num_layers': 6}. Best is trial 5 with value: 3.1395943641662596.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:50:03,586] Trial 8 finished with value: 3.87760443687439 and parameters: {'dropout': 0.4492163811173664, 'weight_decay': 4.425745510730496e-05, 'temperature': 0.062187848480185715, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 2}. Best is trial 5 with value: 3.1395943641662596.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:50:39,507] Trial 9 finished with value: 3.363137502670288 and parameters: {'dropout': 0.1977297134608129, 'weight_decay': 0.00016527561502254886, 'temperature': 0.09863640049227711, 'queue_size': 2048, 'hidden_dim': 2048, 'num_layers': 5}. Best is trial 5 with value: 3.1395943641662596.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:51:20,300] Trial 10 finished with value: 3.1151181983947756 and parameters: {'dropout': 0.21483333850235065, 'weight_decay': 1.0128509977781696e-05, 'temperature': 0.011430290621441241, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 6}. Best is trial 10 with value: 3.1151181983947756.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:52:01,593] Trial 11 finished with value: 3.0133916473388673 and parameters: {'dropout': 0.2310166860441993, 'weight_decay': 1.0091659001046412e-05, 'temperature': 0.015710425551538724, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 6}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:52:43,414] Trial 12 finished with value: 3.1331115341186524 and parameters: {'dropout': 0.23287777350988823, 'weight_decay': 1.1101271014855722e-05, 'temperature': 0.011728895324403426, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 6}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:53:25,563] Trial 13 finished with value: 3.223958806991577 and parameters: {'dropout': 0.24017614001793963, 'weight_decay': 1.1948107258594625e-05, 'temperature': 0.010572757121261711, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 6}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:53:59,835] Trial 14 finished with value: 3.280655708312988 and parameters: {'dropout': 0.1050918685315026, 'weight_decay': 2.3594525762975702e-05, 'temperature': 0.036445611917069354, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 4}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:54:52,405] Trial 15 finished with value: 3.2302824306488036 and parameters: {'dropout': 0.18125283932913774, 'weight_decay': 1.9911584062345034e-05, 'temperature': 0.03865643923878595, 'queue_size': 8196, 'hidden_dim': 4096, 'num_layers': 6}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:55:22,820] Trial 16 finished with value: 3.4398776054382325 and parameters: {'dropout': 0.27987147562980463, 'weight_decay': 5.89360481345453e-05, 'temperature': 0.040165477837019015, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 3}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:56:04,912] Trial 17 finished with value: 3.0950373077392577 and parameters: {'dropout': 0.16020256544438283, 'weight_decay': 1.0003430251032675e-05, 'temperature': 0.0288167590517238, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 6}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:56:38,373] Trial 18 finished with value: 4.390023918151855 and parameters: {'dropout': 0.16030929239159294, 'weight_decay': 1.630557279436331e-05, 'temperature': 0.19110976581417005, 'queue_size': 8196, 'hidden_dim': 4096, 'num_layers': 3}. Best is trial 11 with value: 3.0133916473388673.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 11:57:13,823] Trial 19 finished with value: 3.590290079116821 and parameters: {'dropout': 0.2687017572207966, 'weight_decay': 6.673277851813183e-05, 'temperature': 0.043517097808105545, 'queue_size': 8196, 'hidden_dim': 1024, 'num_layers': 5}. Best is trial 11 with value: 3.0133916473388673.


Best trial:
Val loss: 3.0133916473388673
Best hyperparameters:
  dropout: 0.2310166860441993
  weight_decay: 1.0091659001046412e-05
  temperature: 0.015710425551538724
  queue_size: 8196
  hidden_dim: 2048
  num_layers: 6


### Training and Submission File Creation

In [19]:
if choosen_arch == 'Transformer':
    model = TransformerTranslator(
        text_dim=1024,
        img_dim=1536,
        n_heads = best_params['n_heads'],
        n_layers=best_params['n_layers'],
        dim_feedforward=best_params['dim_feedforward'],
        dropout=best_params['dropout']
    ).to(DEVICE)
    MODEL_PATH = "drive/MyDrive/data//models/transformer.pth"

elif choosen_arch == 'MLP':
    model = LatentSpaceTranslator(
    text_dim=1024,
    img_dim=1536,
    hidden_dim=best_params["hidden_dim"],
    num_layers=best_params["num_layers"],
    dropout=best_params["dropout"]).to(DEVICE)
    MODEL_PATH = "drive/MyDrive/data/models/latent_space.pth"

else:
    model = ResidualMLPTranslator(
    text_dim=1024,
    img_dim=1536,
    hidden_dim=best_params["hidden_dim"],
    num_layers=best_params["num_layers"],
    dropout=best_params["dropout"]).to(DEVICE)
    MODEL_PATH = "drive/MyDrive/data/models/residual.pth"

In [20]:
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Train
print("\n3. Training...")
model = training(model,
                 train_loader,
                 val_loader,
                 DEVICE,
                 EPOCHS,
                 1e-6,
                 MODEL_PATH,
                 True,
                 10000,
                 best_params["temperature"],
                 best_params["queue_size"])

   Parameters: 23,631,872

3. Training...
Computing Procrustes initialization...
torch.Size([1536, 1024]) torch.Size([1536, 1024])


Epoch 1/60: 100%|██████████| 220/220 [00:07<00:00, 27.76it/s]


Epoch 1: Train Loss = 3.426384, Val Loss = 3.417919
  ✓ Saved best model (val_loss=3.417919)


Epoch 2/60: 100%|██████████| 220/220 [00:07<00:00, 27.70it/s]


Epoch 2: Train Loss = 3.306604, Val Loss = 3.269392
  ✓ Saved best model (val_loss=3.269392)


Epoch 3/60: 100%|██████████| 220/220 [00:07<00:00, 28.49it/s]


Epoch 3: Train Loss = 3.185855, Val Loss = 3.171642
  ✓ Saved best model (val_loss=3.171642)


Epoch 4/60: 100%|██████████| 220/220 [00:07<00:00, 28.01it/s]


Epoch 4: Train Loss = 3.089784, Val Loss = 3.085223
  ✓ Saved best model (val_loss=3.085223)


Epoch 5/60: 100%|██████████| 220/220 [00:07<00:00, 28.54it/s]


Epoch 5: Train Loss = 3.004096, Val Loss = 3.014668
  ✓ Saved best model (val_loss=3.014668)


Epoch 6/60: 100%|██████████| 220/220 [00:07<00:00, 28.37it/s]


Epoch 6: Train Loss = 2.932958, Val Loss = 2.948270
  ✓ Saved best model (val_loss=2.948270)


Epoch 7/60: 100%|██████████| 220/220 [00:07<00:00, 28.02it/s]


Epoch 7: Train Loss = 2.873962, Val Loss = 2.900546
  ✓ Saved best model (val_loss=2.900546)


Epoch 8/60: 100%|██████████| 220/220 [00:07<00:00, 28.49it/s]


Epoch 8: Train Loss = 2.829457, Val Loss = 2.861967
  ✓ Saved best model (val_loss=2.861967)


Epoch 9/60: 100%|██████████| 220/220 [00:07<00:00, 28.01it/s]


Epoch 9: Train Loss = 2.793063, Val Loss = 2.835280
  ✓ Saved best model (val_loss=2.835280)


Epoch 10/60: 100%|██████████| 220/220 [00:07<00:00, 27.74it/s]


Epoch 10: Train Loss = 2.764973, Val Loss = 2.810548
  ✓ Saved best model (val_loss=2.810548)


Epoch 11/60: 100%|██████████| 220/220 [00:07<00:00, 28.32it/s]


Epoch 11: Train Loss = 2.740884, Val Loss = 2.795568
  ✓ Saved best model (val_loss=2.795568)


Epoch 12/60: 100%|██████████| 220/220 [00:07<00:00, 28.10it/s]


Epoch 12: Train Loss = 2.722146, Val Loss = 2.778717
  ✓ Saved best model (val_loss=2.778717)


Epoch 13/60: 100%|██████████| 220/220 [00:07<00:00, 28.12it/s]


Epoch 13: Train Loss = 2.704925, Val Loss = 2.763424
  ✓ Saved best model (val_loss=2.763424)


Epoch 14/60: 100%|██████████| 220/220 [00:07<00:00, 28.55it/s]


Epoch 14: Train Loss = 2.691259, Val Loss = 2.753586
  ✓ Saved best model (val_loss=2.753586)


Epoch 15/60: 100%|██████████| 220/220 [00:07<00:00, 28.02it/s]


Epoch 15: Train Loss = 2.678462, Val Loss = 2.742776
  ✓ Saved best model (val_loss=2.742776)


Epoch 16/60: 100%|██████████| 220/220 [00:07<00:00, 28.10it/s]


Epoch 16: Train Loss = 2.667339, Val Loss = 2.734546
  ✓ Saved best model (val_loss=2.734546)


Epoch 17/60: 100%|██████████| 220/220 [00:07<00:00, 28.55it/s]


Epoch 17: Train Loss = 2.658144, Val Loss = 2.724766
  ✓ Saved best model (val_loss=2.724766)


Epoch 18/60: 100%|██████████| 220/220 [00:07<00:00, 27.91it/s]


Epoch 18: Train Loss = 2.647968, Val Loss = 2.719656
  ✓ Saved best model (val_loss=2.719656)


Epoch 19/60: 100%|██████████| 220/220 [00:07<00:00, 27.98it/s]


Epoch 19: Train Loss = 2.638057, Val Loss = 2.709484
  ✓ Saved best model (val_loss=2.709484)


Epoch 20/60: 100%|██████████| 220/220 [00:07<00:00, 28.60it/s]


Epoch 20: Train Loss = 2.629527, Val Loss = 2.701887
  ✓ Saved best model (val_loss=2.701887)


Epoch 21/60: 100%|██████████| 220/220 [00:07<00:00, 27.81it/s]


Epoch 21: Train Loss = 2.621030, Val Loss = 2.698568
  ✓ Saved best model (val_loss=2.698568)


Epoch 22/60: 100%|██████████| 220/220 [00:07<00:00, 27.84it/s]


Epoch 22: Train Loss = 2.614425, Val Loss = 2.687156
  ✓ Saved best model (val_loss=2.687156)


Epoch 23/60: 100%|██████████| 220/220 [00:07<00:00, 28.69it/s]


Epoch 23: Train Loss = 2.606500, Val Loss = 2.685521
  ✓ Saved best model (val_loss=2.685521)


Epoch 24/60: 100%|██████████| 220/220 [00:07<00:00, 28.02it/s]


Epoch 24: Train Loss = 2.598246, Val Loss = 2.677850
  ✓ Saved best model (val_loss=2.677850)


Epoch 25/60: 100%|██████████| 220/220 [00:07<00:00, 28.05it/s]


Epoch 25: Train Loss = 2.591285, Val Loss = 2.672535
  ✓ Saved best model (val_loss=2.672535)


Epoch 26/60: 100%|██████████| 220/220 [00:07<00:00, 28.87it/s]


Epoch 26: Train Loss = 2.583137, Val Loss = 2.667942
  ✓ Saved best model (val_loss=2.667942)


Epoch 27/60: 100%|██████████| 220/220 [00:07<00:00, 27.99it/s]


Epoch 27: Train Loss = 2.577218, Val Loss = 2.662111
  ✓ Saved best model (val_loss=2.662111)


Epoch 28/60: 100%|██████████| 220/220 [00:07<00:00, 28.63it/s]


Epoch 28: Train Loss = 2.570243, Val Loss = 2.656868
  ✓ Saved best model (val_loss=2.656868)


Epoch 29/60: 100%|██████████| 220/220 [00:07<00:00, 28.87it/s]


Epoch 29: Train Loss = 2.565428, Val Loss = 2.653223
  ✓ Saved best model (val_loss=2.653223)


Epoch 30/60: 100%|██████████| 220/220 [00:07<00:00, 28.32it/s]


Epoch 30: Train Loss = 2.557887, Val Loss = 2.648580
  ✓ Saved best model (val_loss=2.648580)


Epoch 31/60: 100%|██████████| 220/220 [00:07<00:00, 28.44it/s]


Epoch 31: Train Loss = 2.552548, Val Loss = 2.640940
  ✓ Saved best model (val_loss=2.640940)


Epoch 32/60: 100%|██████████| 220/220 [00:07<00:00, 28.15it/s]


Epoch 32: Train Loss = 2.547207, Val Loss = 2.638572
  ✓ Saved best model (val_loss=2.638572)


Epoch 33/60: 100%|██████████| 220/220 [00:07<00:00, 28.41it/s]


Epoch 33: Train Loss = 2.541510, Val Loss = 2.632318
  ✓ Saved best model (val_loss=2.632318)


Epoch 34/60: 100%|██████████| 220/220 [00:07<00:00, 28.81it/s]


Epoch 34: Train Loss = 2.537048, Val Loss = 2.629783
  ✓ Saved best model (val_loss=2.629783)


Epoch 35/60: 100%|██████████| 220/220 [00:07<00:00, 27.94it/s]


Epoch 35: Train Loss = 2.529754, Val Loss = 2.623141
  ✓ Saved best model (val_loss=2.623141)


Epoch 36/60: 100%|██████████| 220/220 [00:07<00:00, 28.31it/s]


Epoch 36: Train Loss = 2.524500, Val Loss = 2.620475
  ✓ Saved best model (val_loss=2.620475)


Epoch 37/60: 100%|██████████| 220/220 [00:07<00:00, 28.69it/s]


Epoch 37: Train Loss = 2.520383, Val Loss = 2.614131
  ✓ Saved best model (val_loss=2.614131)


Epoch 38/60: 100%|██████████| 220/220 [00:07<00:00, 28.21it/s]


Epoch 38: Train Loss = 2.515061, Val Loss = 2.612328
  ✓ Saved best model (val_loss=2.612328)


Epoch 39/60: 100%|██████████| 220/220 [00:07<00:00, 28.42it/s]


Epoch 39: Train Loss = 2.510062, Val Loss = 2.606597
  ✓ Saved best model (val_loss=2.606597)


Epoch 40/60: 100%|██████████| 220/220 [00:07<00:00, 28.70it/s]


Epoch 40: Train Loss = 2.505057, Val Loss = 2.606957


Epoch 41/60: 100%|██████████| 220/220 [00:07<00:00, 28.11it/s]


Epoch 41: Train Loss = 2.500260, Val Loss = 2.600554
  ✓ Saved best model (val_loss=2.600554)


Epoch 42/60: 100%|██████████| 220/220 [00:07<00:00, 28.36it/s]


Epoch 42: Train Loss = 2.495425, Val Loss = 2.599266
  ✓ Saved best model (val_loss=2.599266)


Epoch 43/60: 100%|██████████| 220/220 [00:07<00:00, 28.44it/s]


Epoch 43: Train Loss = 2.490569, Val Loss = 2.593399
  ✓ Saved best model (val_loss=2.593399)


Epoch 44/60: 100%|██████████| 220/220 [00:07<00:00, 28.13it/s]


Epoch 44: Train Loss = 2.486335, Val Loss = 2.590785
  ✓ Saved best model (val_loss=2.590785)


Epoch 45/60: 100%|██████████| 220/220 [00:07<00:00, 28.05it/s]


Epoch 45: Train Loss = 2.481030, Val Loss = 2.587498
  ✓ Saved best model (val_loss=2.587498)


Epoch 46/60: 100%|██████████| 220/220 [00:07<00:00, 28.51it/s]


Epoch 46: Train Loss = 2.478374, Val Loss = 2.584788
  ✓ Saved best model (val_loss=2.584788)


Epoch 47/60: 100%|██████████| 220/220 [00:07<00:00, 27.95it/s]


Epoch 47: Train Loss = 2.473210, Val Loss = 2.581821
  ✓ Saved best model (val_loss=2.581821)


Epoch 48/60: 100%|██████████| 220/220 [00:07<00:00, 28.47it/s]


Epoch 48: Train Loss = 2.469078, Val Loss = 2.578604
  ✓ Saved best model (val_loss=2.578604)


Epoch 49/60: 100%|██████████| 220/220 [00:07<00:00, 28.73it/s]


Epoch 49: Train Loss = 2.464512, Val Loss = 2.576308
  ✓ Saved best model (val_loss=2.576308)


Epoch 50/60: 100%|██████████| 220/220 [00:07<00:00, 27.85it/s]


Epoch 50: Train Loss = 2.459377, Val Loss = 2.572313
  ✓ Saved best model (val_loss=2.572313)


Epoch 51/60: 100%|██████████| 220/220 [00:07<00:00, 28.43it/s]


Epoch 51: Train Loss = 2.457133, Val Loss = 2.567004
  ✓ Saved best model (val_loss=2.567004)


Epoch 52/60: 100%|██████████| 220/220 [00:07<00:00, 28.75it/s]


Epoch 52: Train Loss = 2.453264, Val Loss = 2.567405


Epoch 53/60: 100%|██████████| 220/220 [00:07<00:00, 28.15it/s]


Epoch 53: Train Loss = 2.449815, Val Loss = 2.562055
  ✓ Saved best model (val_loss=2.562055)


Epoch 54/60: 100%|██████████| 220/220 [00:07<00:00, 28.28it/s]


Epoch 54: Train Loss = 2.443677, Val Loss = 2.560884
  ✓ Saved best model (val_loss=2.560884)


Epoch 55/60: 100%|██████████| 220/220 [00:07<00:00, 28.90it/s]


Epoch 55: Train Loss = 2.441342, Val Loss = 2.559056
  ✓ Saved best model (val_loss=2.559056)


Epoch 56/60: 100%|██████████| 220/220 [00:07<00:00, 27.79it/s]


Epoch 56: Train Loss = 2.436480, Val Loss = 2.556676
  ✓ Saved best model (val_loss=2.556676)


Epoch 57/60: 100%|██████████| 220/220 [00:07<00:00, 28.72it/s]


Epoch 57: Train Loss = 2.432628, Val Loss = 2.553912
  ✓ Saved best model (val_loss=2.553912)


Epoch 58/60: 100%|██████████| 220/220 [00:07<00:00, 28.26it/s]


Epoch 58: Train Loss = 2.429355, Val Loss = 2.549172
  ✓ Saved best model (val_loss=2.549172)


Epoch 59/60: 100%|██████████| 220/220 [00:07<00:00, 27.90it/s]


Epoch 59: Train Loss = 2.426901, Val Loss = 2.548657
  ✓ Saved best model (val_loss=2.548657)


Epoch 60/60: 100%|██████████| 220/220 [00:07<00:00, 27.90it/s]


Epoch 60: Train Loss = 2.420790, Val Loss = 2.548827


In [21]:
model.load_state_dict(torch.load(MODEL_PATH))
test_data = load_data("drive/MyDrive/data/test/test.clean.npz")

test_embds = test_data['captions/embeddings']
test_embds = torch.from_numpy(test_embds).float()

with torch.no_grad():
    pred_embds = model(test_embds.to(DEVICE)).cpu()

submission = generate_submission(test_data['captions/ids'], pred_embds, f'{choosen_arch}_submission.csv')
print(f"Model saved to: {MODEL_PATH}")

Generating submission file...
✓ Saved submission to MLP_submission.csv
Model saved to: drive/MyDrive/data/models/latent_space.pth
