### model.py

In [36]:
from typing import Optional
from torch import nn
from torch.nn import functional as F
import torch


class Translator(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_layers):
        super().__init__()
        layers = []
        last = input_dim
        for hidden in hidden_layers:
            layers += [nn.Linear(last, hidden), nn.GELU(), nn.LayerNorm(hidden), nn.Dropout(0.3)]
            last = hidden
        layers.append(nn.Linear(last, output_dim))
        self.net = nn.Sequential(*layers)
        def init_weights(module):
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0.0)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
        model.apply(init_weights)
        
    

    def forward(self, x):
        out = self.net(x)
        
        dir_vec = F.normalize(out, dim=-1)
        return dir_vec


### eval.py

In [37]:
from pathlib import Path
import numpy as np
import torch
import pandas as pd

'''Code from https://github.com/Mamiglia/challenge'''

def mrr(pred_indices: np.ndarray, gt_indices: np.ndarray) -> float:
    """
    Compute Mean Reciprocal Rank (MRR)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries (top-K)
        gt_indices: (N,) array of ground truth indices
    Returns:
        mrr: Mean Reciprocal Rank
    """
    reciprocal_ranks = []
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i] == gt_indices[i])[0]
        if matches.size > 0:
            reciprocal_ranks.append(1.0 / (matches[0] + 1))
        else:
            reciprocal_ranks.append(0.0)
    return np.mean(reciprocal_ranks)


def recall_at_k(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int) -> float:
    """Compute Recall@k
    Args:
        pred_indices: (N, N) array of top indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        recall: Recall@k
    """
    recall = 0
    for i in range(len(gt_indices)):
        if gt_indices[i] in pred_indices[i, :k]:
            recall += 1
    recall /= len(gt_indices)
    return recall

import numpy as np

def ndcg(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int = 100) -> float:
    """
    Compute Normalized Discounted Cumulative Gain (NDCG@k)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        ndcg: NDCG@k
    """
    ndcg_total = 0.0
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i, :k] == gt_indices[i])[0]
        if matches.size > 0:
            rank = matches[0] + 1
            ndcg_total += 1.0 / np.log2(rank + 1)  # DCG (IDCG = 1)
    return ndcg_total / len(gt_indices)



@torch.inference_mode()
def evaluate_retrieval(translated_embd, image_embd, gt_indices, max_indices = 99, batch_size=100):
    """Evaluate retrieval performance using cosine similarity
    Args:
        translated_embd: (N_captions, D) translated caption embeddings
        image_embd: (N_images, D) image embeddings
        gt_indices: (N_captions,) ground truth image indices for each caption
        max_indices: number of top predictions to consider
    Returns:
        results: dict of evaluation metrics
    
    """
    # Compute similarity matrix
    if isinstance(translated_embd, np.ndarray):
        translated_embd = torch.from_numpy(translated_embd).float()
    if isinstance(image_embd, np.ndarray):
        image_embd = torch.from_numpy(image_embd).float()
    
    n_queries = translated_embd.shape[0]
    device = translated_embd.device
    
    # Prepare containers for the fragments to be reassembled
    all_sorted_indices = []
    l2_distances = []
    
    # Process in batches - the narrow gate approach
    for start_idx in range(0, n_queries, batch_size):
        batch_slice = slice(start_idx, min(start_idx + batch_size, n_queries))
        batch_translated = translated_embd[batch_slice]
        batch_img_embd = image_embd[batch_slice]
        
        # Compute similarity only for this batch
        batch_similarity = batch_translated @ batch_img_embd.T

        # Get top-k predictions for this batch
        batch_indices = batch_similarity.topk(k=max_indices, dim=1, sorted=True).indices.numpy()
        all_sorted_indices.append(gt_indices[batch_slice][batch_indices])

        # Compute L2 distance for this batch
        batch_gt = gt_indices[batch_slice]
        batch_gt_embeddings = image_embd[batch_gt]
        batch_l2 = (batch_translated - batch_gt_embeddings).norm(dim=1)
        l2_distances.append(batch_l2)
    
    # Reassemble the fragments
    sorted_indices = np.concatenate(all_sorted_indices, axis=0)
    
    # Apply the sacred metrics to the whole
    metrics = {
        'mrr': mrr,
        'ndcg': ndcg,
        'recall_at_1': lambda preds, gt: recall_at_k(preds, gt, 1),
        'recall_at_3': lambda preds, gt: recall_at_k(preds, gt, 3),
        'recall_at_5': lambda preds, gt: recall_at_k(preds, gt, 5),
        'recall_at_10': lambda preds, gt: recall_at_k(preds, gt, 10),
        'recall_at_50': lambda preds, gt: recall_at_k(preds, gt, 50),
    }
    
    results = {
        name: func(sorted_indices, gt_indices)
        for name, func in metrics.items()
    }
    
    l2_dist = torch.cat(l2_distances, dim=0).mean().item()
    results['l2_dist'] = l2_dist
    
    return results

def eval_on_val(X_val: np.ndarray, y_val: np.ndarray, model: Translator, device) -> dict:
    gt_indices = torch.arange(len(y_val))
    
    model.eval()

    with torch.inference_mode():
        translated = model(X_val.to(device)).to('cpu')

    results = evaluate_retrieval(translated, y_val, gt_indices)
    
    return results

def generate_submission(model: Translator, test_path: Path, output_file="submission.csv", device=None):
    test_data = np.load(test_path)
    sample_ids = test_data['captions/ids']
    test_embds = test_data['captions/embeddings']
    test_embds = torch.from_numpy(test_embds).float()

    with torch.no_grad():
        pred_embds = model(test_embds.to(device)).cpu()

    print("Generating submission file...")

    if isinstance(pred_embds, torch.Tensor):
        pred_embds = pred_embds.cpu().numpy()

    df_submission = pd.DataFrame({'id': sample_ids, 'embedding': pred_embds.tolist()})

    df_submission.to_csv(output_file, index=False, float_format='%.17g')
    print(f"✓ Saved submission to {output_file}")

    return df_submission

### configs

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = 1024
output_dim = 1536
hidden_layers=[2048, 512]


batch_size= 8196
lr= 0.0001
epochs= 30
temp = 0.07
data_path= '/kaggle/input/aml-competition/train/train/train.npz'
test_path= '/kaggle/input/aml-competition/test/test/test.clean.npz'

model_save_path= './models/exp1.pth'

### main.py

In [39]:

from typing import Literal
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from pathlib import Path
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


def info_nce_loss(dir_preds, img_targets, temp=0.07):
    # dir_preds: (B, d) already normalized
    # img_targets: (B, d) raw image embeddings -> we will normalize them
    img_normed = F.normalize(img_targets, dim=-1)
    logits = (dir_preds @ img_normed.T) / temp   # (B, B)
    labels = torch.arange(logits.size(0), device=logits.device)
    #loss = F.cross_entropy(logits, labels)      # row-wise: positive is same-index
    #return loss
    loss_t2i = F.cross_entropy(logits, labels)          # for each text row, positive at same idx
    loss_i2t = F.cross_entropy(logits.T, labels)        # for each image row, positive at same idx
    return 0.5 * (loss_t2i + loss_i2t)
def symmetric_nt_xent(z_text, z_img, temp=0.07):
    B = z_text.size(0)
    # compute similarity matrix
    logits = (z_text @ z_img.T) / temp         # (B, B)
    labels = torch.arange(B, device=logits.device)

    loss_t2i = F.cross_entropy(logits, labels)          # for each text row, positive at same idx
    loss_i2t = F.cross_entropy(logits.T, labels)        # for each image row, positive at same idx
    return 0.5 * (loss_t2i + loss_i2t)


def train_model(model: Translator, model_path: Path, mode: str, 
                train_loader: DataLoader, val_loader: DataLoader,
                epochs: int, lr: float, temp: float) -> Translator:
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Using device: {device}")

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()

        train_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()

            outputs = model(X_batch)

            loss = info_nce_loss(outputs, y_batch, temp=temp)

            loss.backward()

            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        model.eval()

        val_loss = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)

                loss = info_nce_loss(outputs, y_batch, temp=temp)

                val_loss += loss.item()

        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss

            Path(model_path).parent.mkdir(parents=True, exist_ok=True)

            torch.save(model.state_dict(), model_path)

            print(f"✓ Saved best model (val_loss={val_loss:.6f})")

    return model

def load_data(data_path: Path, config: dict) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:

    data = np.load(data_path)
    caption_embeddings = data['captions/embeddings']
    image_embeddings = data['images/embeddings']
    caption_labels = data['captions/label']

    X_abs, y_abs = torch.tensor(caption_embeddings), torch.tensor(image_embeddings[np.argmax(caption_labels, axis=1)])
    
    print('Texts shape', X_abs.shape)
    print('Images shape', X_abs.shape)

    n_train = int(0.9 * X_abs.shape[0])
    train_split = torch.zeros(X_abs.shape[0], dtype=torch.bool)
    train_split[:n_train] = 1
    
    X_train, X_val = X_abs[train_split], X_abs[~train_split]
    y_train, y_val = y_abs[train_split], y_abs[~train_split]
    
    return X_train, y_train, X_val, y_val

    
def test(model: Translator, X_val: torch.Tensor, y_val: torch.tensor, device):
    results = eval_on_val(X_val, y_val, model=model, device=device)
    print("Test Results:", results)


def train(config: dict, model: Translator, X_train: torch.Tensor, y_train: torch.Tensor, X_val: torch.Tensor, y_val: torch.Tensor, temp):
        
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    train_model(model, model_save_path, 'affine', train_loader, val_loader, epochs, lr, temp)

    print('Finished training. Now testing using best model...')
    

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train, Y_train, X_val, y_val = load_data(data_path, dict())

model_args = {
    'input_dim': input_dim,
    'output_dim': output_dim,
    'hidden_layers': hidden_layers,
}
model = Translator(**model_args).to(device)

train(config=dict(), model=model, X_train=X_train, y_train=Y_train, X_val=X_val, y_val=y_val, temp=temp)

state = torch.load(model_save_path)
model.load_state_dict(state)

test(model, X_val, y_val, device)
generate_submission(model, Path(test_path), device=device)





Texts shape torch.Size([125000, 1024])
Images shape torch.Size([125000, 1024])
Using device: cuda


Epoch 1/30: 100%|██████████| 14/14 [00:05<00:00,  2.71it/s]


Epoch 1: Train Loss = 8.407278, Val Loss = 7.388747
✓ Saved best model (val_loss=7.388747)


Epoch 2/30: 100%|██████████| 14/14 [00:05<00:00,  2.69it/s]


Epoch 2: Train Loss = 7.668592, Val Loss = 6.915126
✓ Saved best model (val_loss=6.915126)


Epoch 3/30: 100%|██████████| 14/14 [00:05<00:00,  2.72it/s]


Epoch 3: Train Loss = 7.280057, Val Loss = 6.607900
✓ Saved best model (val_loss=6.607900)


Epoch 4/30: 100%|██████████| 14/14 [00:05<00:00,  2.53it/s]


Epoch 4: Train Loss = 7.019140, Val Loss = 6.397048
✓ Saved best model (val_loss=6.397048)


Epoch 5/30: 100%|██████████| 14/14 [00:05<00:00,  2.53it/s]


Epoch 5: Train Loss = 6.831478, Val Loss = 6.244561
✓ Saved best model (val_loss=6.244561)


Epoch 6/30: 100%|██████████| 14/14 [00:05<00:00,  2.51it/s]


Epoch 6: Train Loss = 6.691924, Val Loss = 6.129716
✓ Saved best model (val_loss=6.129716)


Epoch 7/30: 100%|██████████| 14/14 [00:05<00:00,  2.54it/s]


Epoch 7: Train Loss = 6.581156, Val Loss = 6.039628
✓ Saved best model (val_loss=6.039628)


Epoch 8/30: 100%|██████████| 14/14 [00:05<00:00,  2.64it/s]


Epoch 8: Train Loss = 6.489903, Val Loss = 5.966813
✓ Saved best model (val_loss=5.966813)


Epoch 9/30: 100%|██████████| 14/14 [00:05<00:00,  2.61it/s]


Epoch 9: Train Loss = 6.413766, Val Loss = 5.905116
✓ Saved best model (val_loss=5.905116)


Epoch 10/30: 100%|██████████| 14/14 [00:05<00:00,  2.79it/s]


Epoch 10: Train Loss = 6.349645, Val Loss = 5.852599
✓ Saved best model (val_loss=5.852599)


Epoch 11/30: 100%|██████████| 14/14 [00:05<00:00,  2.77it/s]


Epoch 11: Train Loss = 6.290544, Val Loss = 5.806819
✓ Saved best model (val_loss=5.806819)


Epoch 12/30: 100%|██████████| 14/14 [00:05<00:00,  2.76it/s]


Epoch 12: Train Loss = 6.237609, Val Loss = 5.766841
✓ Saved best model (val_loss=5.766841)


Epoch 13/30: 100%|██████████| 14/14 [00:05<00:00,  2.79it/s]


Epoch 13: Train Loss = 6.189693, Val Loss = 5.731109
✓ Saved best model (val_loss=5.731109)


Epoch 14/30: 100%|██████████| 14/14 [00:05<00:00,  2.75it/s]


Epoch 14: Train Loss = 6.145918, Val Loss = 5.697558
✓ Saved best model (val_loss=5.697558)


Epoch 15/30: 100%|██████████| 14/14 [00:05<00:00,  2.78it/s]


Epoch 15: Train Loss = 6.107567, Val Loss = 5.668140
✓ Saved best model (val_loss=5.668140)


Epoch 16/30: 100%|██████████| 14/14 [00:05<00:00,  2.61it/s]


Epoch 16: Train Loss = 6.070209, Val Loss = 5.641078
✓ Saved best model (val_loss=5.641078)


Epoch 17/30: 100%|██████████| 14/14 [00:05<00:00,  2.64it/s]


Epoch 17: Train Loss = 6.035472, Val Loss = 5.616101
✓ Saved best model (val_loss=5.616101)


Epoch 18/30: 100%|██████████| 14/14 [00:05<00:00,  2.55it/s]


Epoch 18: Train Loss = 6.002912, Val Loss = 5.592788
✓ Saved best model (val_loss=5.592788)


Epoch 19/30: 100%|██████████| 14/14 [00:05<00:00,  2.61it/s]


Epoch 19: Train Loss = 5.971116, Val Loss = 5.571547
✓ Saved best model (val_loss=5.571547)


Epoch 20/30: 100%|██████████| 14/14 [00:05<00:00,  2.69it/s]


Epoch 20: Train Loss = 5.942171, Val Loss = 5.551809
✓ Saved best model (val_loss=5.551809)


Epoch 21/30: 100%|██████████| 14/14 [00:05<00:00,  2.73it/s]


Epoch 21: Train Loss = 5.912519, Val Loss = 5.532024
✓ Saved best model (val_loss=5.532024)


Epoch 22/30: 100%|██████████| 14/14 [00:05<00:00,  2.69it/s]


Epoch 22: Train Loss = 5.884870, Val Loss = 5.513794
✓ Saved best model (val_loss=5.513794)


Epoch 23/30: 100%|██████████| 14/14 [00:05<00:00,  2.73it/s]


Epoch 23: Train Loss = 5.858105, Val Loss = 5.496948
✓ Saved best model (val_loss=5.496948)


Epoch 24/30: 100%|██████████| 14/14 [00:05<00:00,  2.61it/s]


Epoch 24: Train Loss = 5.833537, Val Loss = 5.480558
✓ Saved best model (val_loss=5.480558)


Epoch 25/30: 100%|██████████| 14/14 [00:05<00:00,  2.63it/s]


Epoch 25: Train Loss = 5.808824, Val Loss = 5.466507
✓ Saved best model (val_loss=5.466507)


Epoch 26/30: 100%|██████████| 14/14 [00:05<00:00,  2.60it/s]


Epoch 26: Train Loss = 5.784186, Val Loss = 5.451467
✓ Saved best model (val_loss=5.451467)


Epoch 27/30: 100%|██████████| 14/14 [00:05<00:00,  2.64it/s]


Epoch 27: Train Loss = 5.760222, Val Loss = 5.439779
✓ Saved best model (val_loss=5.439779)


Epoch 28/30: 100%|██████████| 14/14 [00:05<00:00,  2.65it/s]


Epoch 28: Train Loss = 5.737221, Val Loss = 5.426432
✓ Saved best model (val_loss=5.426432)


Epoch 29/30: 100%|██████████| 14/14 [00:05<00:00,  2.60it/s]


Epoch 29: Train Loss = 5.714628, Val Loss = 5.415672
✓ Saved best model (val_loss=5.415672)


Epoch 30/30: 100%|██████████| 14/14 [00:05<00:00,  2.63it/s]


Epoch 30: Train Loss = 5.693635, Val Loss = 5.405296
✓ Saved best model (val_loss=5.405296)
Finished training. Now testing using best model...
Test Results: {'mrr': 0.3974786955385354, 'ndcg': 0.5398756323725631, 'recall_at_1': 0.16328, 'recall_at_3': 0.50248, 'recall_at_5': 0.84312, 'recall_at_10': 0.94824, 'recall_at_50': 0.99888, 'l2_dist': 25.634349822998047}
Generating submission file...
✓ Saved submission to submission.csv


Unnamed: 0,id,embedding
0,1,"[0.01817476749420166, 0.0036171807441860437, 0..."
1,2,"[-0.00831032544374466, -0.0027059123385697603,..."
2,3,"[-0.008676226250827312, -0.02103574201464653, ..."
3,4,"[-0.010519394651055336, 0.009211746975779533, ..."
4,5,"[0.009460922330617905, 0.015212705358862877, 0..."
...,...,...
1495,1496,"[-0.011934620328247547, -0.02548549324274063, ..."
1496,1497,"[-0.00026255170814692974, 0.012432225979864597..."
1497,1498,"[0.003439840395003557, -0.009417425841093063, ..."
1498,1499,"[0.013452058658003807, 0.00010316609404981136,..."
