In [None]:
# Fetch the challenge repository via a shell escape (needs git and network access).
# NOTE(review): the import cell uses `dataset` and `eval` as top-level modules — presumably
# they live in this clone; confirm the cloned directory is on sys.path before importing.
!git clone https://github.com/the-summoning/aml-challenge.git

In [None]:
from torch import nn
from torch.nn import functional as F
import torch
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from pathlib import Path
import numpy as np
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
from dataset import get_datasets, get_data
from eval import test, generate_submission
import gdown

Download dataset

In [None]:
# Shared Google Drive folder fetched by this notebook.
url = "https://drive.google.com/drive/folders/1HWFHKCprFzR7H7TYhrE-W7v4bz2Vc7Ia"
# Download the whole folder without progress output and without using cookies.
# NOTE(review): the data paths configured later point at /kaggle/input — confirm what this
# folder provides and where gdown places it.
gdown.download_folder(url, quiet=True, use_cookies=False)

An MLP that maps text embeddings into the image-embedding space. It consists of two small MLP heads: one predicts the direction of the output vector and the other predicts its scale. The module also holds a learnable logit-scale parameter.

In [None]:
class SpaceTranslator(nn.Module):
    """Two-headed MLP that turns an input embedding into a scaled direction vector.

    The direction head outputs an ``output_dim`` vector which ``forward``
    L2-normalises; the scale head outputs a single Softplus-activated scalar
    that multiplies that unit vector. The module also owns a learnable scalar
    ``logit_scale`` (initialised to ``log(1 / 0.07)``); ``forward`` does not
    use it.

    Args:
        input_dim: Size of the input features fed to both heads.
        output_dim: Size of the vector produced by the direction head.
        dir_hidden_dims: Hidden layer widths of the direction head.
        scale_hidden_dims: Hidden layer widths of the scale head.
        activation: Zero-argument callable returning the activation module
            placed after each hidden linear layer.
        dropout_rate: Dropout probability applied after each hidden block.
        init_method: ``'xavier'`` or ``'kaiming'`` (case-insensitive); selects
            the weight initialisation of every linear layer.

    Raises:
        ValueError: If ``init_method`` is neither ``'xavier'`` nor ``'kaiming'``.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        dir_hidden_dims: list[int],
        scale_hidden_dims: list[int],
        activation=nn.ReLU,
        dropout_rate: float=0.3,
        init_method: str = 'xavier'
    ):
        super().__init__()

        self.init_method = init_method.lower()
        if self.init_method not in ('xavier', 'kaiming'):
            raise ValueError("Unsupported init_method")

        def make_head(widths, n_out, positive):
            # Each hidden block is Linear -> activation -> LayerNorm -> Dropout,
            # followed by one output Linear (and Softplus when `positive`).
            stack = []
            fan_in = input_dim
            for width in widths:
                stack.extend((
                    nn.Linear(fan_in, width),
                    activation(),
                    nn.LayerNorm(width),
                    nn.Dropout(dropout_rate),
                ))
                fan_in = width
            stack.append(nn.Linear(fan_in, n_out))
            if positive:
                stack.append(nn.Softplus())
            return nn.Sequential(*stack)

        # The direction head is registered first, then the scale head.
        self.dir_head = make_head(dir_hidden_dims, output_dim, positive=False)
        self.scale_head = make_head(scale_hidden_dims, 1, positive=True)

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1/0.07))

        # Re-initialise every submodule with the selected scheme.
        self.apply(self.init_weights)

    def init_weights(self, module):
        """Initialise one submodule; meant to be passed to ``nn.Module.apply``.

        LayerNorm gets weight 1 and bias 0. Linear layers get Kaiming-uniform
        (ReLU gain) or Xavier-uniform weights depending on ``self.init_method``
        and a zero bias. Other module types are left untouched.
        """
        if isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
            return

        if not isinstance(module, nn.Linear):
            return

        if self.init_method == 'kaiming':
            nn.init.kaiming_uniform_(module.weight, nonlinearity='relu')
        else:
            nn.init.xavier_uniform_(module.weight)

        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, x):
        """Return the unit-norm direction for ``x`` multiplied by its predicted scale."""
        unit_direction = F.normalize(self.dir_head(x), p=2, dim=-1)
        magnitude = self.scale_head(x)
        return unit_direction * magnitude

Contrastive loss, negative-key queue, and training loop

In [None]:
def queue_info_nce_loss(q, k, queue, logit_scale):
    """InfoNCE loss with one positive per row and the queue rows as negatives.

    Args:
        q: Query tensor, one row per sample.
        k: Key tensor with the same shape as ``q``; row ``i`` is the positive
            for ``q[i]``.
        queue: 2-D tensor whose rows are the negative keys.
        logit_scale: Scalar tensor holding the log of the logit multiplier.
            It is clamped to ``[log(0.01), log(100)]`` before exponentiation.

    Returns:
        Mean cross-entropy over the batch, with the positive logit as the
        target class (column 0).
    """
    scale = logit_scale.clamp(min=np.log(0.01), max=np.log(100)).exp()

    positive = (q * k).sum(dim=1, keepdim=True)   # (batch, 1)
    negatives = torch.matmul(q, queue.T)          # (batch, queue_size)

    logits = torch.cat((positive, negatives), dim=1) * scale

    # The positive logit sits in column 0 of every row.
    target = logits.new_zeros(logits.size(0), dtype=torch.long)
    return F.cross_entropy(logits, target)

@torch.no_grad()
def enqueue(queue, keys, queue_ptr):
    """Write ``keys`` into the ring buffer ``queue`` in place and advance ``queue_ptr``.

    Rows are written starting at ``queue_ptr[0]`` and wrap around to the start
    of the queue when the end is reached. If the batch holds more rows than the
    queue, only the newest ``queue_size`` rows are stored, at the positions a
    row-by-row insertion would have left them in. Nothing happens for an empty
    batch or a zero-length queue.

    Args:
        queue: 2-D tensor of shape ``(queue_size, dim)``; modified in place.
        keys: 2-D tensor of shape ``(batch_size, dim)`` to insert.
        queue_ptr: 1-element integer tensor holding the next insert index;
            updated in place to ``(ptr + batch_size) % queue_size``.
    """
    batch_size = keys.shape[0]
    queue_size = queue.shape[0]
    if batch_size == 0 or queue_size == 0:
        return

    ptr = int(queue_ptr[0])     # current insert index

    # Only the newest `queue_size` keys can survive a full pass over the ring,
    # so drop the older ones up front and shift the start index accordingly.
    n_keep = min(batch_size, queue_size)
    newest = keys[batch_size - n_keep:]
    start = (ptr + batch_size - n_keep) % queue_size

    tail_room = queue_size - start
    if n_keep <= tail_room:                 # fits without wrapping
        queue[start:start + n_keep, :] = newest
    else:                                   # fill the tail, then wrap to the head
        queue[start:, :] = newest[:tail_room, :]
        queue[:n_keep - tail_room, :] = newest[tail_room:, :]

    queue_ptr[0] = (ptr + batch_size) % queue_size # update pointer


def train_model(model, save_path, train_dataset, val_dataset, batch_size, epochs, lr, patience, queue_size, weight_decay):
    """Train ``model`` with the queue-based InfoNCE loss, early-stopping on validation MRR.

    Each epoch runs one pass over ``train_dataset``, computes the same loss on
    ``val_dataset``, evaluates retrieval metrics with ``test`` and steps a
    ``ReduceLROnPlateau`` scheduler on the MRR. The state dict is saved to
    ``save_path`` whenever the MRR improves; training stops after ``patience``
    consecutive epochs without improvement.

    Args:
        model: Module with a ``logit_scale`` attribute; moved to CUDA when available.
        save_path: File path for the best checkpoint (parent dirs are created).
        train_dataset: Dataset yielding ``(text, image_embedding)`` pairs.
        val_dataset: Dataset with the same item structure, used for validation.
        batch_size: Batch size of both data loaders.
        epochs: Maximum number of epochs.
        lr: AdamW learning rate.
        patience: Early-stopping patience, in epochs without MRR improvement.
        queue_size: Number of rows in the negative-key queue.
        weight_decay: AdamW weight decay.

    Returns:
        The model in its last-epoch state. The best weights are the ones saved
        at ``save_path``, not necessarily the ones returned.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # The LR is reduced when the validation MRR (higher is better) plateaus.
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.7,
        patience=3,
        threshold=0.001,
        min_lr=1e-6
    )

    # The queue is allocated on the first batch so its width follows the key
    # embeddings instead of a hard-coded dimension. It starts zero-filled.
    queue = None
    queue_ptr = torch.zeros(1, dtype=torch.long, device=device)

    best_mrr = -float('inf')
    no_improvements = 0

    for epoch in range(1, epochs+1):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}", leave=False)

        for text_batch, image_emb_batch in progress_bar:
            text_batch, image_emb_batch = text_batch.to(device), image_emb_batch.to(device)

            optimizer.zero_grad()

            q = model(text_batch)
            k = image_emb_batch

            if queue is None:
                queue = torch.zeros(queue_size, k.shape[1], device=device)

            loss = queue_info_nce_loss(q, k, queue, model.logit_scale)
            loss.backward()
            optimizer.step()

            # Keys are enqueued only after the step, so a batch is never its
            # own negative within the same iteration.
            with torch.no_grad():
                enqueue(queue, k, queue_ptr)

            running_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_train_loss = running_loss / len(train_loader)

        # Validation loss uses the queue as filled by training; it is not updated here.
        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for text_batch, image_emb_batch in val_loader:
                text_batch, image_emb_batch = text_batch.to(device), image_emb_batch.to(device)

                q = model(text_batch)
                k = image_emb_batch

                loss = queue_info_nce_loss(q, k, queue, model.logit_scale)
                running_val_loss += loss.item()
        avg_val_loss = running_val_loss / len(val_loader)

        # `test` returns a mapping that includes "mrr" and "recall_at_1".
        results = test(val_dataset, model, device)
        mrr = results["mrr"]

        scheduler.step(mrr)

        print(f"Epoch {epoch:03d} | Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f} | "
              f"MRR: {mrr:.6f} | Recall-1: {results['recall_at_1']:.6f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

        if mrr > best_mrr:
            best_mrr = mrr
            no_improvements = 0

            # Save best model
            Path(save_path).parent.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), save_path)
            print(f"Saved new best model (MRR={mrr:.6f})")

        else:
            no_improvements += 1
            if no_improvements >= patience:
                print("Early stopping triggered based on MRR.")
                break

    print("Training complete")
    return model

Hyperparameters

In [None]:
# Device used by the evaluation and submission cells (CUDA when available).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Input files, given as absolute Kaggle paths.
# NOTE(review): these only resolve inside a Kaggle session with the competition data attached.
data_path= '/kaggle/input/aml-competition/train/train/train.npz'
test_path= '/kaggle/input/aml-competition/test/test/test.clean.npz'

# Where train_model writes the best checkpoint and the testing cell reads it back.
save_path = './models/best_model.pth'

# Model architecture (forwarded to SpaceTranslator through model_args).
input_dim = 1024
output_dim = 1536
dir_hidden_dims = [2048]
scale_hidden_dims = [512]
dropout_rate = 0.388
# Optimisation settings passed to train_model.
batch_size = 256
lr = 0.00017
epochs = 250
patience = 5  # early-stopping patience, in epochs without MRR improvement

queue_size = 32768  # number of rows in the negative-key queue
weight_decay = 6e-06

# Keyword arguments for SpaceTranslator(**model_args).
model_args = {
    'input_dim': input_dim,
    'output_dim': output_dim,
    'dir_hidden_dims': dir_hidden_dims,
    'scale_hidden_dims': scale_hidden_dims,
    'dropout_rate': dropout_rate,
    'activation': nn.SiLU,
    'init_method': 'xavier'
}


In [None]:
# Load the training arrays from data_path.
# NOTE(review): presumably x holds the text embeddings and y the image embeddings — confirm in dataset.get_data.
x, y = get_data(data_path)

# Build the training and validation datasets used by the cells below.
train_dataset, val_dataset = get_datasets(x, y)

Training

In [None]:
model = SpaceTranslator(**model_args)

# Training is expensive, so it only runs when no checkpoint exists yet. This keeps
# "Restart & Run All" working on a fresh machine while still reusing a previous run.
# Delete the file at save_path to force retraining.
if Path(save_path).exists():
    print(f'Found checkpoint at {save_path}; skipping training.')
else:
    train_model(model, save_path, train_dataset, val_dataset, batch_size, epochs, lr, patience, queue_size, weight_decay)
    print('Finished training. Now testing using best model...')

Testing

In [None]:
# Load the best checkpoint straight onto the active device, so a checkpoint
# saved on GPU can also be restored on a CPU-only machine.
state = torch.load(save_path, map_location=device)
model.load_state_dict(state)

# The model may be freshly constructed (CPU, training mode) at this point:
# move it to the evaluation device and disable dropout before scoring.
model.to(device)
model.eval()

results = test(val_dataset, model, device)
print("Test Results:", results)

In [None]:
# Produce the submission file "2hmlp_memory-bank.csv" from the test set at test_path.
# NOTE(review): presumably generate_submission runs the model on the test embeddings and
# writes the CSV itself — confirm in eval.generate_submission.
generate_submission(model, Path(test_path), output_file="2hmlp_memory-bank.csv", device=device)

Code to perform hyperparameter optimization using the Optuna library.

In [None]:
import optuna
from optuna.pruners import MedianPruner

def objective(
    trial,
    train_dataset,
    val_dataset,
    epochs: int = 15,
    device=None
):
    """Optuna objective: train one sampled configuration and return its best validation MRR.

    Samples optimiser, batch, queue and architecture hyperparameters from
    ``trial``, trains a ``SpaceTranslator`` with the queue-based InfoNCE loss
    for up to ``epochs`` epochs, and reports the validation MRR to Optuna
    after each epoch so the pruner can stop unpromising trials.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.
        train_dataset: Dataset yielding ``(text, image_embedding)`` pairs.
        val_dataset: Dataset passed to ``test`` for the MRR evaluation.
        epochs: Maximum number of training epochs for this trial.
        device: Torch device; defaults to CUDA when available, else CPU.

    Returns:
        The highest validation MRR observed during the trial.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner decides to stop the trial.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Embedding sizes of the text inputs and of the image targets.
    input_dim, output_dim = 1024, 1536

    # Optimizer params
    lr = trial.suggest_float("lr", 5e-5, 5e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

    # Training params
    batch_size = trial.suggest_categorical("batch_size", [32, 256, 512])

    # MoCo queue
    queue_size = trial.suggest_categorical("queue_size", [8192, 12288, 16384, 24576, 32768])

    # Model params
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    # Every sampled name must map to a module class; "relu" used to be missing
    # here, so trials that drew it failed with a KeyError.
    activations = {
        "relu": nn.ReLU,
        "gelu": nn.GELU,
        "silu": nn.SiLU,
    }
    activation = activations[trial.suggest_categorical("activation", ["relu", "gelu", "silu"])]
    init_method = trial.suggest_categorical("init_method", ["xavier", "kaiming"])

    # DIR head
    n_layers_dir = trial.suggest_int("n_layers_dir", 1, 3)
    dir_hidden_dims = [
        trial.suggest_categorical(f"dir_dim_{i}", [1024, 1472, 1856, 2048])
        for i in range(n_layers_dir)
    ]

    # SCALE head
    n_layers_scale = trial.suggest_int("n_layers_scale", 1, 3)
    scale_hidden_dims = [
        trial.suggest_categorical(f"scale_dim_{i}", [256, 512, 768, 1024, 1280, 1472, 1856, 2048])
        for i in range(n_layers_scale)
    ]

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model_args = {
        "input_dim": input_dim,
        "output_dim": output_dim,
        "dir_hidden_dims": dir_hidden_dims,
        "scale_hidden_dims": scale_hidden_dims,
        "dropout_rate": dropout_rate,
        # Use the sampled activation (previously hard-coded to nn.GELU, which
        # made the "activation" hyperparameter a no-op).
        "activation": activation,
        "init_method": init_method,
    }
    model = SpaceTranslator(**model_args).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # The LR is reduced when the validation MRR (higher is better) plateaus.
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode="max",
        factor=0.75,
        patience=4,
        threshold=1e-4,
        min_lr=1e-6,
    )

    # Zero-initialised ring buffer of negative keys and its insert pointer.
    queue = torch.zeros(queue_size, output_dim, device=device)
    queue_ptr = torch.zeros(1, dtype=torch.long, device=device)

    best_mrr = -float("inf")

    # ------------------------------
    # TRAINING LOOP
    # ------------------------------
    for epoch in range(1, epochs + 1):
        model.train()

        for text_batch, image_emb_batch in train_loader:
            text_batch = text_batch.to(device)
            image_emb_batch = image_emb_batch.to(device)

            optimizer.zero_grad()

            q = model(text_batch)
            k = image_emb_batch
            loss = queue_info_nce_loss(q, k, queue, model.logit_scale)

            loss.backward()
            optimizer.step()

            # Keys join the queue only after the optimisation step.
            with torch.no_grad():
                enqueue(queue, k, queue_ptr)

        model.eval()
        with torch.no_grad():
            results = test(val_dataset, model, device)
        mrr = results["mrr"]

        scheduler.step(mrr)

        # Report to Optuna
        trial.report(mrr, epoch)

        if mrr > best_mrr:
            best_mrr = mrr

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    print(f"Trial {trial.number} finished — Best MRR: {best_mrr:.6f}")
    return best_mrr   # the study maximises this value


def run_optuna_search(
    data_path: Path,
    n_trials: int = 150,
    epochs: int = 30,
    n_jobs: int = 1,
    sampler=None,
    pruner=None
):
    """Run an Optuna study that maximises the MRR returned by ``objective``.

    Args:
        data_path: Location of the training data handed to ``get_data``.
        n_trials: Number of trials to run.
        epochs: Epoch budget forwarded to every trial.
        n_jobs: Number of parallel jobs used by ``study.optimize``.
        sampler: Optional Optuna sampler; ``None`` lets Optuna pick its default.
        pruner: Optional Optuna pruner; defaults to
            ``MedianPruner(n_startup_trials=5, n_warmup_steps=3)``.

    Returns:
        The finished ``optuna`` study.
    """
    pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=3) if pruner is None else pruner

    features, targets = get_data(data_path)
    train_dataset, val_dataset = get_datasets(features, targets)

    def run_trial(trial):
        # Bind the datasets and epoch budget; Optuna supplies only the trial.
        return objective(
            trial,
            train_dataset=train_dataset,
            val_dataset=val_dataset,
            epochs=epochs
        )

    # MAXIMIZE MRR
    study = optuna.create_study(
        direction="maximize",
        pruner=pruner,
        sampler=sampler
    )
    study.optimize(run_trial, n_trials=n_trials, n_jobs=n_jobs)

    best = study.best_trial
    print("\n==== OPTUNA FINISHED ====")
    print("Trials:", len(study.trials))
    print("Best trial:")
    print(f"  Best MRR: {best.value:.6f}")
    print("  Params:")
    for name, value in best.params.items():
        print(f"    {name}: {value}")

    return study

In [None]:
#study = run_optuna_search(data_path=data_path, n_trials=100, epochs=35, n_jobs=1)
#study.trials_dataframe().to_csv("optuna_trials.csv", index=False)

#print("Best params:", study.best_params)
#print("Best trial number:", study.best_trial.number)