In [None]:
# --- Setup ---
import os, glob, json, random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed every random number generator in use so repeated runs follow the same path.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

INPUT_ROOT = Path("/kaggle/input")
WORK_ROOT  = Path("/kaggle/working")

# Top-level dataset directories under INPUT_ROOT (change INPUT_ROOT if the data
# lives elsewhere). The listing is sorted, because glob order depends on the
# filesystem, and limited to directories, so the .npz directory lookup that
# scans this list always sees the candidates in the same order.
CANDIDATE_DIRS = sorted(p for p in INPUT_ROOT.glob("*") if p.is_dir())
print("Possible data dirs:", [p.name for p in CANDIDATE_DIRS])

# Look for a candidate directory that holds the train/test .npz archives.
def find_npz_dir():
    """Return the first entry of CANDIDATE_DIRS containing a .npz file.

    Candidates are scanned in list order and searched recursively.

    Returns:
        The first matching directory, or None when no candidate has a .npz
        file anywhere beneath it.
    """
    return next(
        (folder for folder in CANDIDATE_DIRS if list(folder.rglob("*.npz"))),
        None,
    )

# Resolve the dataset directory once; this is None when no candidate directory
# contains a .npz file.
DATA_DIR = find_npz_dir()
print("DATA_DIR:", DATA_DIR)


In [None]:
# --- Locate the train/test .npz files ---
assert DATA_DIR is not None, "No found on dir with .npz in /kaggle/input."

npz_files = sorted(DATA_DIR.rglob("*.npz"))
print("NPZ trovati:")
for p in npz_files:
    print(" -", p.relative_to(DATA_DIR))

# Choose the archives by file name (case-insensitive):
#   * TEST_NPZ  - the last file whose name contains "test";
#   * TRAIN_NPZ - the first file whose name contains "train" or "clean"
#                 and does not contain "test".
TRAIN_NPZ = TEST_NPZ = None
for p in npz_files:
    name = p.name.lower()
    if "test" in name:
        TEST_NPZ = p
    elif TRAIN_NPZ is None and any(tag in name for tag in ("train", "clean")):
        TRAIN_NPZ = p

print("TRAIN_NPZ:", TRAIN_NPZ)
print("TEST_NPZ :", TEST_NPZ)

# --- List every key stored in an archive ---
def inspect_npz(path):
    """Print the key, shape and dtype of each entry stored in a .npz file.

    Args:
        path: location of the archive. Its ``name`` attribute is printed in
            the header line, so a ``pathlib.Path`` is expected.
    """
    with np.load(path) as archive:
        print(f"\n[Inspect] {path.name}")
        for key in archive.files:
            value = archive[key]
            # getattr with a None default keeps the listing going for entries
            # that expose no shape/dtype attribute.
            shape, dtype = (getattr(value, attr, None) for attr in ("shape", "dtype"))
            print(f"  {key:30s}  shape={shape}  dtype={dtype}")

# The train archive is optional at this point; the test archive is mandatory.
if TRAIN_NPZ is None:
    print("No train.npz.")
else:
    inspect_npz(TRAIN_NPZ)

assert TEST_NPZ is not None, "No test .npz (es. test.clean.npz)."
inspect_npz(TEST_NPZ)


In [None]:
# --- Lookup helper for archives whose key names vary ---
def get_first_available(d, keys):
    """Return ``d[k]`` for the first ``k`` in ``keys`` that is listed in ``d.files``.

    Args:
        d: object exposing a ``files`` collection and item access by key.
        keys: candidate key names, in order of preference.

    Returns:
        The value stored under the first matching key, or None when none of
        the candidates is present.
    """
    present = (name for name in keys if name in d.files)
    match = next(present, None)
    return None if match is None else d[match]

assert TRAIN_NPZ is not None, "No train.npz."

# Open the archive and access it like a dictionary.
with np.load(TRAIN_NPZ) as D:
    # Several spellings are tried for each entry; extend the lists if the
    # archive uses different key names.
    cap_emb = get_first_available(D, ["captions/embeddings", "caption_embeddings", "captions_embeddings", "caps/embeddings"])
    img_emb = get_first_available(D, ["images/embeddings", "image_embeddings", "images_embeddings", "imgs/embeddings"])
    cap_lbl = get_first_available(D, ["captions/label", "captions/labels", "caption_labels", "caps/labels"])
    cap_ids = get_first_available(D, ["captions/ids", "caption_ids", "caps/ids"])
    # Both embedding sets are required; labels and ids are optional.
    assert cap_emb is not None, "No embeddings of captions in train"
    assert img_emb is not None, "No embeddings images in train"

    if cap_lbl is not None:
        # For every caption take the column index of the largest label value
        # (presumably a one-hot row marking the paired image - TODO confirm)
        # and use it to pick that caption's target image embedding.
        target_idx = np.argmax(cap_lbl, axis=1)
        y_emb = img_emb[target_idx]
    elif len(cap_emb) == len(img_emb):
        # Fallback without labels: assume captions and images are aligned 1:1,
        # which is only possible when both arrays have the same length.
        y_emb = img_emb
    else:
        raise RuntimeError(
            "Cannot pair captions with images: no caption labels were found and "
            f"the lengths differ (captions={len(cap_emb)}, images={len(img_emb)})."
        )

X_train_abs = cap_emb
Y_train_abs = y_emb
# Author's note: 125000 captions in total.
print("X_train_abs:", X_train_abs.shape, "| Y_train_abs:", Y_train_abs.shape)
# Remember the chain: caption -> label row -> image index -> image embedding.


In [None]:
# --- Preprocess NO-PAD: no padding; standardization + L2 ---
def standardize_and_norm(arr: np.ndarray, mu=None, sd=None):
    """Standardize each feature column, then L2-normalize each row.

    Args:
        arr: 2-D NumPy array of shape (rows, features).
        mu: optional per-feature mean, broadcastable against ``arr``
            (e.g. shape (1, features)). When None it is computed from ``arr``.
        sd: optional per-feature standard deviation, broadcastable against
            ``arr``. It is used as given, so the caller is responsible for any
            epsilon. When None it is computed from ``arr`` (unbiased estimate)
            with 1e-8 added to avoid division by zero.

    Returns:
        A float32 tensor with the same shape as ``arr`` whose rows have unit
        L2 norm.

    Passing ``mu``/``sd`` lets held-out data be preprocessed with statistics
    computed elsewhere (for instance on the training set). It also makes
    single-row inputs usable, for which the unbiased standard deviation of
    the input itself is NaN.
    """
    t = torch.from_numpy(arr).float()
    if mu is None:
        mu = t.mean(dim=0, keepdim=True)
    if sd is None:
        sd = t.std(dim=0, keepdim=True) + 1e-8
    # Accept NumPy arrays or tensors for the supplied statistics.
    mu = torch.as_tensor(mu, dtype=t.dtype)
    sd = torch.as_tensor(sd, dtype=t.dtype)
    return F.normalize((t - mu) / sd, dim=1)

# Standardize each embedding set with its own per-feature statistics and
# L2-normalize the rows. Shape notes are the author's; TODO confirm against
# the printed shapes.
X_text = standardize_and_norm(X_train_abs)   # text embeddings, (N, 1024)
Y_img  = standardize_and_norm(Y_train_abs)   # paired image targets, (N, 1536)

print("X_text:", X_text.shape, "| Y_img:", Y_img.shape)


In [None]:
# Shuffle captions and their paired image targets with one shared permutation,
# then keep the first 90% of the shuffled rows for training and the remaining
# rows for validation.
N = X_text.size(0)
perm = torch.randperm(N)
X_text = X_text[perm]
Y_img = Y_img[perm]

n_train = int(0.9 * N)
X_tr, Y_tr = X_text[:n_train], Y_img[:n_train]
X_va, Y_va = X_text[n_train:], Y_img[n_train:]

from torch.utils.data import TensorDataset, DataLoader
# Only the training loader reshuffles; the validation order stays fixed.
train_loader = DataLoader(TensorDataset(X_tr, Y_tr), shuffle=True, batch_size=1024)
val_loader   = DataLoader(TensorDataset(X_va, Y_va), shuffle=False, batch_size=1024)

X_tr.shape, X_va.shape, Y_tr.shape, Y_va.shape


In [None]:
class TranslatorProj(nn.Module):
    """Linear projection plus a residual MLP correction, L2-normalized.

    The input is projected from ``in_dim`` to ``out_dim``. A LayerNorm ->
    Linear -> GELU -> Dropout -> Linear -> LayerNorm branch computed from that
    projection is added back to it, and the sum is normalized row-wise so the
    output can be compared by cosine similarity.

    Args:
        in_dim: size of the input features.
        out_dim: size of the output embedding.
        hidden: width of the hidden layer in the residual branch.
        dropout: dropout probability applied after the activation.
    """

    def __init__(self, in_dim=1024, out_dim=1536, hidden=3072, dropout=0.3):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept stable.
        self.proj = nn.Linear(in_features=in_dim, out_features=out_dim)  # learned in_dim -> out_dim map
        self.ln1 = nn.LayerNorm(out_dim)
        self.fc1 = nn.Linear(in_features=out_dim, out_features=hidden)
        self.act = nn.GELU()
        self.drop = nn.Dropout(p=dropout)
        self.fc2 = nn.Linear(in_features=hidden, out_features=out_dim)
        self.ln2 = nn.LayerNorm(out_dim)

    def forward(self, x):
        """Map a (batch, in_dim) tensor to unit-norm (batch, out_dim) rows."""
        base = self.proj(x)
        expanded = self.drop(self.act(self.fc1(self.ln1(base))))
        correction = self.ln2(self.fc2(expanded))
        # Residual sum, then row-wise L2 normalization for cosine comparisons.
        return F.normalize(base + correction, dim=1)

def info_nce_loss(pred, tgt, temperature=0.05):
    """InfoNCE loss where row i of ``pred`` is matched with row i of ``tgt``.

    Both inputs are L2-normalized, their pairwise similarities are divided by
    ``temperature``, and cross-entropy is taken with the diagonal as the
    correct class, so the other rows of ``tgt`` in the batch act as negatives.

    Args:
        pred: (B, D) predicted embeddings.
        tgt: (B, D) target embeddings, aligned row by row with ``pred``.
        temperature: divisor applied to the similarities.

    Returns:
        Scalar tensor holding the mean cross-entropy over the batch.
    """
    queries = F.normalize(pred, dim=1)
    keys = F.normalize(tgt, dim=1)
    similarity = torch.matmul(queries, keys.T) / temperature   # [B, B]
    positives = torch.arange(pred.size(0), device=pred.device)
    return F.cross_entropy(similarity, positives)


def train_model_mlp(epochs=50, lr=1e-3, hidden=3072, dropout=0.3, save_path=WORK_ROOT/"model_proj_res.pth",temperature=0.05, accum=1):
    """Train a TranslatorProj with the InfoNCE loss and return the best model.

    Reads the module-level ``X_tr``/``Y_tr`` tensors, ``train_loader``,
    ``val_loader`` and ``device``. The input/output sizes of the model are
    taken from ``X_tr`` and ``Y_tr``.

    Args:
        epochs: maximum number of epochs (>= 1).
        lr: peak learning rate for AdamW.
        hidden: hidden width of the residual branch.
        dropout: dropout probability inside the model.
        save_path: where the best (lowest validation loss) weights are stored.
        temperature: InfoNCE temperature.
        accum: number of batches whose gradients are accumulated per
            optimizer step (>= 1).

    Returns:
        The model with the best saved weights loaded; if no epoch produced a
        checkpoint, the model as it is after the last epoch.

    Raises:
        ValueError: if ``epochs`` or ``accum`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError(f"epochs must be >= 1, got {epochs}")
    if accum < 1:
        raise ValueError(f"accum must be >= 1, got {accum}")

    # Infer the dimensions from the data instead of hard-coding 1024/1536.
    in_dim, out_dim = X_tr.size(1), Y_tr.size(1)
    model = TranslatorProj(in_dim=in_dim, out_dim=out_dim, hidden=hidden, dropout=dropout).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=3e-4)

    # Scheduler: linear warmup followed by cosine annealing, stepped once per
    # epoch. The warmup lasts 5 epochs but is shortened for short runs so the
    # cosine phase always has T_max >= 1.
    warmup_epochs = min(5, epochs - 1)
    if warmup_epochs > 0:
        warm = torch.optim.lr_scheduler.LinearLR(opt, start_factor=1e-3, total_iters=warmup_epochs)
        cos  = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs - warmup_epochs)
        sched = torch.optim.lr_scheduler.SequentialLR(opt, schedulers=[warm, cos], milestones=[warmup_epochs])
    else:
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)

    # Initialise the linear projection with a ridge-regression solution
    # computed on the training split only, so validation rows never leak into
    # the starting weights.
    with torch.no_grad():
        k = min(100000, X_tr.size(0))
        Xsub = X_tr[:k].cpu().numpy().astype(np.float32)     # (k, in_dim)
        Ysub = Y_tr[:k].cpu().numpy().astype(np.float32)     # (k, out_dim)

        # Normal equations with a small ridge term (lam) for conditioning.
        lam = 1e-6
        XTX = Xsub.T @ Xsub                                  # (in_dim, in_dim)
        XTY = Xsub.T @ Ysub                                  # (in_dim, out_dim)
        XTX_reg = XTX + lam * np.eye(XTX.shape[0], dtype=np.float32)
        W = np.linalg.solve(XTX_reg, XTY)                    # (in_dim, out_dim)

        # nn.Linear stores its weight as (out_dim, in_dim), hence the transpose.
        model.proj.weight.copy_(torch.from_numpy(W.T).to(model.proj.weight.device))
        model.proj.bias.zero_()
    print("proiections starts (normal equations)")

    n_batches = len(train_loader)
    best_val = float('inf'); patience = 8; bad = 0
    saved_any = False
    for ep in range(1, epochs+1):
        model.train(); tr = 0.0
        opt.zero_grad(set_to_none=True)
        for i, (xb, yb) in enumerate(train_loader):
            xb, yb = xb.to(device), yb.to(device)
            out  = model(xb)
            loss = info_nce_loss(out, yb, temperature=temperature) / accum
            loss.backward()
            # Step every `accum` batches, and also on the last batch so the
            # gradients of an incomplete trailing group are not discarded.
            if (i+1) % accum == 0 or (i+1) == n_batches:
                opt.step()
                opt.zero_grad(set_to_none=True)
            tr += loss.item() * accum   # undo the 1/accum scaling for logging
        tr /= n_batches

        model.eval(); va = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                out = model(xb)
                va += info_nce_loss(out, yb, temperature=temperature).item()
        va /= len(val_loader)

        sched.step()
        print(f"Epoch {ep:02d} | lr {sched.get_last_lr()[0]:.2e} | train {tr:.5f} | val {va:.5f}")
        # Save the weights whenever validation improves by more than 1e-4;
        # otherwise count towards early stopping.
        if va < best_val - 1e-4:
            best_val = va; bad = 0
            torch.save(model.state_dict(), save_path)
            saved_any = True
            print(f"  ✓ Saved best to {save_path} (val={va:.5f})")
        else:
            bad += 1
            if bad >= patience:
                print(f"Early stop (no val improve {patience} epochs). Best val={best_val:.5f}")
                break

    # Only reload a checkpoint written by this run; a file left over from an
    # earlier run could hold unrelated (or incompatible) weights.
    if saved_any:
        model.load_state_dict(torch.load(save_path, map_location=device))
    else:
        print("  ! No checkpoint saved in this run; returning last-epoch weights.")
    return model
# Run training. hidden=2048 overrides the function default of 3072, and
# accum=2 accumulates gradients over two batches per optimizer step.
model = train_model_mlp(
    epochs=50, lr=1e-3, hidden=2048, dropout=0.3,
    temperature=0.05, accum=2  # use accum=1 if a batch of 1024 fits in memory
)


In [None]:
# --- Statistics from TRAIN (X_train_abs was loaded in an earlier cell) ---

# Per-feature mean and standard deviation of the raw training TEXT embeddings
# (1024-d according to the author's note).
X_train_t = torch.from_numpy(X_train_abs).float()
mu_text = torch.mean(X_train_t, dim=0, keepdim=True)           # one row, one value per feature
sd_text = torch.std(X_train_t, dim=0, keepdim=True) + 1e-8     # epsilon avoids division by zero

print("Train stats ready:", mu_text.shape, sd_text.shape)


In [None]:
# --- Load TEST + preprocess with the TRAIN statistics ---

# Prefer the known competition path. If the dataset is mounted elsewhere,
# keep the TEST_NPZ discovered earlier instead of failing on a hard-coded
# absolute path.
_default_test_npz = Path("/kaggle/input/aml-competition/test/test/test.clean.npz")
if _default_test_npz.exists():
    TEST_NPZ = _default_test_npz
assert TEST_NPZ is not None and Path(TEST_NPZ).exists(), f"Test archive not found: {TEST_NPZ}"

with np.load(TEST_NPZ) as D:
    # Same key spellings as the train loader, so both archives are read alike.
    test_ids = get_first_available(D, ["captions/ids", "caption_ids", "caps/ids"])
    test_emb = get_first_available(D, ["captions/embeddings", "caption_embeddings", "captions_embeddings", "caps/embeddings"])
assert test_ids is not None, "No caption ids in test"
assert test_emb is not None, "No embeddings of captions in test"

X_test = torch.from_numpy(test_emb).float()          # author's note: (1500, 1024)
assert X_test.shape[1] == mu_text.shape[1], (
    f"Test feature size {X_test.shape[1]} does not match train statistics {mu_text.shape[1]}"
)

# Standardize with the TRAIN mean/std, then L2-normalize each row.
X_test_std = (X_test - mu_text) / sd_text
X_test_std = F.normalize(X_test_std, dim=1)

# Inference: map the text embeddings into the image-embedding space.
model.eval()
with torch.no_grad():
    # A single forward pass; batch this if the test set grows large.
    Yhat = model(X_test_std.to(device)).detach().cpu()   # rows are L2-normalized by the model

assert Yhat.shape[0] == len(test_ids), "Number of predictions differs from number of test ids"
print("Yhat shape:", Yhat.shape, " | ids:", test_ids.shape)


In [None]:
# --- Build the submission CSV with columns (id, embedding) ---
import numpy as np, pandas as pd, json, torch

# Bring the ids and the predictions to NumPy with fixed dtypes.
ids_np = np.asarray(test_ids, dtype=np.int64)
vec_np = Yhat.detach().cpu().numpy().astype(np.float32)

# Apply one shared index to both arrays so rows end up in ascending id order.
order = np.argsort(ids_np)
ids_np, vec_np = ids_np[order], vec_np[order]

# Each embedding is stored as a JSON-encoded list inside a single CSV cell.
emb_list = list(map(lambda row: json.dumps(row.tolist()), vec_np))

df_final = pd.DataFrame({"id": ids_np, "embedding": emb_list})

# Write the file and show a short preview.
sub_csv_path = "/kaggle/working/submission.csv"
df_final.to_csv(sub_csv_path, index=False, quotechar='"', escapechar='\\')
print("Saved:", sub_csv_path, "| shape:", df_final.shape)
print(df_final.head(2))
