#### File to test out Autoencoder and VAE

In [2]:
#!mkdir data
#!gdown 1CVAQDuPOiwm8h9LJ8a_oOs6zOWS6EgkB
#!gdown 1ykZ9fjTxUwdiEwqagoYZiMcD5aG-7rHe
#!unzip -o test.zip -d data
#!unzip -o train.zip -d data
from google.colab import drive
drive.mount('/content/drive')
!git clone https://github.com/Mamiglia/challenge.git
!wget https://raw.githubusercontent.com/tam4x/aml_challenge/refs/heads/main/preprocess_data.py


Mounted at /content/drive
Cloning into 'challenge'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 98 (delta 39), reused 72 (delta 26), pack-reused 0 (from 0)[K
Receiving objects: 100% (98/98), 21.03 MiB | 20.27 MiB/s, done.
Resolving deltas: 100% (39/39), done.
--2025-11-12 15:51:45--  https://raw.githubusercontent.com/tam4x/aml_challenge/refs/heads/main/preprocess_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8066 (7.9K) [text/plain]
Saving to: ‘preprocess_data.py’


2025-11-12 15:51:45 (121 MB/s) - ‘preprocess_data.py’ saved [8066/8066]



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from pathlib import Path
from tqdm import tqdm

from challenge.src.common import load_data, prepare_train_data, generate_submission

#### Create Neural Network Architectures

##### Normal Autoencoder

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TranslatorAE(nn.Module):
    def __init__(self, input_dim, latent_dim, output_dim, hidden_dim=512, n_layers=3, dropout=0.1):
        super().__init__()

        layers = []
        in_dim = input_dim
        for _ in range(n_layers):
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, latent_dim))
        self.encoder = nn.Sequential(*layers)

        layers = []
        in_dim = latent_dim
        for _ in range(n_layers):
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dim

        layers.append(nn.Linear(in_dim, output_dim))
        self.decoder = nn.Sequential(*layers)

        self.skip = nn.Linear(input_dim, output_dim)
        self.output_ln = nn.LayerNorm(output_dim)

    def forward(self, x):
        z = self.encoder(x)
        y_pred = self.decoder(z)
        out = y_pred + self.skip(x)
        return self.output_ln(out)


##### Autoencoder with Residual MLP

In [5]:
class ResidualMLPHead(nn.Module):
    def __init__(self, dim, hidden_dim=512, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim)
        )

    def forward(self, x):
        return x + self.net(x)

class TranslatorRAE(nn.Module):
    def __init__(self, input_dim, latent_dim, output_dim, hidden_dim=512, n_layers=3, dropout=0.1):
        super().__init__()

        layers = []
        in_dim = input_dim
        for _ in range(n_layers):
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, latent_dim))
        self.encoder = nn.Sequential(*layers)

        layers = []
        in_dim = latent_dim
        for _ in range(n_layers):
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dim

        layers.append(nn.Linear(in_dim, output_dim))
        self.decoder = nn.Sequential(*layers)

        self.residual_head = ResidualMLPHead(output_dim, hidden_dim=output_dim//2, dropout=dropout)

    def forward(self, x):
        z = self.encoder(x)
        y_pred = self.decoder(z)
        y_final = self.residual_head(y_pred)
        return y_final


##### VAE

In [6]:
class VAETranslator(nn.Module):
    def __init__(self, input_dim, latent_dim, output_dim, hidden_dims=512, n_layers=3, dropout=0.1):
        super().__init__()

        # Encoder
        layers = []
        in_dim = input_dim
        for _ in range(n_layers):
            layers.append(nn.Linear(in_dim, hidden_dims))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dims
        layers.append(nn.Linear(in_dim, latent_dim * 2))  # output μ and logσ
        self.encoder = nn.Sequential(*layers)

        # Decoder
        layers = []
        in_dim = latent_dim
        for _ in range(n_layers):
            layers.append(nn.Linear(in_dim, hidden_dims))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_dims
        layers.append(nn.Linear(in_dim, output_dim))
        self.decoder = nn.Sequential(*layers)

        # Optional residual refinement
        self.residual_head = ResidualMLPHead(output_dim, hidden_dim=output_dim//2, dropout=dropout)

    def forward(self, x):
        # Encode input → (μ, logσ)
        stats = self.encoder(x)
        mu, log_sigma = stats.chunk(2, dim=-1)
        sigma = torch.exp(log_sigma)

        # Reparameterization trick
        eps = torch.randn_like(sigma)
        z = mu + sigma * eps

        # Decode and refine
        y_base = self.decoder(z)
        y_final = self.residual_head(y_base)
        return y_final, mu, log_sigma

    def kl_loss(self, mu, log_sigma):
        return -0.5 * torch.sum(1 + 2*log_sigma - mu.pow(2) - torch.exp(2*log_sigma), dim=-1).mean()


### Training Loop and NCE Loss aswell as Procrustes Init

In [7]:

class QueueInfoNCELoss(nn.Module):
    def __init__(self, dim, temperature=0.07, queue_size=4096):
        super().__init__()
        self.temperature = temperature
        self.queue_size = queue_size
        # queue shape: (queue_size, dim)
        self.register_buffer("queue", torch.randn(queue_size, dim))
        self.queue = F.normalize(self.queue, dim=1)
        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

    @torch.no_grad()
    def _enqueue(self, keys):
        """
        keys: tensor (B, dim), already detached, normalized, on same device as queue.
        This writes keys into the circular queue. Safe to call only AFTER backward.
        """
        batch_size = keys.shape[0]
        ptr = int(self.queue_ptr.item())
        end_ptr = (ptr + batch_size) % self.queue_size

        if end_ptr > ptr:
            self.queue[ptr:end_ptr] = keys
        else:
            # wrap
            first_len = self.queue_size - ptr
            self.queue[ptr:] = keys[:first_len]
            self.queue[:end_ptr] = keys[first_len:]
        self.queue_ptr[0] = end_ptr

    def forward(self, z_i, z_j):
        """
        Computes loss using current queue as negatives but does NOT modify the queue.
        z_i: (B, dim) predicted (text -> img)
        z_j: (B, dim) target (image)
        """
        # normalize
        z_i = F.normalize(z_i, dim=1)
        z_j = F.normalize(z_j, dim=1)

        # positive logits: (B, 1)
        l_pos = torch.sum(z_i * z_j, dim=-1, keepdim=True)

        # negative logits from queue: (B, queue_size)
        # queue is a buffer; safe to read
        l_neg = torch.matmul(z_i, self.queue.T)

        # logits: (B, 1 + queue_size)
        logits = torch.cat([l_pos, l_neg], dim=1)
        logits /= self.temperature

        labels = torch.zeros(logits.size(0), dtype=torch.long, device=z_i.device)  # positives at index 0

        loss = F.cross_entropy(logits, labels)
        return loss


In [32]:
# ====== Procrustes initialization ======
def procrustes_init(text_embs, img_embs):
    """
    text_embs: (N, d_text)
    img_embs:  (N, d_img)
    returns: weight matrix (d_img, d_text)
    """
    # Center both
    X = text_embs - text_embs.mean(0, keepdim=True)
    Y = img_embs - img_embs.mean(0, keepdim=True)

    # Compute SVD of cross-covariance
    U, _, Vt = torch.linalg.svd(X.T @ Y, full_matrices=False)
    W = U @ Vt  # orthogonal map d_text→d_img
    return W.T   # shape (d_img, d_text) for nn.Linear weight


def apply_procrustes_init_to_final(model, text_sample, img_sample):
    """
    Apply Procrustes initialization to a model.
    - For MLP / ResidualMLP: apply to final Linear layer (hidden -> img_dim)
    - For TransformerTranslator: apply to first projection (text_dim -> img_dim)
    """
    with torch.no_grad():
        # Compute Procrustes matrix
        W = procrustes_init(text_embs=text_sample, img_embs=img_sample)

        # Apply to the appropriate layer
        applied = False
        for name, m in model.named_modules():
            if isinstance(m, nn.Linear):
                # Transformer: apply to first projection (proj_in)
                if isinstance(model, TranslatorAE) and name.endswith("skip"):
                    print(m.weight.shape, W.shape)
                    if m.weight.shape == W.shape:
                        m.weight.copy_(W)
                        applied = True
                        break
                elif isinstance(model, TranslatorRAE) and name.endswith("proj_in"):
                    print(m.weight.shape, W.shape)
                    if m.weight.shape == W.shape:
                        m.weight.copy_(W)
                        applied = True
                        break
                elif isinstance(model, VAETranslator) and name.endswith("proj_in"):
                    print(m.weight.shape, W.shape)
                    if m.weight.shape == W.shape:
                        m.weight.copy_(W)
                        applied = True
                        break

        if not applied:
            print("⚠️ Warning: Could not find matching layer for Procrustes init")
    return model


In [33]:
# ---------- Training loop with Procrustes + InfoNCE ----------
def training(model, train_loader, val_loader, device, epochs, lr, MODEL_PATH,
             use_procrustes_init=True, procrustes_subset=50000, temperature=0.07,
             queue_size=4098):
    """Train LatentSpaceTranslator with optional Procrustes init + InfoNCE loss."""
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=5e-3)
    best_val_loss = float('inf')

    # --- Optional: Procrustes initialization ---
    if use_procrustes_init:
        print("Computing Procrustes initialization...")
        text_list, img_list = [], []
        for i, (X, y) in enumerate(train_loader):
            text_list.append(X.cpu())
            img_list.append(y.cpu())
            if sum(t.shape[0] for t in text_list) >= procrustes_subset:
                break
        text_sample = torch.cat(text_list, dim=0)[:procrustes_subset]
        img_sample = torch.cat(img_list, dim=0)[:procrustes_subset]
        model = apply_procrustes_init_to_final(model, text_sample, img_sample)

    criterion = QueueInfoNCELoss(dim=1536, temperature=temperature, queue_size=queue_size).to(device)
    name = model.__class__.__name__
    # --- Training ---
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()

            loss = 0.0
            if name == "VAETranslator":
                pred, mu, log_sigma = model(X_batch)
                kl_loss = model.kl_loss(mu, log_sigma)
                loss += 1e-4 * kl_loss
            else:
                pred = model(X_batch)
            # Weighted combination of losses
            loss += criterion(pred, y_batch)
            loss += 1 - F.cosine_similarity(
                F.normalize(pred, dim=-1),
                F.normalize(y_batch, dim=-1)
            ).mean()
            loss += 0.1 * F.mse_loss(pred, y_batch)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()

            with torch.no_grad():
              keys = F.normalize(y_batch, dim=1).detach()   # image embeddings (targets) as keys
              # put them into the queue
              criterion._enqueue(keys)

        train_loss /= len(train_loader)

        # --- Validation ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                loss = 0.0
                if name == "VAETranslator":
                    pred, mu, log_sigma = model(X_batch)
                    kl_loss = model.kl_loss(mu, log_sigma)
                    loss += 1e-4 * kl_loss
                else:
                    pred = model(X_batch)
                # Weighted combination of losses
                loss += criterion(pred, y_batch)
                loss += 1 - F.cosine_similarity(
                    F.normalize(pred, dim=-1),
                    F.normalize(y_batch, dim=-1)
                ).mean()
                loss += 0.1 * F.mse_loss(pred, y_batch)

                val_loss += loss.item()

                keys = F.normalize(y_batch, dim=1).detach()   # image embeddings (targets) as keys
                criterion._enqueue(keys)


        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

        # --- Save best model ---
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), MODEL_PATH)
            print(f"  ✓ Saved best model (val_loss={val_loss:.6f})")

    return model

### Load the Data

In [10]:
# 4. Data Augmentation
# 5. Zero Shot Stitching
# 6. Triplet Loss / Improve InfoNCE Loss / bidirectional / SimCLR / MoCo
# 7. Autoencoder
# Configuration
EPOCHS = 100
BATCH_SIZE = 512
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load data
train_data = load_data("drive/MyDrive/data/train/train.npz")
#train_data = load_data('data/train/train.npz')
X, y, label = prepare_train_data(train_data)
DATASET_SIZE = len(X)
# Split train/val
# This is done only to measure generalization capabilities, you don't have to
# use a validation set (though we encourage this)
n_train = int(0.9 * len(X))
TRAIN_SPLIT = torch.zeros(len(X), dtype=torch.bool)
TRAIN_SPLIT[:n_train] = 1
X_train, X_val = X[TRAIN_SPLIT], X[~TRAIN_SPLIT]
y_train, y_val = y[TRAIN_SPLIT], y[~TRAIN_SPLIT]

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
y_train.shape, X_train.shape, train_loader.batch_size, val_loader.batch_size

(125000,)
Train data: 125000 captions, 125000 images


(torch.Size([112500, 1536]), torch.Size([112500, 1024]), 512, 512)

### Hyperparameter Optimization

In [11]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [34]:
import optuna

def objective_extended(arch, trial, train_dataloader, val_dataloader, device, MODEL_PATH_BASE):

    # --- Common hyperparameters ---
    dropout = trial.suggest_float("dropout", 0.1, 0.3)
    #lr = trial.suggest_loguniform("lr", 5e-4, 1e-2)
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-3)

    # --- New hyperparameters ---
    temperature = trial.suggest_float("temperature", 0.01, 0.2)
    queue_size = trial.suggest_categorical("queue_size", [2048, 4098, 8196])
    #w_infonce = trial.suggest_float("w_infonce", 0.6, 0.8)
    #w_cos = trial.suggest_float("w_cos", 0.4, 1.0)
    #w_mse = trial.suggest_float("w_mse", 1.0 - w_cos, 1.0)
    procrustes_subset = 50000

    # --- Architecture-specific hyperparameters ---
    if arch in ["AE", "RAE", "VAE"]:
        hidden_dim = trial.suggest_categorical("hidden_dim", [512, 1024, 2048])
        num_layers = trial.suggest_int("num_layers", 2, 6)
        if arch == "AE":
            latent_dim = trial.suggest_categorical("latent_dim", [1024, 1536, 2048])
            model = TranslatorAE(
                input_dim=1024, latent_dim=latent_dim, output_dim=1536, hidden_dim=hidden_dim,
                n_layers=num_layers, dropout=dropout
            ).to(device)
        elif arch == "RAE":
            latent_dim = trial.suggest_categorical("latent_dim", [1024, 1536, 2048])
            model = TranslatorRAE(
                input_dim=1024, latent_dim=latent_dim, output_dim=1536, hidden_dim=hidden_dim,
                n_layers=num_layers, dropout=dropout
            ).to(device)
        else:
            latent_dim = trial.suggest_categorical("latent_dim", [512, 768, 1024])
            model = VAETranslator(
                input_dim=1024, latent_dim= latent_dim, output_dim=1536, hidden_dims=hidden_dim,
                n_layers=num_layers, dropout=dropout
            ).to(device)

    # --- Apply Procrustes initialization ---
    if procrustes_subset > 0:
        # Get subset from train_loader
        text_list, img_list = [], []
        for i, (X, y) in enumerate(train_loader):
            text_list.append(X.cpu())
            img_list.append(y.cpu())
            if sum(t.shape[0] for t in text_list) >= procrustes_subset:
                break
        text_sample = torch.cat(text_list, dim=0)[:procrustes_subset]
        img_sample = torch.cat(img_list, dim=0)[:procrustes_subset]
        model = apply_procrustes_init_to_final(model, text_sample, img_sample)

    criterion = QueueInfoNCELoss(dim=1536, temperature=temperature, queue_size=queue_size).to(device)
    # --- Training loop (short run) ---
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=weight_decay)
    model.train()
    torch.autograd.set_detect_anomaly(True)
    for epoch in range(5):  # short training
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = 0.0
            if arch == "VAE":
                pred, mu, log_sigma = model(X_batch)
                kl_loss = model.kl_loss(mu, log_sigma)
                loss += 1e-4 * kl_loss
            else:
                pred = model(X_batch)
            # Weighted combination of losses
            loss += criterion(pred, y_batch)
            loss += 1 - F.cosine_similarity(
                F.normalize(pred, dim=-1),
                F.normalize(y_batch, dim=-1)
            ).mean()
            loss += 0.1 * F.mse_loss(pred, y_batch)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            with torch.no_grad():
              keys = F.normalize(y_batch, dim=1).detach()   # image embeddings (targets) as keys
              # put them into the queue
              criterion._enqueue(keys)

    # --- Evaluate on validation ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            loss = 0.0
            if arch == "VAE":
                pred, mu, log_sigma = model(X_batch)
                kl_loss = model.kl_loss(mu, log_sigma)
                loss += 1e-4 * kl_loss
            else:
                pred = model(X_batch)
            # Weighted combination of losses
            loss += criterion(pred, y_batch)
            loss += 1 - F.cosine_similarity(
                F.normalize(pred, dim=-1),
                F.normalize(y_batch, dim=-1)
            ).mean()
            loss += 0.1 * F.mse_loss(pred, y_batch)
            val_loss += loss.item()

            keys = F.normalize(y_batch, dim=1).detach()
            criterion._enqueue(keys)

    val_loss /= len(val_loader)

    return val_loss


def run_optuna_extended(arch, train_dataloader, val_dataloader, device, MODEL_PATH_BASE, n_trials=20):
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective_extended(arch, trial, train_dataloader, val_dataloader, device, MODEL_PATH_BASE),
                   n_trials=n_trials)

    print("Best trial:")
    trial = study.best_trial
    print(f"Val loss: {trial.value}")
    print("Best hyperparameters:")
    for key, value in trial.params.items():
        print(f"  {key}: {value}")

    return trial.params

In [35]:
archs = ['AE', 'RAE', 'VAE']
choosen_arch = archs[0]
best_params = run_optuna_extended(
    arch = choosen_arch,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    device=DEVICE,
    MODEL_PATH_BASE="models/translator_optuna"
)

[I 2025-11-12 18:48:51,390] A new study created in memory with name: no-name-a4cc3219-9c75-437a-b79a-216c1e0477d0
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-3)


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:49:58,774] Trial 0 finished with value: 6.606437664031983 and parameters: {'dropout': 0.10011912045944094, 'weight_decay': 0.00032382093761147067, 'temperature': 0.09045946900103656, 'queue_size': 4098, 'hidden_dim': 512, 'num_layers': 6, 'latent_dim': 2048}. Best is trial 0 with value: 6.606437664031983.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:51:44,251] Trial 1 finished with value: 5.379280509948731 and parameters: {'dropout': 0.2653587185666726, 'weight_decay': 3.336155675814686e-05, 'temperature': 0.04653578867613997, 'queue_size': 4098, 'hidden_dim': 2048, 'num_layers': 6, 'latent_dim': 1024}. Best is trial 1 with value: 5.379280509948731.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:52:30,269] Trial 2 finished with value: 6.400638256072998 and parameters: {'dropout': 0.16638782921074685, 'weight_decay': 0.00017704942855714014, 'temperature': 0.12509587590742538, 'queue_size': 2048, 'hidden_dim': 1024, 'num_layers': 2, 'latent_dim': 1024}. Best is trial 1 with value: 5.379280509948731.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:53:41,340] Trial 3 finished with value: 5.173537311553955 and parameters: {'dropout': 0.15010144494651417, 'weight_decay': 1.1398841566413616e-05, 'temperature': 0.060453491733708134, 'queue_size': 2048, 'hidden_dim': 1024, 'num_layers': 6, 'latent_dim': 2048}. Best is trial 3 with value: 5.173537311553955.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:54:27,299] Trial 4 finished with value: 8.229947166442871 and parameters: {'dropout': 0.21820041702096282, 'weight_decay': 0.0009669510339900876, 'temperature': 0.1716195194589254, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 2, 'latent_dim': 2048}. Best is trial 3 with value: 5.173537311553955.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:56:10,361] Trial 5 finished with value: 6.774692325592041 and parameters: {'dropout': 0.2833100160311791, 'weight_decay': 4.165857702880503e-05, 'temperature': 0.16455641501762616, 'queue_size': 2048, 'hidden_dim': 2048, 'num_layers': 6, 'latent_dim': 1024}. Best is trial 3 with value: 5.173537311553955.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:57:06,324] Trial 6 finished with value: 7.274403743743896 and parameters: {'dropout': 0.16825306433282702, 'weight_decay': 2.637065764289945e-05, 'temperature': 0.138707339551641, 'queue_size': 4098, 'hidden_dim': 512, 'num_layers': 4, 'latent_dim': 1536}. Best is trial 3 with value: 5.173537311553955.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:58:09,251] Trial 7 finished with value: 6.561344184875488 and parameters: {'dropout': 0.19113453060020777, 'weight_decay': 1.241795865443377e-05, 'temperature': 0.1381541280700273, 'queue_size': 2048, 'hidden_dim': 1024, 'num_layers': 5, 'latent_dim': 1024}. Best is trial 3 with value: 5.173537311553955.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 18:59:09,877] Trial 8 finished with value: 6.925979461669922 and parameters: {'dropout': 0.1473139348373667, 'weight_decay': 0.00037022343367573826, 'temperature': 0.07327224915044685, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 5, 'latent_dim': 2048}. Best is trial 3 with value: 5.173537311553955.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:00:05,091] Trial 9 finished with value: 4.976639881134033 and parameters: {'dropout': 0.23308854565786985, 'weight_decay': 0.00017371071242602062, 'temperature': 0.02156883643692692, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 4, 'latent_dim': 1024}. Best is trial 9 with value: 4.976639881134033.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:00:55,243] Trial 10 finished with value: 4.792838516235352 and parameters: {'dropout': 0.23683742652777356, 'weight_decay': 8.469806899815575e-05, 'temperature': 0.013002875636706032, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 3, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:01:45,767] Trial 11 finished with value: 4.80390100479126 and parameters: {'dropout': 0.24057771278452408, 'weight_decay': 9.337826084573294e-05, 'temperature': 0.015958234851807116, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 3, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:02:35,773] Trial 12 finished with value: 4.802303619384766 and parameters: {'dropout': 0.24882502208608342, 'weight_decay': 8.291306558275999e-05, 'temperature': 0.012503188997740566, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 3, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:03:25,287] Trial 13 finished with value: 5.76406795501709 and parameters: {'dropout': 0.29808946298330774, 'weight_decay': 7.928269342021618e-05, 'temperature': 0.03878851222437381, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 3, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:04:15,007] Trial 14 finished with value: 4.810356693267822 and parameters: {'dropout': 0.2536565316341346, 'weight_decay': 5.820384221942449e-05, 'temperature': 0.011848211187552006, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 3, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:05:28,008] Trial 15 finished with value: 8.34574249267578 and parameters: {'dropout': 0.20781904219275782, 'weight_decay': 0.00015532518433213307, 'temperature': 0.19510241719425372, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 3, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:06:23,164] Trial 16 finished with value: 7.160979518890381 and parameters: {'dropout': 0.2688878907597194, 'weight_decay': 1.9720421844141694e-05, 'temperature': 0.08322645759854032, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 4, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:07:07,843] Trial 17 finished with value: 5.832325115203857 and parameters: {'dropout': 0.22958404470729166, 'weight_decay': 5.735091253715288e-05, 'temperature': 0.04042071975621331, 'queue_size': 8196, 'hidden_dim': 512, 'num_layers': 2, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:08:20,569] Trial 18 finished with value: 7.6107992172241214 and parameters: {'dropout': 0.19386887663067237, 'weight_decay': 0.0003602741232686485, 'temperature': 0.11032589588283426, 'queue_size': 8196, 'hidden_dim': 2048, 'num_layers': 3, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


torch.Size([1536, 1024]) torch.Size([1536, 1024])


[I 2025-11-12 19:09:19,122] Trial 19 finished with value: 5.938387470245361 and parameters: {'dropout': 0.24732079297156828, 'weight_decay': 0.00011905210803728367, 'temperature': 0.06278772333742233, 'queue_size': 4098, 'hidden_dim': 1024, 'num_layers': 4, 'latent_dim': 1536}. Best is trial 10 with value: 4.792838516235352.


Best trial:
Val loss: 4.792838516235352
Best hyperparameters:
  dropout: 0.23683742652777356
  weight_decay: 8.469806899815575e-05
  temperature: 0.013002875636706032
  queue_size: 8196
  hidden_dim: 512
  num_layers: 3
  latent_dim: 1536


In [36]:
if choosen_arch == 'AE':
    model = TranslatorAE(
        input_dim=1024,
        latent_dim=best_params['latent_dim'],
        output_dim=1536,
        hidden_dim=best_params['hidden_dim'],
        n_layers=best_params['num_layers'],
        dropout=best_params['dropout']
    ).to(DEVICE)
    MODEL_PATH = "drive/MyDrive/data//models/AE.pth"

elif choosen_arch == 'RAE':
    model = TranslatorRAE(
    input_dim=1024,
    latent_dim=best_params["latent_dim"],
    output_dim=1536,
    hidden_dim=best_params["hidden_dim"],
    n_layers=best_params["num_layers"],
    dropout=best_params["dropout"]).to(DEVICE)
    MODEL_PATH = "drive/MyDrive/data/models/RAE.pth"

else:
    model = VAETranslator(
    input_dim=1024,
    latent_dim=best_params["latent_dim"],
    output_dim=1536,
    hidden_dims=best_params["hidden_dim"],
    n_layers=best_params["num_layers"],
    dropout=best_params["dropout"]).to(DEVICE)
    MODEL_PATH = "drive/MyDrive/data/models/VAE.pth"

In [37]:
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Train
print("\n3. Training...")
model = training(model,
                 train_loader,
                 val_loader,
                 DEVICE,
                 EPOCHS,
                 1e-5,
                 MODEL_PATH,
                 True,
                 10000,
                 best_params["temperature"],
                 best_params["queue_size"])

   Parameters: 5,515,776

3. Training...
Computing Procrustes initialization...
torch.Size([1536, 1024]) torch.Size([1536, 1024])


Epoch 1/100: 100%|██████████| 220/220 [00:09<00:00, 23.15it/s]


Epoch 1: Train Loss = 6.293506, Val Loss = 5.763309
  ✓ Saved best model (val_loss=5.763309)


Epoch 2/100: 100%|██████████| 220/220 [00:09<00:00, 24.04it/s]


Epoch 2: Train Loss = 5.236031, Val Loss = 5.329469
  ✓ Saved best model (val_loss=5.329469)


Epoch 3/100: 100%|██████████| 220/220 [00:08<00:00, 25.19it/s]


Epoch 3: Train Loss = 4.953081, Val Loss = 5.158429
  ✓ Saved best model (val_loss=5.158429)


Epoch 4/100: 100%|██████████| 220/220 [00:09<00:00, 24.02it/s]


Epoch 4: Train Loss = 4.777521, Val Loss = 5.027169
  ✓ Saved best model (val_loss=5.027169)


Epoch 5/100: 100%|██████████| 220/220 [00:09<00:00, 22.11it/s]


Epoch 5: Train Loss = 4.642517, Val Loss = 4.941216
  ✓ Saved best model (val_loss=4.941216)


Epoch 6/100: 100%|██████████| 220/220 [00:08<00:00, 25.38it/s]


Epoch 6: Train Loss = 4.534069, Val Loss = 4.882436
  ✓ Saved best model (val_loss=4.882436)


Epoch 7/100: 100%|██████████| 220/220 [00:09<00:00, 24.00it/s]


Epoch 7: Train Loss = 4.449714, Val Loss = 4.819399
  ✓ Saved best model (val_loss=4.819399)


Epoch 8/100: 100%|██████████| 220/220 [00:09<00:00, 23.18it/s]


Epoch 8: Train Loss = 4.379262, Val Loss = 4.774995
  ✓ Saved best model (val_loss=4.774995)


Epoch 9/100: 100%|██████████| 220/220 [00:09<00:00, 22.58it/s]


Epoch 9: Train Loss = 4.316588, Val Loss = 4.744546
  ✓ Saved best model (val_loss=4.744546)


Epoch 10/100: 100%|██████████| 220/220 [00:08<00:00, 26.13it/s]


Epoch 10: Train Loss = 4.263379, Val Loss = 4.715700
  ✓ Saved best model (val_loss=4.715700)


Epoch 11/100: 100%|██████████| 220/220 [00:09<00:00, 23.30it/s]


Epoch 11: Train Loss = 4.217691, Val Loss = 4.690858
  ✓ Saved best model (val_loss=4.690858)


Epoch 12/100: 100%|██████████| 220/220 [00:09<00:00, 23.32it/s]


Epoch 12: Train Loss = 4.174939, Val Loss = 4.666406
  ✓ Saved best model (val_loss=4.666406)


Epoch 13/100: 100%|██████████| 220/220 [00:09<00:00, 24.08it/s]


Epoch 13: Train Loss = 4.139694, Val Loss = 4.644079
  ✓ Saved best model (val_loss=4.644079)


Epoch 14/100: 100%|██████████| 220/220 [00:08<00:00, 26.44it/s]


Epoch 14: Train Loss = 4.102861, Val Loss = 4.633127
  ✓ Saved best model (val_loss=4.633127)


Epoch 15/100: 100%|██████████| 220/220 [00:09<00:00, 23.28it/s]


Epoch 15: Train Loss = 4.072630, Val Loss = 4.618204
  ✓ Saved best model (val_loss=4.618204)


Epoch 16/100: 100%|██████████| 220/220 [00:09<00:00, 23.97it/s]


Epoch 16: Train Loss = 4.045108, Val Loss = 4.600361
  ✓ Saved best model (val_loss=4.600361)


Epoch 17/100: 100%|██████████| 220/220 [00:08<00:00, 25.95it/s]


Epoch 17: Train Loss = 4.016611, Val Loss = 4.588201
  ✓ Saved best model (val_loss=4.588201)


Epoch 18/100: 100%|██████████| 220/220 [00:09<00:00, 23.70it/s]


Epoch 18: Train Loss = 3.991897, Val Loss = 4.581841
  ✓ Saved best model (val_loss=4.581841)


Epoch 19/100: 100%|██████████| 220/220 [00:09<00:00, 22.79it/s]


Epoch 19: Train Loss = 3.969811, Val Loss = 4.566710
  ✓ Saved best model (val_loss=4.566710)


Epoch 20/100: 100%|██████████| 220/220 [00:09<00:00, 24.13it/s]


Epoch 20: Train Loss = 3.945901, Val Loss = 4.563715
  ✓ Saved best model (val_loss=4.563715)


Epoch 21/100: 100%|██████████| 220/220 [00:08<00:00, 25.41it/s]


Epoch 21: Train Loss = 3.926207, Val Loss = 4.545422
  ✓ Saved best model (val_loss=4.545422)


Epoch 22/100: 100%|██████████| 220/220 [00:09<00:00, 23.88it/s]


Epoch 22: Train Loss = 3.908296, Val Loss = 4.534411
  ✓ Saved best model (val_loss=4.534411)


Epoch 23/100: 100%|██████████| 220/220 [00:09<00:00, 22.59it/s]


Epoch 23: Train Loss = 3.885849, Val Loss = 4.535943


Epoch 24/100: 100%|██████████| 220/220 [00:08<00:00, 26.86it/s]


Epoch 24: Train Loss = 3.868730, Val Loss = 4.528841
  ✓ Saved best model (val_loss=4.528841)


Epoch 25/100: 100%|██████████| 220/220 [00:09<00:00, 24.00it/s]


Epoch 25: Train Loss = 3.850063, Val Loss = 4.521378
  ✓ Saved best model (val_loss=4.521378)


Epoch 26/100: 100%|██████████| 220/220 [00:09<00:00, 22.70it/s]


Epoch 26: Train Loss = 3.835924, Val Loss = 4.514630
  ✓ Saved best model (val_loss=4.514630)


Epoch 27/100: 100%|██████████| 220/220 [00:09<00:00, 23.80it/s]


Epoch 27: Train Loss = 3.818193, Val Loss = 4.506412
  ✓ Saved best model (val_loss=4.506412)


Epoch 28/100: 100%|██████████| 220/220 [00:08<00:00, 26.40it/s]


Epoch 28: Train Loss = 3.803865, Val Loss = 4.499660
  ✓ Saved best model (val_loss=4.499660)


Epoch 29/100: 100%|██████████| 220/220 [00:09<00:00, 23.25it/s]


Epoch 29: Train Loss = 3.788118, Val Loss = 4.499565
  ✓ Saved best model (val_loss=4.499565)


Epoch 30/100: 100%|██████████| 220/220 [00:09<00:00, 22.27it/s]


Epoch 30: Train Loss = 3.775218, Val Loss = 4.490725
  ✓ Saved best model (val_loss=4.490725)


Epoch 31/100: 100%|██████████| 220/220 [00:08<00:00, 26.72it/s]


Epoch 31: Train Loss = 3.761771, Val Loss = 4.485536
  ✓ Saved best model (val_loss=4.485536)


Epoch 32/100: 100%|██████████| 220/220 [00:09<00:00, 23.28it/s]


Epoch 32: Train Loss = 3.746613, Val Loss = 4.481338
  ✓ Saved best model (val_loss=4.481338)


Epoch 33/100: 100%|██████████| 220/220 [00:09<00:00, 23.12it/s]


Epoch 33: Train Loss = 3.732270, Val Loss = 4.479864
  ✓ Saved best model (val_loss=4.479864)


Epoch 34/100: 100%|██████████| 220/220 [00:09<00:00, 22.47it/s]


Epoch 34: Train Loss = 3.719470, Val Loss = 4.473000
  ✓ Saved best model (val_loss=4.473000)


Epoch 35/100: 100%|██████████| 220/220 [00:08<00:00, 26.78it/s]


Epoch 35: Train Loss = 3.708820, Val Loss = 4.472088
  ✓ Saved best model (val_loss=4.472088)


Epoch 36/100: 100%|██████████| 220/220 [00:09<00:00, 22.72it/s]


Epoch 36: Train Loss = 3.695882, Val Loss = 4.466965
  ✓ Saved best model (val_loss=4.466965)


Epoch 37/100: 100%|██████████| 220/220 [00:09<00:00, 23.93it/s]


Epoch 37: Train Loss = 3.682862, Val Loss = 4.461379
  ✓ Saved best model (val_loss=4.461379)


Epoch 38/100: 100%|██████████| 220/220 [00:09<00:00, 24.07it/s]


Epoch 38: Train Loss = 3.674589, Val Loss = 4.461565


Epoch 39/100: 100%|██████████| 220/220 [00:08<00:00, 25.22it/s]


Epoch 39: Train Loss = 3.662640, Val Loss = 4.459600
  ✓ Saved best model (val_loss=4.459600)


Epoch 40/100: 100%|██████████| 220/220 [00:09<00:00, 23.23it/s]


Epoch 40: Train Loss = 3.651554, Val Loss = 4.452475
  ✓ Saved best model (val_loss=4.452475)


Epoch 41/100: 100%|██████████| 220/220 [00:09<00:00, 22.96it/s]


Epoch 41: Train Loss = 3.641567, Val Loss = 4.450024
  ✓ Saved best model (val_loss=4.450024)


Epoch 42/100: 100%|██████████| 220/220 [00:08<00:00, 26.06it/s]


Epoch 42: Train Loss = 3.630697, Val Loss = 4.450016
  ✓ Saved best model (val_loss=4.450016)


Epoch 43/100: 100%|██████████| 220/220 [00:09<00:00, 23.33it/s]


Epoch 43: Train Loss = 3.619734, Val Loss = 4.442074
  ✓ Saved best model (val_loss=4.442074)


Epoch 44/100: 100%|██████████| 220/220 [00:09<00:00, 23.28it/s]


Epoch 44: Train Loss = 3.611053, Val Loss = 4.438085
  ✓ Saved best model (val_loss=4.438085)


Epoch 45/100: 100%|██████████| 220/220 [00:08<00:00, 25.17it/s]


Epoch 45: Train Loss = 3.601037, Val Loss = 4.439276


Epoch 46/100: 100%|██████████| 220/220 [00:09<00:00, 24.31it/s]


Epoch 46: Train Loss = 3.589753, Val Loss = 4.442805


Epoch 47/100: 100%|██████████| 220/220 [00:09<00:00, 23.96it/s]


Epoch 47: Train Loss = 3.580733, Val Loss = 4.430598
  ✓ Saved best model (val_loss=4.430598)


Epoch 48/100: 100%|██████████| 220/220 [00:09<00:00, 23.76it/s]


Epoch 48: Train Loss = 3.571374, Val Loss = 4.433004


Epoch 49/100: 100%|██████████| 220/220 [00:08<00:00, 25.91it/s]


Epoch 49: Train Loss = 3.565524, Val Loss = 4.429566
  ✓ Saved best model (val_loss=4.429566)


Epoch 50/100: 100%|██████████| 220/220 [00:09<00:00, 23.43it/s]


Epoch 50: Train Loss = 3.554340, Val Loss = 4.433814


Epoch 51/100: 100%|██████████| 220/220 [00:09<00:00, 23.19it/s]


Epoch 51: Train Loss = 3.545660, Val Loss = 4.428038
  ✓ Saved best model (val_loss=4.428038)


Epoch 52/100: 100%|██████████| 220/220 [00:08<00:00, 25.90it/s]


Epoch 52: Train Loss = 3.538560, Val Loss = 4.428117


Epoch 53/100: 100%|██████████| 220/220 [00:09<00:00, 22.97it/s]


Epoch 53: Train Loss = 3.530968, Val Loss = 4.426400
  ✓ Saved best model (val_loss=4.426400)


Epoch 54/100: 100%|██████████| 220/220 [00:09<00:00, 23.38it/s]


Epoch 54: Train Loss = 3.521139, Val Loss = 4.421060
  ✓ Saved best model (val_loss=4.421060)


Epoch 55/100: 100%|██████████| 220/220 [00:09<00:00, 23.78it/s]


Epoch 55: Train Loss = 3.514233, Val Loss = 4.420193
  ✓ Saved best model (val_loss=4.420193)


Epoch 56/100: 100%|██████████| 220/220 [00:08<00:00, 25.44it/s]


Epoch 56: Train Loss = 3.504632, Val Loss = 4.417683
  ✓ Saved best model (val_loss=4.417683)


Epoch 57/100: 100%|██████████| 220/220 [00:09<00:00, 23.30it/s]


Epoch 57: Train Loss = 3.498297, Val Loss = 4.418148


Epoch 58/100: 100%|██████████| 220/220 [00:09<00:00, 23.38it/s]


Epoch 58: Train Loss = 3.487796, Val Loss = 4.415196
  ✓ Saved best model (val_loss=4.415196)


Epoch 59/100: 100%|██████████| 220/220 [00:08<00:00, 26.70it/s]


Epoch 59: Train Loss = 3.482982, Val Loss = 4.414967
  ✓ Saved best model (val_loss=4.414967)


Epoch 60/100: 100%|██████████| 220/220 [00:09<00:00, 22.86it/s]


Epoch 60: Train Loss = 3.475713, Val Loss = 4.418942


Epoch 61/100: 100%|██████████| 220/220 [00:09<00:00, 24.17it/s]


Epoch 61: Train Loss = 3.469906, Val Loss = 4.407254
  ✓ Saved best model (val_loss=4.407254)


Epoch 62/100: 100%|██████████| 220/220 [00:09<00:00, 24.36it/s]


Epoch 62: Train Loss = 3.461160, Val Loss = 4.410604


Epoch 63/100: 100%|██████████| 220/220 [00:08<00:00, 25.01it/s]


Epoch 63: Train Loss = 3.454567, Val Loss = 4.407612


Epoch 64/100: 100%|██████████| 220/220 [00:09<00:00, 23.39it/s]


Epoch 64: Train Loss = 3.448523, Val Loss = 4.404198
  ✓ Saved best model (val_loss=4.404198)


Epoch 65/100: 100%|██████████| 220/220 [00:09<00:00, 22.38it/s]


Epoch 65: Train Loss = 3.440342, Val Loss = 4.403581
  ✓ Saved best model (val_loss=4.403581)


Epoch 66/100: 100%|██████████| 220/220 [00:08<00:00, 25.81it/s]


Epoch 66: Train Loss = 3.432615, Val Loss = 4.411491


Epoch 67/100: 100%|██████████| 220/220 [00:09<00:00, 22.46it/s]


Epoch 67: Train Loss = 3.427639, Val Loss = 4.407057


Epoch 68/100: 100%|██████████| 220/220 [00:09<00:00, 23.40it/s]


Epoch 68: Train Loss = 3.420108, Val Loss = 4.404116


Epoch 69/100: 100%|██████████| 220/220 [00:08<00:00, 25.26it/s]


Epoch 69: Train Loss = 3.413754, Val Loss = 4.407080


Epoch 70/100: 100%|██████████| 220/220 [00:08<00:00, 25.61it/s]


Epoch 70: Train Loss = 3.405483, Val Loss = 4.403200
  ✓ Saved best model (val_loss=4.403200)


Epoch 71/100: 100%|██████████| 220/220 [00:09<00:00, 23.90it/s]


Epoch 71: Train Loss = 3.400195, Val Loss = 4.405035


Epoch 72/100: 100%|██████████| 220/220 [00:09<00:00, 23.20it/s]


Epoch 72: Train Loss = 3.393671, Val Loss = 4.393280
  ✓ Saved best model (val_loss=4.393280)


Epoch 73/100: 100%|██████████| 220/220 [00:08<00:00, 26.05it/s]


Epoch 73: Train Loss = 3.387812, Val Loss = 4.397047


Epoch 74/100: 100%|██████████| 220/220 [00:09<00:00, 22.59it/s]


Epoch 74: Train Loss = 3.382617, Val Loss = 4.395593


Epoch 75/100: 100%|██████████| 220/220 [00:09<00:00, 23.33it/s]


Epoch 75: Train Loss = 3.376375, Val Loss = 4.398875


Epoch 76/100: 100%|██████████| 220/220 [00:08<00:00, 26.31it/s]


Epoch 76: Train Loss = 3.368483, Val Loss = 4.389352
  ✓ Saved best model (val_loss=4.389352)


Epoch 77/100: 100%|██████████| 220/220 [00:09<00:00, 23.99it/s]


Epoch 77: Train Loss = 3.363503, Val Loss = 4.399396


Epoch 78/100: 100%|██████████| 220/220 [00:09<00:00, 22.90it/s]


Epoch 78: Train Loss = 3.358511, Val Loss = 4.391198


Epoch 79/100: 100%|██████████| 220/220 [00:09<00:00, 23.77it/s]


Epoch 79: Train Loss = 3.351131, Val Loss = 4.392106


Epoch 80/100: 100%|██████████| 220/220 [00:08<00:00, 26.62it/s]


Epoch 80: Train Loss = 3.347170, Val Loss = 4.397952


Epoch 81/100: 100%|██████████| 220/220 [00:09<00:00, 23.41it/s]


Epoch 81: Train Loss = 3.340952, Val Loss = 4.388542
  ✓ Saved best model (val_loss=4.388542)


Epoch 82/100: 100%|██████████| 220/220 [00:09<00:00, 23.89it/s]


Epoch 82: Train Loss = 3.333778, Val Loss = 4.392952


Epoch 83/100: 100%|██████████| 220/220 [00:08<00:00, 25.10it/s]


Epoch 83: Train Loss = 3.328652, Val Loss = 4.401352


Epoch 84/100: 100%|██████████| 220/220 [00:09<00:00, 24.13it/s]


Epoch 84: Train Loss = 3.325270, Val Loss = 4.392835


Epoch 85/100: 100%|██████████| 220/220 [00:09<00:00, 23.19it/s]


Epoch 85: Train Loss = 3.318302, Val Loss = 4.387938
  ✓ Saved best model (val_loss=4.387938)


Epoch 86/100: 100%|██████████| 220/220 [00:08<00:00, 25.56it/s]


Epoch 86: Train Loss = 3.315341, Val Loss = 4.387193
  ✓ Saved best model (val_loss=4.387193)


Epoch 87/100: 100%|██████████| 220/220 [00:09<00:00, 24.11it/s]


Epoch 87: Train Loss = 3.308742, Val Loss = 4.385248
  ✓ Saved best model (val_loss=4.385248)


Epoch 88/100: 100%|██████████| 220/220 [00:09<00:00, 23.81it/s]


Epoch 88: Train Loss = 3.303177, Val Loss = 4.387883


Epoch 89/100: 100%|██████████| 220/220 [00:09<00:00, 22.64it/s]


Epoch 89: Train Loss = 3.299121, Val Loss = 4.385766


Epoch 90/100: 100%|██████████| 220/220 [00:08<00:00, 26.89it/s]


Epoch 90: Train Loss = 3.291739, Val Loss = 4.389777


Epoch 91/100: 100%|██████████| 220/220 [00:09<00:00, 23.38it/s]


Epoch 91: Train Loss = 3.287430, Val Loss = 4.390826


Epoch 92/100: 100%|██████████| 220/220 [00:09<00:00, 23.96it/s]


Epoch 92: Train Loss = 3.282601, Val Loss = 4.380790
  ✓ Saved best model (val_loss=4.380790)


Epoch 93/100: 100%|██████████| 220/220 [00:08<00:00, 25.28it/s]


Epoch 93: Train Loss = 3.277838, Val Loss = 4.390615


Epoch 94/100: 100%|██████████| 220/220 [00:09<00:00, 23.59it/s]


Epoch 94: Train Loss = 3.270546, Val Loss = 4.390339


Epoch 95/100: 100%|██████████| 220/220 [00:09<00:00, 23.39it/s]


Epoch 95: Train Loss = 3.267782, Val Loss = 4.381950


Epoch 96/100: 100%|██████████| 220/220 [00:08<00:00, 24.59it/s]


Epoch 96: Train Loss = 3.262965, Val Loss = 4.386014


Epoch 97/100: 100%|██████████| 220/220 [00:08<00:00, 25.69it/s]


Epoch 97: Train Loss = 3.256033, Val Loss = 4.393468


Epoch 98/100: 100%|██████████| 220/220 [00:09<00:00, 23.72it/s]


Epoch 98: Train Loss = 3.253084, Val Loss = 4.390576


Epoch 99/100: 100%|██████████| 220/220 [00:09<00:00, 23.31it/s]


Epoch 99: Train Loss = 3.248466, Val Loss = 4.386397


Epoch 100/100: 100%|██████████| 220/220 [00:08<00:00, 26.88it/s]


Epoch 100: Train Loss = 3.243366, Val Loss = 4.383807


In [38]:
model.load_state_dict(torch.load(MODEL_PATH))
test_data = load_data("drive/MyDrive/data/test/test.clean.npz")

test_embds = test_data['captions/embeddings']
test_embds = torch.from_numpy(test_embds).float()

with torch.no_grad():
    if choosen_arch == 'VAE':
      pred_embds, _, _ = model(test_embds.to(DEVICE))
      pred_embds = pred_embds.cpu()
    else:
      pred_embds = model(test_embds.to(DEVICE)).cpu()

submission = generate_submission(test_data['captions/ids'], pred_embds, f'{choosen_arch}_submission.csv')
print(f"Model saved to: {MODEL_PATH}")

Generating submission file...
✓ Saved submission to AE_submission.csv
Model saved to: drive/MyDrive/data//models/AE.pth
