In [None]:
import random
import itertools
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Seed every random number generator the notebook relies on
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with `seed`.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode so repeated runs pick the same kernels.
    """
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seeder(seed)

    # Deterministic cuDNN kernels, no auto-tuning
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)  # Use one fixed seed for the whole notebook

In [None]:
# Name of the LLM; interpolated into the data and embedding file names below.
llm_name = "gpt-4o"
# Identifier of the embedding model; its "/" is replaced by "-" when building
# embedding file names further down.
text_model_name = 'M-CLIP/LABSE-Vit-L-14'

# Root folder of the task data (presumably a Google Drive mount in Colab — confirm).
main_folder = "/content/drive/MyDrive/SemEval2025/task1/"

# NOTE: main_folder already ends with "/", so the leading "/" in the suffixes
# below produces a doubled separator ("task1//dataset results ...") in every
# derived path.
train_folder = main_folder + "/dataset results gpt4-4o/AdMIRe Subtask A Train/"
train_df_file_path = train_folder + f"subtask_a_train_{llm_name}_meanings.tsv"

dev_folder = main_folder + "/dataset results gpt4-4o/AdMIRe Subtask A Dev/"
dev_df_file_path = dev_folder + f"subtask_a_dev_{llm_name}_meanings.tsv"

In [None]:
# ================================
# 1. Prepare the Dataset
# ================================
def prepare_dataset(df, text_embeddings, img_embeddings, aug_img_embeddings, cap_embeddings, bt_cap_embeddings, pr_cap_embeddings, num_soft_negs=5):
    """Build (anchor, positives, negatives) arrays for contrastive training.

    For every row of `df` the anchor is the row's text embedding, the positive
    is the candidate ranked first in `expected_order`, and the negatives are:

    * hard negatives: the other 4 candidates of the same row, followed by
    * soft negatives: all 5 candidates of `num_soft_negs` distinct other rows,
      drawn with the global `random` module.

    The same candidate/row selection is applied to each of the five modality
    arrays, in the order (img, aug_img, cap, bt_cap, pr_cap).

    Args:
        df: DataFrame with columns `image1_name` .. `image5_name` and
            `expected_order` (string holding a Python list literal of names).
        text_embeddings: array indexed as `[row, :]`.
        img_embeddings, aug_img_embeddings, cap_embeddings, bt_cap_embeddings,
        pr_cap_embeddings: arrays indexed as `[row, candidate, :]`, aligned
            positionally with the rows of `df`.
        num_soft_negs: number of other rows to sample soft negatives from.

    Returns:
        Tuple `(anchors, positives, negatives)` of NumPy arrays; one entry per
        row, with positives/negatives holding one item per modality. Each
        negatives item stacks `4 + 5 * num_soft_negs` vectors.

    Raises:
        ValueError: if `num_soft_negs` exceeds the number of other rows
            available (the sampling loop could otherwise never finish), or if
            `expected_order` is not a literal / names an unknown image.
    """
    import ast  # local import: only needed to parse `expected_order` safely

    num_candidates = 5
    n_rows = len(df)

    # Distinct "other" rows are sampled by rejection; without this guard the
    # loop below would spin forever when there are not enough rows.
    if n_rows and num_soft_negs > n_rows - 1:
        raise ValueError(
            f"num_soft_negs={num_soft_negs} exceeds the {n_rows - 1} other rows available"
        )

    modalities = (img_embeddings, aug_img_embeddings, cap_embeddings, bt_cap_embeddings, pr_cap_embeddings)

    anc_embeddings = []
    pos_embeddings = []
    neg_embeddings = []

    # Iterate by position: the embedding arrays are aligned with row order,
    # not with whatever index labels the DataFrame happens to carry.
    for i, (_, row) in enumerate(df.iterrows()):
        # literal_eval only accepts Python literals, unlike eval
        ground_truth = ast.literal_eval(row['expected_order'])
        img_names = [row[f"image{k}_name"] for k in range(1, num_candidates + 1)]
        sorted_indices = [img_names.index(gt) for gt in ground_truth]
        pos_idx = sorted_indices[0]

        # Hard negatives: remaining candidates of the same row
        hard_idx = [k for k in range(num_candidates) if k != pos_idx]

        # Soft negatives: distinct rows other than the current one
        soft_rows = []
        while len(soft_rows) < num_soft_negs:
            soft_neg_row = random.randint(0, n_rows - 1)
            if soft_neg_row != i and soft_neg_row not in soft_rows:
                soft_rows.append(soft_neg_row)

        positives = [emb[i, pos_idx, :] for emb in modalities]

        # Per modality: hard negatives first, then soft negatives row by row
        negatives = [
            np.vstack(
                [emb[i, k, :] for k in hard_idx]
                + [emb[r, k, :] for r in soft_rows for k in range(num_candidates)]
            )
            for emb in modalities
        ]

        anc_embeddings.append(text_embeddings[i, :])
        pos_embeddings.append(positives)
        neg_embeddings.append(negatives)

    return np.array(anc_embeddings), np.array(pos_embeddings), np.array(neg_embeddings)


# ================================
# 2. Dataset Class
# ================================
class ContrastiveDataset(Dataset):
    """Dataset yielding (anchor, positives, negatives) as float32 tensors.

    The three containers are indexed by sample; the positives and negatives
    of one sample are iterated to produce one tensor per modality.
    """

    def __init__(self, anc_embeddings, pos_embeddings, neg_embeddings):
        self.anc_embeddings = anc_embeddings
        self.pos_embeddings = pos_embeddings
        self.neg_embeddings = neg_embeddings

    def __len__(self):
        # One sample per anchor
        return len(self.anc_embeddings)

    @staticmethod
    def _as_float_tensor(values):
        """Copy `values` into a new float32 tensor."""
        return torch.tensor(values, dtype=torch.float32)

    def __getitem__(self, idx):
        convert = self._as_float_tensor

        anchor = convert(self.anc_embeddings[idx])
        # One tensor per modality (image, augmented image, caption, ...)
        positives = list(map(convert, self.pos_embeddings[idx]))
        negatives = list(map(convert, self.neg_embeddings[idx]))

        return anchor, positives, negatives


# ================================
# 3. Model Definition
# ================================
class ContrastiveModel(nn.Module):
    """Shared two-layer projection head applied to anchors, positives and negatives.

    Every input goes through the same Linear -> ReLU -> Dropout -> Linear stack
    and is then L2-normalised along its last dimension.
    """

    def __init__(self, embedding_dim, projection_dim=768, dropout_rate=0.1):
        super().__init__()
        head_layers = [
            nn.Linear(embedding_dim, projection_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(projection_dim, projection_dim),
        ]
        self.projection = nn.Sequential(*head_layers)

    def forward(self, anchor, positive, negatives):
        """Project and normalise the anchor and each tensor in `positive` / `negatives`.

        `positive` and `negatives` are iterables of tensors (one per modality);
        the result keeps the same grouping: (tensor, list, list).
        """
        project = self.projection

        # Shared projection head, applied anchor first, then positives, then negatives
        anchor_proj = project(anchor)
        positive_proj = [project(item) for item in positive]
        negative_proj = [project(item) for item in negatives]

        def to_unit(tensor):
            # Scale to unit L2 norm over the feature axis
            return F.normalize(tensor, dim=-1)

        return (
            to_unit(anchor_proj),
            [to_unit(item) for item in positive_proj],
            [to_unit(item) for item in negative_proj],
        )


# ================================
# 4. Loss
# ================================
class InfoNCELoss(nn.Module):
    """InfoNCE loss averaged over several (positive, negatives) modality pairs.

    For each modality the per-sample loss is
    ``-log( exp(s_pos / T) / (exp(s_pos / T) + sum_j exp(s_neg_j / T)) )``
    where ``s`` are cosine similarities to the anchor and ``T`` is the
    temperature. It is evaluated in log-space (log-sum-exp) so that small
    temperatures do not overflow float32 the way explicit ``exp`` calls do.
    """

    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature

    def forward(self, anchor, positive, negatives):
        """Compute the mean InfoNCE loss.

        Args:
            anchor: tensor of shape [Batch, Dim].
            positive: sequence of tensors, each [Batch, Dim] (one per modality).
            negatives: sequence of tensors, each [Batch, num_negatives, Dim],
                paired element-wise with `positive`.

        Returns:
            Scalar tensor: per-modality batch-mean losses summed and divided by
            ``len(positive)``.
        """
        anchor = F.normalize(anchor, p=2, dim=1)  # [Batch, Dim]

        total_loss = 0

        for p, neg in zip(positive, negatives):
            p = F.normalize(p, p=2, dim=1)        # [Batch, Dim]
            neg = F.normalize(neg, p=2, dim=2)    # [Batch, num_negatives, Dim]

            # Cosine similarity of unit vectors is their dot product
            pos_logit = (anchor * p).sum(dim=1) / self.temperature  # [Batch]

            # Similarity of each anchor to its own negatives
            neg_logits = torch.matmul(anchor.unsqueeze(1), neg.permute(0, 2, 1)).squeeze(1)
            neg_logits = neg_logits / self.temperature              # [Batch, num_negatives]

            # -log(exp(pos) / (exp(pos) + sum exp(neg))) == logsumexp([pos, neg]) - pos
            logits = torch.cat([pos_logit.unsqueeze(1), neg_logits], dim=1)
            loss = torch.logsumexp(logits, dim=1) - pos_logit       # [Batch]

            total_loss += loss.mean()

        return total_loss / len(positive)  # Average loss over all modalities


# ================================
# 5. Train the Model
# ================================
class EarlyStopping:
    """Signal a stop after `patience` consecutive epochs without a new best validation loss."""

    def __init__(self, patience=5):
        self.patience = patience
        self.best_val_loss = float('inf')
        self.counter = 0

    def step(self, val_loss):
        """Record `val_loss`; return True when training should stop, else False."""
        improved = val_loss < self.best_val_loss
        if improved:
            # New best: remember it and restart the patience window
            self.best_val_loss, self.counter = val_loss, 0
            return False

        # No improvement: one more strike against the patience budget
        self.counter += 1
        return self.counter >= self.patience


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train `model`, validating and evaluating on the held-out split every epoch.

    Per epoch: one optimisation pass over `train_loader`, one no-grad pass over
    `val_loader`, then `evaluate_model` on the test split. The weights are
    written to 'best_contrastive_model.pth' whenever the mean validation loss
    reaches a new minimum, a metrics row is appended to `results`, and the loop
    ends early when `early_stopping.step` returns True.

    Besides its arguments, this function reads these notebook-level names:
    `evaluate_model`, `df_test`, `text_test`, `img_test`, `cap_test`,
    `results`, `batch_size`, `lr`, `num_soft_negs`, `temperature`,
    `dropout_rate` and `early_stopping`. They must exist before it is called.

    Args:
        model: module called as `model(anchor, positive, negatives)`.
        train_loader, val_loader: iterables of `(anchor, positive, negatives)` batches.
        criterion: loss called on the three outputs of `model`.
        optimizer: optimizer over `model`'s parameters.
        num_epochs: maximum number of epochs.
    """
    best_val_loss = float('inf')

    def _fmt(metric):
        # evaluate_model returns None for metrics when no labels are available
        return "n/a" if metric is None else f"{metric:.4f}"

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        # Training Phase
        for anchor, positive, negatives in train_loader:
            optimizer.zero_grad()
            anchor_out, positive_out, negative_outs = model(anchor, positive, negatives)
            loss = criterion(anchor_out, positive_out, negative_outs)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation Phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for anchor, positive, negatives in val_loader:
                anchor_out, positive_out, negative_outs = model(anchor, positive, negatives)
                loss = criterion(anchor_out, positive_out, negative_outs)
                val_loss += loss.item()

        # Mean loss per batch
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        top1_acc, avg_corr = evaluate_model(df_test, text_test, img_test, cap_test, model)

        # The losses are already per-batch means at this point; print them
        # as-is (dividing by the loader length again would under-report them).
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Test Acc: {_fmt(top1_acc)}, Test Corr: {_fmt(avg_corr)}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_contrastive_model.pth')
            print("✅ Model saved!")

        results.append({
            'epoch': epoch + 1,
            'batch_size': batch_size,
            'learning_rate': lr,
            'num_soft_negs': num_soft_negs,
            'temperature': temperature,
            'dropout_rate': dropout_rate,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'test_accuracy': top1_acc,
            'test_corr': avg_corr
        })

        if early_stopping.step(val_loss):
            print("Early stopping triggered!")
            break


# Function to predict the most similar images
def predict(anchor_embedding, candidate_embeddings, model):
    """
    Score each candidate embedding against the anchor embedding.

    Both inputs are converted to float32 tensors, passed through
    `model.projection` without gradient tracking, and compared by cosine
    similarity. Returns the scores as a plain Python list, one per candidate.
    """
    def as_float(values):
        return torch.tensor(values, dtype=torch.float32)

    query = as_float(anchor_embedding).unsqueeze(0)   # (1, embedding_dim)
    pool = as_float(candidate_embeddings)             # (num_candidates, embedding_dim)

    with torch.no_grad():
        query_proj = model.projection(query)
        pool_proj = model.projection(pool)

    # The single query row broadcasts against every candidate row
    scores = F.cosine_similarity(query_proj, pool_proj)
    return scores.tolist()


# Function to evaluate the model on the test set
def evaluate_model(df_test, text_test, img_test, cap_test, model):
    """
    Rank the 5 candidates of every row and score the ranking against the labels.

    For each row the candidates are scored by the sum of two `predict` outputs:
    text-vs-image similarities and text-vs-caption similarities. Candidates are
    ordered by decreasing score.

    Metrics are computed only when no row of `df_test["expected_order"]` is
    missing:

    * top-1 accuracy: fraction of rows whose best-scored image equals the first
      entry of `expected_order`;
    * Spearman correlation, averaged over rows, between the predicted rank
      position of each image and its position in `expected_order`.

    Args:
        df_test: DataFrame with `image1_name` .. `image5_name` and
            `expected_order` (string holding a Python list literal of names).
        text_test: array indexed as `[row, :]`.
        img_test, cap_test: arrays indexed as `[row, candidate, :]`, aligned
            positionally with the rows of `df_test`.
        model: object with `eval()`; forwarded to `predict`.

    Returns:
        `(top1_accuracy, avg_spearman_corr)`, or `(None, None)` when no row was
        scored.
    """
    import ast  # local import: only needed to parse `expected_order` safely

    model.eval()
    counts = []
    correlations = []

    # All-or-nothing: a single missing label disables scoring for every row.
    has_labels = not df_test["expected_order"].isna().any()

    # Iterate by position: the embedding arrays follow row order, not index labels.
    for pos, (_, row) in enumerate(df_test.iterrows()):
        img_names = [row[f"image{k}_name"] for k in range(1, 6)]

        anchor_embedding = text_test[pos, :]
        text_image_similarities = predict(anchor_embedding, img_test[pos, :, :], model)
        text_cap_similarities = predict(anchor_embedding, cap_test[pos, :, :], model)

        similarities = [a + b for a, b in zip(text_image_similarities, text_cap_similarities)]

        # Highest combined similarity first
        sorted_indices = np.argsort(similarities)[::-1]
        sorted_img_names = [img_names[k] for k in sorted_indices]

        if has_labels:
            # literal_eval only accepts Python literals, unlike eval
            ground_truth = ast.literal_eval(row['expected_order'])

            # Top-1 accuracy
            counts.append(1 if sorted_img_names[0] == ground_truth[0] else 0)

            # Spearman must compare rank positions. Feeding the name strings
            # themselves would rank them alphabetically, which is a different
            # quantity from the agreement between the two orderings.
            predicted_ranks = [sorted_img_names.index(name) for name in ground_truth]
            true_ranks = list(range(len(ground_truth)))
            correlation, _ = spearmanr(predicted_ranks, true_ranks)
            correlations.append(correlation)

    # Compute overall metrics
    if counts:
        top1_accuracy = sum(counts) / len(counts)
        avg_spearman_corr = sum(correlations) / len(correlations)
    else:
        top1_accuracy = None
        avg_spearman_corr = None

    return top1_accuracy, avg_spearman_corr


# ================================
# 6. Main Pipeline
# ================================

# Read the tab-separated training table
train_df = pd.read_csv(train_df_file_path, sep='\t')

# Read the pre-computed embedding arrays.
# The first three file names embed the LLM name and the embedding-model name
# (with "/" replaced by "-"); the augmentation files use a fixed "gpt-3.5" tag.
_embedding_tag = f"{llm_name}_{text_model_name.replace('/','-')}"
text_embeddings = np.load(f"{train_folder}training_text_embeddings_{_embedding_tag}.npy")
img_embeddings = np.load(f"{train_folder}training_img_embeddings_{_embedding_tag}.npy")
cap_embeddings = np.load(f"{train_folder}training_cap_embeddings_{_embedding_tag}.npy")
aug_img_embeddings = np.load(train_folder + "aug_img_embeddings_gpt-3.5.npy")
bt_cap_embeddings = np.load(train_folder + "bt_cap_embeddings_gpt-3.5.npy")
pr_cap_embeddings = np.load(train_folder + "pr_cap_embeddings_gpt-3.5.npy")

# Summarise what was loaded
print("Train file:", train_df_file_path)
print("Size of training dataset:", len(train_df))
for _label, _array in (
    ("Text", text_embeddings),
    ("Image", img_embeddings),
    ("Caption", cap_embeddings),
    ("Aug Image", aug_img_embeddings),
    ("BT Caption", bt_cap_embeddings),
    ("PR Caption", pr_cap_embeddings),
):
    print(f"{_label} embeddings shape:", _array.shape)

# ================================
# Split Train/Validation Set
# ================================
# First, hold out 20% of the rows as the test set. The table and every
# embedding array are split in the same call so they stay aligned row by row.
(
    df_train_val, df_test,
    text_train_val, text_test,
    img_train_val, img_test,
    aug_img_train_val, aug_img_test,
    cap_train_val, cap_test,
    bt_cap_train_val, bt_cap_test,
    pr_cap_train_val, pr_cap_test,
) = train_test_split(
    train_df, text_embeddings, img_embeddings, aug_img_embeddings, cap_embeddings, bt_cap_embeddings, pr_cap_embeddings,
    test_size=0.2, random_state=42
)

# Then carve 10% of the remaining rows out as the validation set
(
    df_train, df_val,
    text_train, text_val,
    img_train, img_val,
    aug_img_train, aug_img_val,
    cap_train, cap_val,
    bt_cap_train, bt_cap_val,
    pr_cap_train, pr_cap_val,
) = train_test_split(
    df_train_val, text_train_val, img_train_val, aug_img_train_val, cap_train_val, bt_cap_train_val, pr_cap_train_val,
    test_size=0.1, random_state=42
)

# Reset the indices so row labels run 0..n-1 within each split
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Print the sizes of each split
print("\nTraining size:", len(df_train))
print("Validation size:", len(df_val))
print("Test size:", len(df_test))


def _report_split(title, frame, text, img, aug_img, cap, bt_cap, pr_cap):
    """Print the row count of one split and the shape of each of its embedding arrays."""
    print(f"\n{title} size:", len(frame))
    print("\tText embeddings size:", text.shape)
    print("\tImage embeddings size:", img.shape)
    print("\tAugmented Image embeddings size:", aug_img.shape)
    print("\tCaption embeddings size:", cap.shape)
    print("\tBT Caption embeddings size:", bt_cap.shape)
    print("\tPR Caption embeddings size:", pr_cap.shape)


# Per-split details; the test split gets its own heading instead of being
# reported under "Validation".
_report_split("Training", df_train, text_train, img_train, aug_img_train, cap_train, bt_cap_train, pr_cap_train)
_report_split("Validation", df_val, text_val, img_val, aug_img_val, cap_val, bt_cap_val, pr_cap_val)
_report_split("Test", df_test, text_test, img_test, aug_img_test, cap_test, bt_cap_test, pr_cap_test)

# Hyper-parameter grid; every list currently holds a single value.
param_grid = {
    'batch_size': [16],
    'learning_rate': [1e-4],
    'num_soft_negs': [10],
    'temperature': [0.08],
    'dropout_rate' : [0.5]
}

param_combinations = itertools.product(param_grid['batch_size'],
                                        param_grid['learning_rate'],
                                        param_grid['num_soft_negs'],
                                        param_grid['temperature'],
                                        param_grid['dropout_rate'])

# One row per (combination, epoch); filled in by train_model.
results = []

for batch_size, lr, num_soft_negs, temperature, dropout_rate in param_combinations:

    # Start every configuration from the same RNG state so that negative
    # sampling, weight initialisation and shuffling do not depend on how many
    # configurations (or earlier executions of this cell) ran before it.
    set_seed(42)

    print(batch_size, lr, num_soft_negs, temperature, dropout_rate)

    # Prepare train and validation datasets.
    # NOTE: the validation set uses prepare_dataset's default number of soft
    # negatives, not the grid value. Soft negatives are distinct *other* rows,
    # so the count that can be requested is bounded by the split size.
    anc_train, pos_train, neg_train = prepare_dataset(df_train, text_train, img_train, aug_img_train, cap_train, bt_cap_train, pr_cap_train, num_soft_negs=num_soft_negs)
    anc_val, pos_val, neg_val = prepare_dataset(df_val, text_val, img_val, aug_img_val, cap_val, bt_cap_val, pr_cap_val)

    # ================================
    # DataLoaders
    # ================================
    train_dataset = ContrastiveDataset(anc_train, pos_train, neg_train)
    val_dataset = ContrastiveDataset(anc_val, pos_val, neg_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # ================================
    # Initialize Model, Loss, and Optimizer
    # ================================
    embedding_dim = text_embeddings.shape[1]
    model = ContrastiveModel(embedding_dim, projection_dim=768, dropout_rate=dropout_rate)

    criterion = InfoNCELoss(temperature=temperature)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
    # NOTE(review): this scheduler is created but not handed to train_model,
    # so the learning rate stays constant during training.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)
    # Read by train_model as a notebook-level name.
    early_stopping = EarlyStopping(patience=5)

    train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=500)

    # Convert results to a dataframe
    results_df = pd.DataFrame(results)

    # Save the results to a CSV file (rewritten after every combination)
    results_df.to_csv(main_folder + f'/training_results_{llm_name}_{text_model_name.replace("/", "-")}.csv', index=False)

Train file: /content/drive/MyDrive/SemEval2025/task1//dataset results gpt4-4o/AdMIRe Subtask A Train/subtask_a_train_gpt-4o_meanings.tsv
Size of training dataset: 70
Text embeddings shape: (70, 768)
Image embeddings shape: (70, 5, 768)
Caption embeddings shape: (70, 5, 768)
Aug Image embeddings shape: (70, 5, 768)
BT Caption embeddings shape: (70, 5, 768)
PR Caption embeddings shape: (70, 5, 768)

Training size: 50
Validation size: 6
Test size: 14

Training size: 50
	Text embeddings size: (50, 768)
	Image embeddings size: (50, 5, 768)
	Augmented Image embeddings size: (50, 5, 768)
	Caption embeddings size: (50, 5, 768)
	BT Caption embeddings size: (50, 5, 768)
	PR Caption embeddings size: (50, 5, 768)

Validation size: 6
	Text embeddings size: (6, 768)
	Image embeddings size: (6, 5, 768)
	Augmented Image embeddings size: (6, 5, 768)
	Caption embeddings size: (6, 5, 768)
	BT Caption embeddings size: (6, 5, 768)
	PR Caption embeddings size: (6, 5, 768)

Validation size: 14
	Text embeddin