In [46]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
import random
import pandas as pd
import os
import ast
import numpy as np

In [47]:
# Sanity check: print the NumPy version this kernel is running.
import numpy as np
print(np.__version__)

1.26.4


In [48]:
from dotenv import load_dotenv
import os

# Let python-dotenv populate the process environment, then look the
# OpenRouter key up there. `api_key` is None when the variable is not set.
load_dotenv()
api_key = os.environ.get("OPENROUTER_API_KEY")

In [49]:
from openai import OpenAI

# OpenAI SDK client pointed at the OpenRouter endpoint; `api_key` is the
# value read from the environment in the cell above.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key
)

def gloss_text(text):
    """Ask the LLM for a literal paraphrase ("gloss") of an idiomatic sentence.

    The request is sent through the module-level ``client``. This helper is
    best-effort: any exception is reported on stdout and the original
    sentence is returned instead of propagating the error.

    Args:
        text: The sentence to gloss.

    Returns:
        The cleaned gloss, or ``text`` unchanged when the request fails or the
        model returns nothing usable.
    """
    try:
        response = client.chat.completions.create(
            model="deepseek/deepseek-r1-zero:free",
            messages=[
                {"role": "system", "content": "You convert idiomatic or ambiguous English sentences into clear literal gloss sentences."},
                {"role": "user", "content": f"Gloss this sentence: {text}"}
            ]
        )
        # `content` may be None for an empty completion; treat that as "".
        gloss = (response.choices[0].message.content or "").strip()
    except Exception as e:
        print(f"Glossing failed: {e}")
        return text  # fallback to original

    # The model has been observed to answer in the form \boxed{"..."}.
    # Peel off that wrapper so only the sentence itself is used downstream.
    boxed_prefix = "\\boxed{"
    if gloss.startswith(boxed_prefix) and gloss.endswith("}"):
        gloss = gloss[len(boxed_prefix):-1].strip()

    # Drop one pair of matching quotes around the whole answer, if present.
    if len(gloss) >= 2 and gloss[0] == gloss[-1] and gloss[0] in "\"'":
        gloss = gloss[1:-1].strip()

    # An answer that is empty after cleaning is useless as anchor text.
    return gloss or text

In [50]:
# Smoke test: send one idiom through gloss_text and print whatever comes back.
print(gloss_text("It's raining cats and dogs"))

\boxed{"It is raining very heavily."}


In [51]:
class TripletDataset(Dataset):
    """Triplets (sentence, preferred image, less-preferred image) for ranking.

    Each row of ``df`` must provide the columns ``sentence``, ``compound`` and
    ``expected_order``. ``expected_order`` is a string holding a Python list
    literal of image file names; an image that appears earlier in that list is
    used as the positive against every image that appears later, so a ranking
    of ``n`` images contributes ``n * (n - 1) / 2`` triplets.

    Images are looked up under ``<image_root>/<compound>/<file name>``, with
    ``'s`` in the compound replaced by ``_s``.

    Args:
        df: DataFrame with the columns described above.
        image_root: Directory containing one sub-directory per compound.
            Defaults to ``"train"``.
    """

    def __init__(self, df, image_root="train"):
        self.anchor_positive_negative_triplets = []
        # sentence -> gloss. Every triplet from a row shares one sentence and
        # each epoch revisits all of them, so without this cache the remote
        # gloss API would be called once per sample per epoch.
        self._gloss_cache = {}

        for _, row in df.iterrows():
            expected_order = ast.literal_eval(row["expected_order"])
            image_dir = os.path.join(image_root, row["compound"].replace("'s", "_s"))
            image_paths = [os.path.join(image_dir, name) for name in expected_order]
            # Use the actual ranking length instead of assuming exactly 5 images.
            for i in range(len(image_paths)):
                for j in range(i + 1, len(image_paths)):
                    self.anchor_positive_negative_triplets.append(
                        (row["sentence"], image_paths[i], image_paths[j])
                    )

    def __len__(self):
        return len(self.anchor_positive_negative_triplets)

    def __getitem__(self, idx):
        """Return ``(glossed sentence, positive RGB image, negative RGB image)``."""
        sentence, pos_img_path, neg_img_path = self.anchor_positive_negative_triplets[idx]
        pos_img = self._load_rgb(pos_img_path)
        neg_img = self._load_rgb(neg_img_path)

        # Gloss lazily and remember the result (including gloss_text's
        # fallback to the original sentence) so the anchor text for a given
        # sentence stays the same for the lifetime of this dataset object.
        if sentence not in self._gloss_cache:
            self._gloss_cache[sentence] = gloss_text(sentence)
        return (self._gloss_cache[sentence], pos_img, neg_img)

    @staticmethod
    def _load_rgb(path):
        """Open ``path``, convert it to RGB and close the file handle promptly."""
        with Image.open(path) as img:
            return img.convert('RGB')

In [52]:
def triplet_loss_cosine_similarity(anchor_embedding, positive_embedding, negative_embedding, margin=0.3):
    """Margin-based triplet loss measured with cosine similarity.

    For each triplet the penalty is ``relu(margin + sim(anchor, negative) -
    sim(anchor, positive))``; the mean of those penalties is returned.
    """
    cosine = torch.nn.functional.cosine_similarity
    sim_to_positive = cosine(anchor_embedding, positive_embedding)
    sim_to_negative = cosine(anchor_embedding, negative_embedding)
    # Positive when the negative is not at least `margin` less similar.
    violation = margin + sim_to_negative - sim_to_positive
    return torch.relu(violation).mean()

In [53]:
def triplet_loss_euclidean_distance(anchor_embedding, positive_embedding, negative_embedding, margin=0.3):
    """Margin-based triplet loss measured with L2 (p=2) pairwise distance.

    For each triplet the penalty is ``relu(dist(anchor, positive) -
    dist(anchor, negative) + margin)``; the mean of those penalties is returned.
    """
    l2_distance = torch.nn.functional.pairwise_distance
    dist_to_positive = l2_distance(anchor_embedding, positive_embedding, p=2)
    dist_to_negative = l2_distance(anchor_embedding, negative_embedding, p=2)
    # Positive when the positive is not at least `margin` closer than the negative.
    violation = dist_to_positive - dist_to_negative + margin
    return torch.relu(violation).mean()

In [54]:
# Model weights and preprocessor are loaded from the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [55]:
# LoRA adapter settings: r=8, alpha=16, 10% adapter dropout, no bias terms,
# targeting the modules named "q_proj" and "v_proj".
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    #task_type=TaskType.FEATURE_EXTRACTION
)
# NOTE: `model` is rebound to the PEFT-wrapped model here, so re-running this
# cell on its own passes the already-wrapped model back into get_peft_model.
# Re-run the model-loading cell first.
model = get_peft_model(model, lora_config)

In [56]:
def dcg(relevances):
    """Discounted cumulative gain of relevance scores given in ranked order.

    Computes ``sum(rel_i / log2(i + 1))`` over 1-based positions ``i``, so the
    first item is undiscounted (``log2(2) == 1``).

    Args:
        relevances: Sequence of numeric relevance scores, best-ranked first.

    Returns:
        The DCG as a float; ``0.0`` for an empty sequence.
    """
    # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is equivalent.
    relevances = np.asarray(relevances, dtype=float)
    if relevances.size == 0:
        return 0.0
    positions = np.arange(1, relevances.size + 1)
    return np.sum(relevances / np.log2(positions + 1))

In [57]:
def ndcg_score(ideal_ranking, predicted_ranking):
    """Normalized DCG of ``predicted_ranking`` against ``ideal_ranking``.

    Relevance is derived from the ideal order: with ``n`` items, the item at
    ideal position ``i`` (0-based) gets relevance ``n - i``. The DCG of the
    predicted order is divided by the DCG of the ideal order.

    Args:
        ideal_ranking: Items in their ideal order, best first.
        predicted_ranking: The same items in predicted order, best first.
            Only the first ``len(ideal_ranking)`` entries are read.

    Returns:
        The NDCG value; ``0.0`` when ``ideal_ranking`` is empty.

    Raises:
        KeyError: If a predicted item does not occur in ``ideal_ranking``.
        IndexError: If ``predicted_ranking`` is shorter than ``ideal_ranking``.
    """
    n_items = len(ideal_ranking)
    if n_items == 0:
        # Nothing to rank; avoid calling dcg / dividing on empty input.
        return 0.0

    relevance_of = {item: n_items - rank for rank, item in enumerate(ideal_ranking)}

    ideal_relevance = [relevance_of[item] for item in ideal_ranking]
    predicted_relevance = [relevance_of[predicted_ranking[k]] for k in range(n_items)]

    # Debug prints of both relevance lists were removed: this function runs
    # once per evaluation row and flooded the cell output.
    return dcg(predicted_relevance) / dcg(ideal_relevance)

In [58]:
def train_collate_fn(batch):
    """Collate ``(text, positive image, negative image)`` samples into model inputs.

    Returns two encodings produced by the module-level ``processor``: the
    texts paired with the positive images, and the same texts paired with the
    negative images.
    """
    def column(position):
        # Gather one field of every sample, preserving batch order.
        return [sample[position] for sample in batch]

    sentences = column(0)
    encode_options = dict(return_tensors='pt', padding=True, truncation=True)

    positive_inputs = processor(text=sentences, images=column(1), **encode_options)
    negative_inputs = processor(text=sentences, images=column(2), **encode_options)
    return positive_inputs, negative_inputs

In [59]:
# Tab-separated annotation files for the training and development splits.
TRAIN_TSV = "train/subtask_a_train.tsv"
DEV_TSV = "dev/subtask_a_dev.tsv"

train_df = pd.read_csv(TRAIN_TSV, sep='\t')
dev_df = pd.read_csv(DEV_TSV, sep='\t')

In [None]:
from tqdm import tqdm

# AdamW over everything returned by model.parameters().
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Triplets built from the training split, shuffled into batches of 16.
# train_collate_fn turns each batch into two processor encodings
# (texts + positive images, texts + negative images).
# NOTE: fetching a sample calls gloss_text (see TripletDataset.__getitem__),
# so data loading depends on the remote gloss API.
dataset = TripletDataset(train_df)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=train_collate_fn)
model.train()

for epoch in range(10):
    total_loss = 0
    pbar = tqdm(loader, desc=f"Epoch {epoch + 1}", leave=False)
    
    for inputs_pos, inputs_neg in pbar:
        # Two forward passes over the same texts: once with the positive
        # images and once with the negative images.
        outputs_pos = model(**inputs_pos)
        outputs_neg = model(**inputs_neg)

        # The anchor is the text embedding, taken from the positive pass.
        anchor_emb = outputs_pos.text_embeds
        pos_emb = outputs_pos.image_embeds
        neg_emb = outputs_neg.image_embeds

        loss = triplet_loss_cosine_similarity(anchor_emb, pos_emb, neg_emb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch + 1} - Avg Loss: {avg_loss:.4f}")

Epoch 1:   0%|                                           | 0/44 [00:00<?, ?it/s]

In [None]:
def get_predicted_ranking(model, image_to_image_paths, text):
    """Rank candidate images for ``text``, most similar image first.

    Each image is encoded together with ``text`` by the module-level
    ``processor`` and scored by the cosine similarity between the model's
    text and image embeddings.

    Args:
        model: The CLIP model to score with. It is switched to eval mode for
            the duration of the call and restored to its previous mode after.
        image_to_image_paths: Mapping of image name -> image file path.
        text: The sentence to compare the images against.

    Returns:
        The image names ordered from highest to lowest similarity. This
        matches the best-first convention of the ``expected_order`` lists the
        result is compared against.
    """
    was_training = model.training
    # Dropout must be off while scoring, otherwise scores vary between calls.
    model.eval()
    scored = []
    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for image_name, image_path in image_to_image_paths.items():
                with Image.open(image_path) as raw_img:
                    img = raw_img.convert('RGB')
                inputs = processor(text=text, images=img, return_tensors='pt', padding=True, truncation=True)
                outputs = model(**inputs)
                similarity = torch.nn.functional.cosine_similarity(outputs.text_embeds,
                                                                   outputs.image_embeds)
                scored.append((image_name, similarity.item()))
    finally:
        model.train(was_training)

    # Highest similarity first. The previous ascending sort put the least
    # similar image at index 0, inverting every ranking-based metric.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [image_name for image_name, _ in scored]

In [None]:
def calculate_ndcg_score(dev_df, model):
    """Average NDCG of the model's image rankings over all rows of ``dev_df``.

    For every row, the images named in ``expected_order`` are located under
    ``dev/<compound>/`` (``'s`` replaced by ``_s``), ranked against the row's
    sentence with ``get_predicted_ranking`` and scored with ``ndcg_score``.
    """
    per_row_scores = []
    for _, sample in dev_df.iterrows():
        gold_order = ast.literal_eval(sample["expected_order"])
        sentence = sample["sentence"]

        paths_by_name = {
            name: os.path.join("dev", sample["compound"].replace("'s", "_s"), name)
            for name in gold_order
        }

        ranking = get_predicted_ranking(model, paths_by_name, sentence)
        per_row_scores.append(ndcg_score(gold_order, ranking))

    return sum(per_row_scores)/len(per_row_scores)

In [None]:
# Mean NDCG of the current `model` on the dev split (shown as the cell output).
calculate_ndcg_score(dev_df, model)

In [None]:
def calculate_1pc_accuracy(dev_df, model):
    """Fraction of ``dev_df`` rows whose top predicted image is the expected top image.

    For every row, the images named in ``expected_order`` are located under
    ``dev/<compound>/`` (``'s`` replaced by ``_s``) and ranked against the
    row's sentence with ``get_predicted_ranking``. A row counts as a hit when
    the first predicted image equals the first expected one.
    """
    hits = 0
    for _, sample in dev_df.iterrows():
        gold_order = ast.literal_eval(sample["expected_order"])
        sentence = sample["sentence"]

        paths_by_name = {
            name: os.path.join("dev", sample["compound"].replace("'s", "_s"), name)
            for name in gold_order
        }

        ranking = get_predicted_ranking(model, paths_by_name, sentence)
        hits += int(gold_order[0] == ranking[0])

    return hits/len(dev_df)

In [None]:
# Top-1 accuracy of the current `model` on the dev split (shown as the cell output).
calculate_1pc_accuracy(dev_df, model)