# Parameters

In [2]:
# Choose the model checkpoint to use.
# NOTE: CHOSEN_MODEL_CKPT must be one of the keys of `model_configs` (defined
# below): it selects the config entry and the checkpoint file to download.

# Alternatives (keep exactly one assignment active):
# CHOSEN_MODEL_CKPT = 'siamese_contrastive.pt'
# CHOSEN_MODEL_CKPT = 'siamese_contrastive_7k.pt'
# NOTE(review): the next name is NOT a key of `model_configs`; the registered
# f1=0.301 checkpoint is named 'siamese_contrastive_pos_hard-neg_test-f1=0.301_...'.
# CHOSEN_MODEL_CKPT = 'siamese_contrastive_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt'
CHOSEN_MODEL_CKPT = 'siamese_contrastive_soft-neg_epoch=1_val-f1=0.829_val-pos-acc=0.802_val-neg-acc=0.932_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-f1-threshold=1.010.pt'

In [17]:
# Root folder for downloaded/produced data. Later cells build paths with plain
# string concatenation (DATA_PATH + ...), so the trailing slash is required.
DATA_PATH = 'data/'

# When True, the dataset reuses the tokenized name as the description input.
NAMES_AS_DESCRIPTIONS = False
# CSV with the items to embed, relative to DATA_PATH (and to the HF repo root).
SOURCE_FILE_NAME = 'tables_OZ_geo_5500/processed/OZ_geo_5500.csv'
# Name of the image folder under DATA_PATH; '<name>.zip' is the archive that is downloaded.
IMG_DATASET_NAME = 'images_OZ_geo_5500'

In [23]:
import torch

# Run on GPU when available, otherwise fall back to CPU.
DEVICE='cuda' if torch.cuda.is_available() else 'cpu'
# Large batches on GPU; small batches on CPU.
BATCH_SIZE = 768 if torch.cuda.is_available() else 8
# On CPU only the first 2 batches are processed (a smoke test: 2 * 8 = 16 rows);
# None means "process every batch".
LIMIT_BATCHES = None if torch.cuda.is_available() else 2

In [24]:
# Registry of the known checkpoints, keyed by checkpoint file name.
#
# Every entry carries the checkpoint name plus the Hugging Face model names of
# the name/description text encoders. Threshold and dropout keys are optional:
# only some entries define 'DROPOUT', and the first entry stores its threshold
# under 'CONTRASTIVE_THRESHOLD_BEST' instead of 'CONTRASTIVE_THRESHOLD'.
model_configs = {
    entry['MODEL_CKPT']: entry
    for entry in (
        {
            'MODEL_CKPT': 'siamese_contrastive_soft-neg_epoch=1_val-f1=0.829_val-pos-acc=0.802_val-neg-acc=0.932_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-f1-threshold=1.010.pt',
            'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
            'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
            'CONTRASTIVE_THRESHOLD_BEST': 1.01,
            'DROPOUT': 0.5,
        },
        {
            'MODEL_CKPT': 'siamese_contrastive_pos_hard-neg_test-f1=0.520_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-threshold=0.19597989949748743.pt',
            'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
            'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
            'CONTRASTIVE_THRESHOLD': 0.3,
            'DROPOUT': 0.5,
        },
        {
            'MODEL_CKPT': 'siamese_contrastive_pos_hard-neg_test-pos_acc=0.860_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-thr=1.085.pt',
            'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
            'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
            'CONTRASTIVE_THRESHOLD': 0.3,
            'DROPOUT': 0.5,
        },
        {
            'MODEL_CKPT': 'siamese_contrastive_pos_hard-neg_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt',
            'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
            'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
            'CONTRASTIVE_THRESHOLD': 0.3,
        },
        {
            'MODEL_CKPT': 'siamese_contrastive.pt',
            'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
            'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
            'CONTRASTIVE_THRESHOLD': 0.3,
        },
        {
            'MODEL_CKPT': 'siamese_contrastive_7k.pt',
            'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
            'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
            'CONTRASTIVE_THRESHOLD': 0.3,
        },
        # Disabled entry, kept for reference:
        # {
        #     'MODEL_CKPT': 'siamese_contrastive_1gpu.pt',
        #     'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
        #     # alt. NAME_MODEL_NAME: 'DeepPavlov/distilrubert-tiny-cased-conversational-v1'
        #     'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
        #     # alt. DESCRIPTION_MODEL_NAME: 'sergeyzh/rubert-tiny-turbo'
        #     'CONTRASTIVE_THRESHOLD': 0.3,
        # },
    )
}

In [6]:
# Show the checkpoint names registered in `model_configs` (insertion order).
print('Available models:')
list(model_configs)

Available models:


['siamese_contrastive_soft-neg_epoch=1_val-f1=0.829_val-pos-acc=0.802_val-neg-acc=0.932_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-f1-threshold=1.010.pt',
 'siamese_contrastive_pos_hard-neg_test-f1=0.520_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-threshold=0.19597989949748743.pt',
 'siamese_contrastive_pos_hard-neg_test-pos_acc=0.860_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-thr=1.085.pt',
 'siamese_contrastive_pos_hard-neg_test-f1=0.301_splitting-by-query_cc12m_rubert_tiny_ep_1.pt',
 'siamese_contrastive.pt',
 'siamese_contrastive_7k.pt']

# Log into services

In [7]:
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [8]:
# Use tokens from .env

import os
from dotenv import load_dotenv

import huggingface_hub
import wandb

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# NOTE(review): HF_TOKEN is not validated here, so `login` may receive None.
# The upload cell at the end of the notebook raises ValueError in that case;
# consider doing the same here.
HF_TOKEN = os.getenv("HF_TOKEN")
huggingface_hub.login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Imports

In [9]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"


from timm import create_model
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
from torch import optim, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
# from torchinfo import summary
import transformers
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer,\
        get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer

import cv2

from PIL import Image
from tqdm.auto import tqdm

import json
from itertools import product

# import datasets
# from datasets import Dataset, concatenate_datasets
import argparse
import requests

from io import BytesIO
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import more_itertools

# Source code

### RuCLIPtiny

In [10]:
class RuCLIPtiny(nn.Module):
    """
    CLIP-style dual encoder: a `convnext_tiny` image tower and a transformer
    text tower whose CLS state is projected to 768 dimensions.
    """

    def __init__(self, name_model_name: str):
        """
        Build both towers.

        Args:
            name_model_name: identifier handed to `AutoModel.from_pretrained`
                for the text encoder.
        """
        super().__init__()

        # Image tower, created without pretrained weights (pass pretrained=True
        # to start from pretrained ones). Output is presumably a 768-dim feature
        # vector, matching the 768-dim text projection below.
        self.visual = create_model(
            'convnext_tiny',
            pretrained=False,
            num_classes=0,
            in_chans=3,
        )

        # Text tower; its hidden size is read from the loaded config.
        self.transformer = AutoModel.from_pretrained(name_model_name)
        text_width = self.transformer.config.hidden_size
        self.final_ln = nn.Linear(text_width, 768)

        # Learnable temperature stored in log space, initialised to log(1 / 0.07).
        initial_log_scale = torch.log(torch.tensor(1/0.07))
        self.logit_scale = nn.Parameter(torch.ones([]) * initial_log_scale)

    @property
    def dtype(self):
        """dtype of the first stem layer's weights of the image tower."""
        first_stem_layer = self.visual.stem[0]
        return first_stem_layer.weight.dtype

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        """Cast `image` to the tower's dtype and return the image features."""
        cast_image = image.type(self.dtype)
        return self.visual(cast_image)

    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the projected hidden state of the first (CLS) token."""
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_state = outputs.last_hidden_state[:, 0, :]
        return self.final_ln(cls_state)

    def forward(self, image: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        """
        Returns:
            (logits_per_image, logits_per_text): scaled cosine similarities
            between every image and every text; the second is the transpose
            of the first.
        """
        img_emb = self.encode_image(image)
        txt_emb = self.encode_text(input_ids, attention_mask)

        # L2-normalise so the matrix product below is a cosine similarity.
        img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
        txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)

        scale = self.logit_scale.exp()
        scaled_img_emb = scale * img_emb
        logits_per_image = scaled_img_emb @ txt_emb.t()
        return logits_per_image, logits_per_image.t()


In [11]:
def get_transform():
    """
    Build the image preprocessing pipeline: resize to 224, center-crop to
    224x224, force RGB mode, convert to a tensor and normalise each channel
    with the mean/std constants listed below.
    """
    def _to_rgb(image):
        return image.convert("RGB")

    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    steps = [
        transforms.Resize(224),
        transforms.CenterCrop(224),
        _to_rgb,
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)

class Tokenizers:
    """
    Holds two Hugging Face tokenizers: one for names, one for descriptions.

    Both `tokenize_*` methods pad/truncate every text to `max_len` tokens and
    return `input_ids` and `attention_mask` stacked along a new first axis,
    i.e. index 0 holds the ids and index 1 the mask.
    """

    def __init__(self, name_model_name: str, description_model_name: str):
        """
        Args:
            name_model_name: model id passed to `AutoTokenizer.from_pretrained`
                for the name tokenizer.
            description_model_name: same, for the description tokenizer.
        """
        self.name_tokenizer = AutoTokenizer.from_pretrained(name_model_name)
        self.desc_tokenizer = AutoTokenizer.from_pretrained(description_model_name)

    @staticmethod
    def _encode(tokenizer, texts, max_len):
        """Tokenize `texts` with `tokenizer` and stack ids and attention mask."""
        # Calling the tokenizer directly replaces the deprecated
        # `batch_encode_plus`, so both branches share one code path.
        tokenized = tokenizer(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])

    def tokenize_name(self, texts, max_len=77):
        """Tokenize a list of name strings with the name tokenizer."""
        return self._encode(self.name_tokenizer, texts, max_len)

    def tokenize_description(self, texts, max_len=77):
        """Tokenize a list of description strings with the description tokenizer."""
        return self._encode(self.desc_tokenizer, texts, max_len)



In [12]:
from transformers import AutoTokenizer
import torch

class NameTokenizer:
    """Thin wrapper around a Hugging Face tokenizer used for name strings."""

    def __init__(self, model_name: str):
        """
        Args:
            model_name: model id passed to `AutoTokenizer.from_pretrained`.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(self, texts, max_len=77):
        """
        Tokenize a list of strings, padding/truncating each to `max_len`.

        Returns:
            Tensor with `input_ids` at index 0 and `attention_mask` at index 1
            of the first axis.
        """
        # Call the tokenizer directly: `batch_encode_plus` is deprecated in
        # favour of `__call__`, which takes the same arguments.
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])


class DescriptionTokenizer:
    """Thin wrapper around a Hugging Face tokenizer used for description strings."""

    def __init__(self, model_name: str):
        """
        Args:
            model_name: model id passed to `AutoTokenizer.from_pretrained`.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(self, texts, max_len=77):
        """
        Tokenize a list of strings, padding/truncating each to `max_len`.

        Returns:
            Tensor with `input_ids` at index 0 and `attention_mask` at index 1
            of the first axis.
        """
        encode_options = dict(
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(texts, **encode_options)
        ids, mask = encoded["input_ids"], encoded["attention_mask"]
        return torch.stack([ids, mask])


In [13]:
class SiameseRuCLIPDataset(torch.utils.data.Dataset):
    """
    Dataset of item pairs. Each sample is
    (image_1, name_1, desc_1, image_2, name_2, desc_2, label).

    The dataframe must provide the columns `name_first`, `name_second`,
    `description_first`, `description_second`, `image_name_first` and
    `image_name_second`.
    """

    def __init__(self, images_dir: str, name_model_name: str, description_model_name: str, df=None, labels=None, df_path=None):
        """
        Dataset requires the concrete models' names for tokenization.

        Args:
            images_dir: existing directory that contains the image files.
            name_model_name: tokenizer model id for names.
            description_model_name: tokenizer model id for descriptions.
            df: dataframe with the pairs; ignored when `df_path` is given.
            labels: indexable container of per-row labels (read in `__getitem__`).
            df_path: optional CSV path; when given it is read instead of `df`.

        Raises:
            AssertionError: if `images_dir` is not an existing directory.
        """
        # Use the argument in the message: `self.images_dir` is not assigned yet
        # at this point, so referencing it would raise AttributeError instead.
        assert os.path.isdir(images_dir), f"Image dir does not exist: '{images_dir}'"

        self.df = pd.read_csv(df_path) if df_path is not None else df
        self.labels = labels
        self.images_dir = images_dir
        self.tokenizers = Tokenizers(name_model_name, description_model_name)
        self.transform = get_transform()
        self.max_len = 77

    def _load_image(self, image_name):
        """Read one image from `images_dir`, convert BGR->RGB and apply the transform."""
        image_path = os.path.join(self.images_dir, image_name)
        bgr = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising.
        if bgr is None:
            raise FileNotFoundError(f"Could not read image: '{image_path}'")
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return self.transform(Image.fromarray(rgb))

    def __getitem__(self, idx):
        """Return the tensors of both items of pair `idx` followed by its label."""
        row = self.df.iloc[idx]
        # Tokenize names; first axis of the result is [input_ids, attention_mask].
        name_tokens = self.tokenizers.tokenize_name([str(row.name_first), str(row.name_second)], max_len=self.max_len)
        name_first = name_tokens[:, 0, :]
        name_second = name_tokens[:, 1, :]
        # Tokenize descriptions
        desc_tokens = self.tokenizers.tokenize_description(
            [str(row.description_first), str(row.description_second)], max_len=self.max_len
        )
        desc_first = desc_tokens[:, 0, :]
        desc_second = desc_tokens[:, 1, :]
        # Process images. The second image comes from `image_name_second`
        # (the previous code loaded `image_name_first` twice).
        im_first = self._load_image(row.image_name_first)
        im_second = self._load_image(row.image_name_second)
        label = self.labels[idx]
        return im_first, name_first, desc_first, im_second, name_second, desc_second, label

    def __len__(self):
        """Number of pairs (rows of the dataframe)."""
        return len(self.df)

In [14]:
class RuCLIPDataset(torch.utils.data.Dataset):
    """
    Single-item dataset. Each sample is (image, name, desc) or, when
    `return_sku=True`, (image, name, desc, sku).

    The dataframe must provide the columns `name`, `image_name`, `description`
    (unless `names_as_descriptions=True`) and `sku` (when `return_sku=True`).
    """

    def __init__(
        self,
        images_dir: str,
        name_model_name: str, description_model_name: str,
        df=None, labels=None, df_path=None,
        names_as_descriptions=False,
        return_sku=False,  # New parameter for backwards compatibility
    ):
        """
        Dataset requires the concrete models' names for tokenization.

        Args:
            images_dir: existing directory that contains the image files.
            name_model_name: tokenizer model id for names.
            description_model_name: tokenizer model id for descriptions.
            df: dataframe with the items; ignored when `df_path` is given.
            labels: stored on the instance; not read by `__getitem__`.
            df_path: optional CSV path; when given it is read instead of `df`.
            names_as_descriptions: if True, the tokenized name is reused as
                the description.
            return_sku: If True, __getitem__ returns (im, name, desc, sku)
                       If False, __getitem__ returns (im, name, desc) for backwards compatibility

        Raises:
            AssertionError: if `images_dir` is not an existing directory.
        """
        assert os.path.isdir(images_dir), f"Image dir does not exist: '{images_dir}'"

        self.df = pd.read_csv(df_path) if df_path is not None else df
        self.labels = labels
        self.images_dir = images_dir
        self.tokenizers = Tokenizers(name_model_name, description_model_name)
        self.transform = get_transform()
        self.max_len = 77
        self.names_as_descriptions = names_as_descriptions
        self.return_sku = return_sku

    def _load_image(self, image_name):
        """Read one image from `images_dir`, convert BGR->RGB and apply the transform."""
        image_path = os.path.join(self.images_dir, image_name)
        bgr = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if bgr is None:
            raise FileNotFoundError(f"Could not read image: '{image_path}'")
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return self.transform(Image.fromarray(rgb))

    def __getitem__(self, idx):
        """Return the tensors (and optionally the SKU) of item `idx`."""
        row = self.df.iloc[idx]

        # Tokenize name. `row['name']` is required here: on a pandas Series the
        # attribute `.name` is the row's index label, not the column value.
        name_tokens = self.tokenizers.tokenize_name([str(row['name'])], max_len=self.max_len)
        name = name_tokens[:, 0, :]  # [input_ids, attention_mask]

        # Tokenize description
        if self.names_as_descriptions:
            desc = name
        else:
            desc_tokens = self.tokenizers.tokenize_description([str(row.description)])
            desc = desc_tokens[:, 0, :]

        # Process image
        im = self._load_image(row.image_name)

        # Conditionally return SKU for backwards compatibility
        if self.return_sku:
            return im, name, desc, row['sku']
        return im, name, desc  # Original behavior

    def __len__(self):
        """Number of items (rows of the dataframe)."""
        return len(self.df)

### SiameseRuCLIP

In [15]:
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """
    Mean of the hidden states over the sequence axis (dim 1), counting only
    positions where `attention_mask` is non-zero.

    Assumes `last_hidden_states` is (batch, seq, dim) and `attention_mask` is
    (batch, seq) — TODO confirm against callers.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(keep.logical_not(), 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

class SiameseRuCLIP(nn.Module):
    """
    Siamese embedding model: image, name and description embeddings of an item
    are concatenated and passed through a small MLP head.
    """

    def __init__(self,
                 device: str,
                 name_model_name: str,
                 description_model_name: str,
                 preload_model_name: str = None,
                 models_dir: str = None,
                 dropout: float = None):
        """
        Args:
            device: device string/object; every sub-module is moved to it.
            name_model_name: model name for the text (name) branch.
            description_model_name: model name for the description branch.
            preload_model_name: optional RuCLIPtiny checkpoint file name; when
                given, it is loaded from `models_dir`.
            models_dir: directory containing saved checkpoints (only used
                together with `preload_model_name`).
            dropout: dropout probability for the head; None means no dropout layer.
        """
        super().__init__()
        device = torch.device(device)

        # Image + name encoder, optionally initialised from a saved checkpoint.
        clip = RuCLIPtiny(name_model_name)
        if preload_model_name is not None:
            checkpoint_path = os.path.join(models_dir, preload_model_name)
            clip_state = torch.load(
                checkpoint_path,
                weights_only=True,
                map_location=device
            )
            clip.load_state_dict(clip_state)
            clip.eval()
        self.ruclip = clip.to(device)

        # Description encoder.
        description_encoder = AutoModel.from_pretrained(description_model_name)
        self.description_transformer = description_encoder.to(device)

        # Width of the concatenated embedding that feeds the head.
        vision_dim = self.ruclip.visual.num_features
        name_dim = self.ruclip.final_ln.out_features
        desc_dim = self.description_transformer.config.hidden_size
        self.hidden_dim = vision_dim + name_dim + desc_dim
        self.dropout = dropout

        # MLP head: hidden -> hidden/2 -> hidden/4. The dropout layer is only
        # inserted when requested, which shifts the index of the last Linear
        # inside the Sequential (and therefore its state-dict key).
        half_dim = self.hidden_dim // 2
        quarter_dim = self.hidden_dim // 4
        head_layers = [nn.Linear(self.hidden_dim, half_dim), nn.ReLU()]
        if self.dropout is not None:
            head_layers.append(nn.Dropout(self.dropout))
        head_layers.append(nn.Linear(half_dim, quarter_dim))
        self.head = nn.Sequential(*head_layers).to(device)

    def encode_image(self, image):
        """Image features from the RuCLIPtiny image tower."""
        return self.ruclip.encode_image(image)

    def encode_name(self, name):
        """Name features; `name[:, 0]` are input ids and `name[:, 1]` the attention mask."""
        input_ids, attention_mask = name[:, 0, :], name[:, 1, :]
        return self.ruclip.encode_text(input_ids, attention_mask)

    def encode_description(self, desc):
        """Mask-aware mean of the description encoder's last hidden states."""
        input_ids, attention_mask = desc[:, 0, :], desc[:, 1, :]
        hidden_states = self.description_transformer(input_ids, attention_mask).last_hidden_state
        return average_pool(hidden_states, attention_mask)

    def get_final_embedding(self, im, name, desc):
        """Concatenate the three modality embeddings and run them through the head."""
        parts = [
            self.encode_image(im),
            self.encode_name(name),
            self.encode_description(desc),
        ]
        return self.head(torch.cat(parts, dim=1))

    def forward(self, im1, name1, desc1, im2, name2, desc2):
        """Embed both items of a pair with the same weights; returns (out1, out2)."""
        first = self.get_final_embedding(im1, name1, desc1)
        second = self.get_final_embedding(im2, name2, desc2)
        return first, second

# Evaluation loop

## Run evaluation

In [16]:
# Download models' weights & text/image datasets

from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face dataset repository that stores checkpoints, tables and images.
REPO_ID = "INDEEPA/clip-siamese"
# Local folder for checkpoints; created up front so later cells can rely on it.
LOCAL_DIR = Path("data/train_results")
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

# Download only the files this notebook needs (see allow_patterns).
# NOTE(review): `local_dir` is hard-coded to 'data' rather than derived from
# DATA_PATH; the two must be kept in sync by hand.
snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir='data',
    allow_patterns=[
        # the chosen model checkpoint
        f"train_results/{CHOSEN_MODEL_CKPT}",
        # the source CSV table
        SOURCE_FILE_NAME,
        # the zipped image dataset
        f"{IMG_DATASET_NAME}.zip"
    ],
)

# Extract the images next to the archive; -n never overwrites existing files,
# -q keeps the output quiet.
!unzip -n -q data/{IMG_DATASET_NAME}.zip -d data/

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
# Load model

from pathlib import Path

def load_model(model_config):
    """
    Build a SiameseRuCLIP model for `model_config` and load its checkpoint.

    Args:
        model_config: dict with the required keys 'MODEL_CKPT',
            'NAME_MODEL_NAME' and 'DESCRIPTION_MODEL_NAME', and the optional
            key 'DROPOUT' (a missing key means the head has no dropout layer).

    Returns:
        The model with the checkpoint weights loaded, placed on DEVICE.

    Raises:
        KeyError: if a required key is missing from `model_config`.
    """
    ckpt_name = model_config['MODEL_CKPT']
    model_ckpt_path = Path(DATA_PATH) / 'train_results' / ckpt_name
    # weights_only=True restricts unpickling to tensors/containers, which is all
    # a state dict needs and avoids executing code from a downloaded file.
    state_dict = torch.load(model_ckpt_path, map_location=DEVICE, weights_only=True)

    # Initialize the model using the configuration. Several registered configs
    # do not define 'DROPOUT', so fall back to None instead of raising KeyError.
    model = SiameseRuCLIP(
        name_model_name=model_config["NAME_MODEL_NAME"],
        description_model_name=model_config["DESCRIPTION_MODEL_NAME"],
        dropout=model_config.get('DROPOUT'),
        device=DEVICE,
    )

    model.load_state_dict(state_dict)
    return model

# Look up the config registered for CHOSEN_MODEL_CKPT (raises KeyError if the
# name is not a key of `model_configs`) and build the model from it.
model_config = model_configs[CHOSEN_MODEL_CKPT]  # choose a particular config for debugging
model = load_model(model_config)

In [19]:
# Load data

# Paths are built by plain string concatenation, so DATA_PATH must end with '/'.
# `source_df` and `images_dir` are read as globals by `load_data` below.
source_df = pd.read_csv(DATA_PATH + SOURCE_FILE_NAME)   
images_dir = DATA_PATH + IMG_DATASET_NAME

def load_data(model_config, names_as_descriptions=False, return_sku=False):
    """
    Wrap the module-level `source_df` / `images_dir` in a RuCLIPDataset and
    return a DataLoader over it (batch size BATCH_SIZE, default ordering).

    Args:
        model_config: dict providing 'NAME_MODEL_NAME' and
            'DESCRIPTION_MODEL_NAME' for the tokenizers.
        names_as_descriptions: forwarded to the dataset.
        return_sku: forwarded to the dataset; when True each batch also
            carries the SKUs.
    """
    dataset = RuCLIPDataset(
        images_dir=images_dir,
        name_model_name=model_config['NAME_MODEL_NAME'],
        description_model_name=model_config['DESCRIPTION_MODEL_NAME'],
        df=source_df,
        names_as_descriptions=names_as_descriptions,
        return_sku=return_sku,
    )
    return DataLoader(dataset, batch_size=BATCH_SIZE)

# Dataloader over the whole source table; SKUs are requested so that the
# embeddings can later be keyed by SKU.
test_dl = load_data(
    model_config,
    names_as_descriptions = NAMES_AS_DESCRIPTIONS,
    return_sku=True
)

# # Sanity check: get one batch from the dataloader.
# # NOTE: with return_sku=True every batch has four elements, not three.
# images, names, descriptions, skus = next(iter(test_dl))
# print(images.shape, names.shape, descriptions.shape)
# with torch.no_grad():
#     final_emb = model.get_final_embedding(images, names, descriptions)

In [20]:
import numpy as np
import torch

def compute_embeddings(
    model, dataloader, device=DEVICE, limit_batches=None, return_dataframe=False
):
    """
    Compute and return the embeddings for all samples in the dataloader.

    Args:
        model: The embedding model (must implement get_final_embedding).
        dataloader: DataLoader returning either (image, name, desc) or (image, name, desc, sku).
        device: The device to run the model on.
        limit_batches: Limit the number of batches to process (None = all).
        return_dataframe: If True and SKUs are available, return DataFrame with 'sku' and 'siamese_emb'.
                         If False, return numpy array (original behavior).

    Returns:
        pd.DataFrame or np.ndarray: Depending on return_dataframe and SKU availability.

    Raises:
        ValueError: if no batch was processed (empty dataloader or limit_batches == 0).
    """
    embedding_chunks = []
    all_skus = []
    has_skus = False

    model.eval()
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(dataloader)):
            if limit_batches is not None and batch_idx >= limit_batches:
                break

            # Handle both 3-tuple and 4-tuple returns
            if len(batch_data) == 4:
                images, names, descriptions, skus = batch_data
                has_skus = True
                # Numeric SKUs are collated into a tensor, while string SKUs
                # arrive as a plain list/tuple without `.tolist()`.
                all_skus.extend(skus.tolist() if hasattr(skus, 'tolist') else list(skus))
            else:
                images, names, descriptions = batch_data

            images = images.to(device)
            names = names.to(device)
            descriptions = descriptions.to(device)

            final_emb = model.get_final_embedding(images, names, descriptions)
            embedding_chunks.append(final_emb.cpu().numpy())

    # np.concatenate rejects an empty list with an unhelpful message.
    if not embedding_chunks:
        raise ValueError(
            "No batches were processed: the dataloader is empty or limit_batches is 0."
        )
    final_embeddings = np.concatenate(embedding_chunks, axis=0)

    # Return DataFrame if requested and SKUs are available
    if return_dataframe and has_skus:
        return pd.DataFrame({
            'sku': all_skus,
            # one 1-D array per row
            'siamese_emb': list(final_embeddings)
        })
    return final_embeddings  # Original behavior

In [25]:
# Embed the items of `test_dl`. A DataFrame with 'sku' and 'siamese_emb'
# columns is returned because the dataloader yields SKUs and
# return_dataframe=True. When LIMIT_BATCHES is set (CPU run) only the first
# LIMIT_BATCHES batches are embedded, so the table is truncated.
emb_table = compute_embeddings(
    model,
    test_dl,
    return_dataframe=True,
    limit_batches=LIMIT_BATCHES,
)

emb_table.head()

  0%|          | 0/696 [00:00<?, ?it/s]

Unnamed: 0,sku,siamese_emb
0,1871769771,"[-0.36614314, 0.025540177, 0.007324349, -0.066..."
1,1679550303,"[-0.28556547, 0.037392773, -0.13363528, -0.022..."
2,1200553001,"[-0.16812354, 0.014725659, -0.062321663, -0.04..."
3,922231521,"[-0.11022419, 0.008129355, -0.025397489, 0.015..."
4,922230517,"[-0.19517846, 0.03094637, -0.11642145, -0.0274..."


In [27]:
from pathlib import Path

# Save the embedding table as parquet under DATA_PATH/embeddings/OZ_geo_5500/.
# The number of rows is part of the file name, so truncated runs
# (LIMIT_BATCHES set) produce a differently named file.
file_dir = Path('embeddings/OZ_geo_5500')
file_name = f"siamese-embeddings_num-rows={len(emb_table)}.parquet"
full_file_path = Path(DATA_PATH) / file_dir / file_name
full_file_path.parent.mkdir(parents=True, exist_ok=True)

emb_table.to_parquet(full_file_path, index=False)
# NOTE: the printed path is relative to DATA_PATH; the file itself is written
# to `full_file_path`.
print(f"Saved embeddings to:\n{file_dir / file_name}")

Saved embeddings to:
embeddings/OZ_geo_5500/siamese-embeddings_num-rows=16.parquet


In [None]:
# Upload to HuggingFace
import os
from dotenv import load_dotenv
from huggingface_hub import HfApi, login

# Load HF_TOKEN from .env and fail early when it is missing or empty.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in .env file")

# Log into HuggingFace
login(token=hf_token)

# Upload the embeddings folder to the dataset repo, mirroring its path
# relative to DATA_PATH inside the repo.
# NOTE(review): presumably every file in the folder is uploaded, including
# truncated outputs from earlier CPU runs — confirm this is intended.
api = HfApi()
api.upload_folder(
    # DATA_PATH is a str and file_dir a Path; `str / Path` yields a Path.
    folder_path=DATA_PATH / file_dir,  # Path to the local directory
    path_in_repo=str(file_dir),
    repo_id="INDEEPA/clip-siamese",
    repo_type="dataset",
)