# Parameters

In [31]:
# LIMIT_BATCHES = None
LIMIT_BATCHES = 6 # cpu smoke run

MODEL_CFG_IDX = 0

In [None]:
DATA_PATH = 'data/'

TABLE_DATASET_FILE = 'OZ_geo_5700_no_descriptions.csv'
IMG_DATASET_NAME = 'images_OZ_geo_5700'

In [33]:
import torch

DEVICE='cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 256 if torch.cuda.is_available() else 8

In [34]:
MODEL_TYPE = "siamese-contrastive"

model_configs = [ 
    dict(
        MODEL_CKPT = 'siamese_contrastive.pt',

        NAME_MODEL_NAME = 'cointegrated/rubert-tiny',
        # NAME_MODEL_NAME = 'DeepPavlov/distilrubert-tiny-cased-conversational-v1',

        DESCRIPTION_MODEL_NAME = 'cointegrated/rubert-tiny',
        # DESCRIPTION_MODEL_NAME = 'sergeyzh/rubert-tiny-turbo',

        CONTRASTIVE_THRESHOLD=0.3,
    ),
    
    dict(
        MODEL_CKPT = 'siamese_contrastive_7k.pt',

        NAME_MODEL_NAME = 'cointegrated/rubert-tiny',
        # NAME_MODEL_NAME = 'DeepPavlov/distilrubert-tiny-cased-conversational-v1',

        DESCRIPTION_MODEL_NAME = 'cointegrated/rubert-tiny',

        CONTRASTIVE_THRESHOLD=0.3,
    ),
    
    
    # dict(
    #     MODEL_CKPT = 'siamese_contrastive_1gpu.pt',

    #     NAME_MODEL_NAME = 'cointegrated/rubert-tiny',
    #     # NAME_MODEL_NAME = 'DeepPavlov/distilrubert-tiny-cased-conversational-v1',

    #     # DESCRIPTION_MODEL_NAME = 'sergeyzh/rubert-tiny-turbo',
    #     DESCRIPTION_MODEL_NAME = 'cointegrated/rubert-tiny',

    #     CONTRASTIVE_THRESHOLD=0.3,
    # ),
]

# Log into services

In [35]:
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [36]:
# Use tokens from .env

import os
from dotenv import load_dotenv

import huggingface_hub
import wandb

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
huggingface_hub.login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Imports

In [37]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"


from timm import create_model
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
from torch import optim, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
# from torchinfo import summary
import transformers
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer,\
        get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer

import cv2

from PIL import Image
from tqdm.auto import tqdm

import json
from itertools import product

# import datasets
# from datasets import Dataset, concatenate_datasets
import argparse
import requests

from io import BytesIO
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score
import matplotlib.pyplot as plt
from IPython import display
import more_itertools

# Source code

### RuCLIPtiny

In [38]:
class RuCLIPtiny(nn.Module):
    def __init__(self, name_model_name: str):
        """
        Initializes the RuCLIPtiny module using the provided name model.
        """
        super().__init__()
        self.visual = create_model('convnext_tiny',
                                   pretrained=False,  # set True if you want pretrained weights
                                   num_classes=0,
                                   in_chans=3)       # output: e.g. 768-dim features
        
        self.transformer = AutoModel.from_pretrained(name_model_name)
        name_model_output_size = self.transformer.config.hidden_size  # inferred dynamically
        self.final_ln = nn.Linear(name_model_output_size, 768)         # project to 768 dims
        self.logit_scale = nn.Parameter(torch.ones([]) * torch.log(torch.tensor(1/0.07)))
    
    @property
    def dtype(self):
        return self.visual.stem[0].weight.dtype

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        return self.visual(image.type(self.dtype))

    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        x = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # use the CLS token (first token)
        x = x.last_hidden_state[:, 0, :]
        x = self.final_ln(x)
        return x

    def forward(self, image: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        image_features = self.encode_image(image)
        text_features = self.encode_text(input_ids, attention_mask)
        # Normalize features
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()
        return logits_per_image, logits_per_text


In [39]:
def get_transform():
    return transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        lambda image: image.convert("RGB"),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

class Tokenizers:
    def __init__(self, name_model_name: str, description_model_name: str):
        self.name_tokenizer = AutoTokenizer.from_pretrained(name_model_name)
        self.desc_tokenizer = AutoTokenizer.from_pretrained(description_model_name)

    def tokenize_name(self, texts, max_len=77):
        tokenized = self.name_tokenizer.batch_encode_plus(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])

    def tokenize_description(self, texts, max_len=77):
        tokenized = self.desc_tokenizer(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])



In [40]:
from transformers import AutoTokenizer
import torch

class NameTokenizer:
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(self, texts, max_len=77):
        tokenized = self.tokenizer.batch_encode_plus(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])


class DescriptionTokenizer:
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(self, texts, max_len=77):
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])


In [41]:
class SiameseRuCLIPDataset(torch.utils.data.Dataset):
    def __init__(self, images_dir: str, name_model_name: str, description_model_name: str, df=None, labels=None, df_path=None):
        """
        Dataset requires the concrete models' names for tokenization.
        """
        assert os.path.isdir(images_dir), f"Image dir does not exist: '{self.images_dir}'"

        self.df = pd.read_csv(df_path) if df_path is not None else df
        self.labels = labels
        self.images_dir = images_dir
        self.tokenizers = Tokenizers(name_model_name, description_model_name)
        self.transform = get_transform()
        self.max_len = 77

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Tokenize names
        name_tokens = self.tokenizers.tokenize_name([str(row.name_first), str(row.name_second)], max_len=self.max_len)
        name_first = name_tokens[:, 0, :]  # [input_ids, attention_mask]
        name_second = name_tokens[:, 1, :]
        # Tokenize descriptions
        desc_tokens = self.tokenizers.tokenize_description([str(row.description_first), str(row.description_second)])
        desc_first = desc_tokens[:, 0, :]
        desc_second = desc_tokens[:, 1, :]
        # Process images
        im_first_path = os.path.join(self.images_dir, row.image_name_first)
        im_first = cv2.imread(im_first_path)
        im_first = cv2.cvtColor(im_first, cv2.COLOR_BGR2RGB)
        im_first = Image.fromarray(im_first)
        im_first = self.transform(im_first)
        im_second_path = os.path.join(self.images_dir, row.image_name_first)
        im_second = cv2.imread(os.path.join(im_second_path))
        im_second = cv2.cvtColor(im_second, cv2.COLOR_BGR2RGB)
        im_second = Image.fromarray(im_second)
        im_second = self.transform(im_second)
        label = self.labels[idx]
        return im_first, name_first, desc_first, im_second, name_second, desc_second, label

    def __len__(self):
        return len(self.df)

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from PIL import Image
import cv2

class RuCLIPDataset(torch.utils.data.Dataset):
    def __init__(self, images_dir: str, name_model_name: str, df=None, df_path=None):
        """
        Dataset for RuCLIP embeddings: returns (image, name) tuples.
        Only requires the name model for tokenization.
        """
        assert os.path.isdir(images_dir), f"Image dir does not exist: '{images_dir}'"

        self.df = pd.read_csv(df_path) if df_path is not None else df
        self.images_dir = images_dir
        self.name_tokenizer = NameTokenizer(model_name=name_model_name)
        self.transform = get_transform()
        self.max_len = 77

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Tokenize name
        name_tokens = self.name_tokenizer.tokenize([str(row['name'])], max_len=self.max_len)
        name_input_ids = name_tokens[0, 0, :]  # Shape: (max_len,)
        name_attention_mask = name_tokens[1, 0, :]  # Shape: (max_len,)
        name = torch.stack([name_input_ids, name_attention_mask])  # Shape: (2, max_len)

        # Load and transform image
        image_path = os.path.join(self.images_dir, row.image_name)
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        image = self.transform(image)

        return image, name

    def __len__(self):
        return len(self.df)


### SiameseRuCLIP

In [43]:
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

class SiameseRuCLIP(nn.Module):
    def __init__(self,
                 device: str,
                 name_model_name: str,
                 description_model_name: str,
                 models_dir: str = None,
                 preload_ruclip: bool = False,
                 preload_model_name: str = None):
        """
        Initializes the SiameseRuCLIP model.
        Required parameters:
          - models_dir: directory containing saved checkpoints.
          - name_model_name: model name for text (name) branch.
          - description_model_name: model name for description branch.
        """
        super().__init__()
        device = torch.device(device)

        # Initialize RuCLIPtiny
        self.ruclip = RuCLIPtiny(name_model_name)
        if preload_ruclip:
            std = torch.load(
                os.path.join(models_dir, preload_model_name),
                weights_only=True,
                map_location=device
            )
            self.ruclip.load_state_dict(std)
            self.ruclip.eval()
        self.ruclip = self.ruclip.to(device)

        # Initialize the description transformer
        self.description_transformer = AutoModel.from_pretrained(description_model_name)
        self.description_transformer = self.description_transformer.to(device)

        # Determine dimensionality
        vision_dim = self.ruclip.visual.num_features
        name_dim = self.ruclip.final_ln.out_features
        desc_dim = self.description_transformer.config.hidden_size
        self.hidden_dim = vision_dim + name_dim + desc_dim

        # Define MLP head
        self.head = nn.Sequential(
            nn.Linear(self.hidden_dim, self.hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(self.hidden_dim // 2, self.hidden_dim // 4),
        ).to(device)

    def encode_image(self, image):
        return self.ruclip.encode_image(image)

    def encode_name(self, name):
        return self.ruclip.encode_text(name[:, 0, :], name[:, 1, :])

    def encode_description(self, desc):
        last_hidden_states = self.description_transformer(desc[:, 0, :], desc[:, 1, :]).last_hidden_state
        attention_mask = desc[:, 1, :]
        return average_pool(last_hidden_states, attention_mask)

    def forward(self, im1, name1, desc1, im2, name2, desc2):
        image_emb1 = self.encode_image(im1)
        image_emb2 = self.encode_image(im2)
        name_emb1 = self.encode_name(name1)
        name_emb2 = self.encode_name(name2)
        desc_emb1 = self.encode_description(desc1)
        desc_emb2 = self.encode_description(desc2)

        first_emb = torch.cat([image_emb1, name_emb1, desc_emb1], dim=1)
        second_emb = torch.cat([image_emb2, name_emb2, desc_emb2], dim=1)

        out1 = self.head(first_emb)
        out2 = self.head(second_emb)
        return out1, out2


# Evaluation loop

## Run evaluation

In [44]:
# Download models' weights & text/image datasets

from huggingface_hub import snapshot_download
from pathlib import Path

REPO_ID = "INDEEPA/clip-siamese"
LOCAL_DIR = Path("data/train_results")
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir='data',
    allow_patterns=[
        "train_results/siamese_contrastive*.pt",
        TABLE_DATASET_FILE,
        f"{IMG_DATASET_NAME}.zip"
    ],
)

!unzip -n -q data/{IMG_DATASET_NAME}.zip -d data/

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Load data

source_df = pd.read_csv(DATA_PATH + TABLE_DATASET_FILE)
images_dir = DATA_PATH + IMG_DATASET_NAME

def load_data(model_config):
    test_ds = RuCLIPDataset(
        images_dir,
        model_config['NAME_MODEL_NAME'], 
        source_df
    )
    test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)
    return test_dl

model_config = model_configs[MODEL_CFG_IDX]  # choose a particular config for debugging
test_dl = load_data(model_config)

# # Get one batch from the dataloader
# images, names = next(iter(test_dl))

# # Move tensors to device
# images = images.to(DEVICE)
# names = names.to(DEVICE)

In [46]:
# Load model

from pathlib import Path

def load_model(model_config):
    ckpt_name = model_config['MODEL_CKPT']
    model_ckpt_path = Path(DATA_PATH) / 'train_results' / ckpt_name
    std = torch.load(model_ckpt_path, map_location=DEVICE)

    # Initialize the model using the configuration.
    model = SiameseRuCLIP(
        name_model_name=model_config["NAME_MODEL_NAME"],
        description_model_name=model_config["DESCRIPTION_MODEL_NAME"],
        device=DEVICE,
    )

    model.load_state_dict(std)
    return model

model = load_model(model_config)

# img_emb = model.encode_image(images)
# name_emb = model.encode_name(names)

In [None]:
import numpy as np
import torch

def compute_embeddings(model, dataloader, device=DEVICE, limit_batches=None):
    """
    Compute and return the embeddings for all samples in the dataloader, separately for images and names.

    Args:
        model: The embedding model (must implement encode_image and encode_name).
        dataloader: DataLoader returning (image, name) pairs.
        device: The device to run the model on.

    Returns:
        image_embeddings (np.ndarray): Embeddings for images.
        name_embeddings (np.ndarray): Embeddings for names.
    """
    all_image_embeddings = []
    all_name_embeddings = []
    all_descr_embeddings = []

    model.eval()
    with torch.no_grad():
        for images, names in tqdm(dataloader):
            if limit_batches is not None and len(all_image_embeddings) == limit_batches:
                break

            images = images.to(device)
            names = names.to(device)

            image_emb = model.encode_image(images)
            name_emb = model.encode_name(names)
            descr_emb = model.encode_description(names)

            all_image_embeddings.append(image_emb.cpu().numpy())
            all_name_embeddings.append(name_emb.cpu().numpy())
            all_descr_embeddings.append(descr_emb.cpu().numpy())

    image_embeddings = np.concatenate(all_image_embeddings, axis=0)
    name_embeddings = np.concatenate(all_name_embeddings, axis=0)
    all_descr_embeddings = np.concatenate(all_descr_embeddings, axis=0)
    
    return image_embeddings, name_embeddings, all_descr_embeddings

img_embs_all, name_embs_all, descr_embs_all = compute_embeddings(
    model, test_dl,
    limit_batches=LIMIT_BATCHES
)

img_embs_all.shape, name_embs_all.shape, descr_embs_all.shape

  0%|          | 0/713 [00:00<?, ?it/s]

((48, 768), (48, 768), (48, 312))

In [48]:
#@title find_top_k_similar

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_top_k_similar(query_embeddings, embedding_matrix, k=None, metric='euclidean-inverse', exclude_indices=None):
    """
    Find top-k similar items using Euclidean distance with inverse similarity.
    
    Args:
        query_embeddings (np.ndarray): Array of query embeddings, shape (batch, D).
        embedding_matrix (np.ndarray): Array of all embeddings, shape (N, D).
        k (int or None): Number of top matches to return, or None to return all valid sorted candidates.
        metric (str): Must be 'euclidean-inverse'. Other metrics raise NotImplementedError.
        exclude_indices (list, np.ndarray, or boolean mask, optional): Indices to exclude from search.

    Returns:
        top_k (np.ndarray): Indices of similar embeddings for each query, shape (batch, M).
        scores (np.ndarray): Corresponding similarity scores (0-1 range, higher=more similar), shape (batch, M).
    """
    if metric != 'euclidean-inverse':
        raise NotImplementedError(f"Metric '{metric}' is not implemented. Only 'euclidean-inverse' is supported.")
    
    # Convert exclude_indices to a boolean mask if needed
    mask = None
    if exclude_indices is not None:
        if isinstance(exclude_indices, (list, np.ndarray)):
            exclude_indices = np.array(exclude_indices)
            if exclude_indices.dtype != bool:
                mask = np.zeros(embedding_matrix.shape[0], dtype=bool)
                mask[exclude_indices] = True
            else:
                mask = exclude_indices
        else:
            raise ValueError("exclude_indices must be a list, np.ndarray, or boolean mask.")
    
    # Compute Euclidean distances: shape (batch, N)
    distances = np.linalg.norm(query_embeddings[:, None, :] - embedding_matrix[None, :, :], axis=2)
    
    # Set distances for excluded indices to +infinity so they are not selected
    if mask is not None:
        distances[:, mask] = np.inf
    
    # Convert distances to inverse similarities: similarity = 1 / (1 + distance)
    # Handle inf distances by setting their similarity to 0
    similarities = np.where(distances == np.inf, 0.0, 1.0 / (1.0 + distances))
    
    # Sort indices in descending order of similarity (higher similarity = better match)
    sorted_idx = np.argsort(-similarities, axis=1)
    
    if k is None:
        # Filter out zero similarities (from excluded indices) for each query
        valid_indices = []
        valid_scores = []
        for i in range(similarities.shape[0]):
            # Get indices where similarity is not 0 (i.e., not excluded)
            valid_mask = similarities[i] > 0
            valid_idx = np.where(valid_mask)[0]
            # Sort valid indices by similarity (descending)
            valid_idx = valid_idx[np.argsort(-similarities[i, valid_idx])]
            valid_indices.append(valid_idx)
            valid_scores.append(similarities[i, valid_idx])
        
        # Pad arrays to same length if needed
        if valid_indices:  # Check if we have any valid indices
            max_valid = max(len(x) for x in valid_indices)
            top_k = np.array([np.pad(x, (0, max_valid - len(x)), 'constant', constant_values=-1) for x in valid_indices])
            scores = np.array([np.pad(x, (0, max_valid - len(x)), 'constant', constant_values=0.0) for x in valid_scores])
        else:
            # Handle edge case where no valid indices exist
            top_k = np.array([]).reshape(similarities.shape[0], 0)
            scores = np.array([]).reshape(similarities.shape[0], 0)
    else:
        top_k = sorted_idx[:, :k]
        scores = np.take_along_axis(similarities, top_k, axis=1)
    
    return top_k, scores

In [None]:
# Load queries

QUERY_SELLER = 'ИНТЕРТРЕЙД'
query_idx_all = source_df[source_df['seller'] == QUERY_SELLER].index.to_numpy()

# Limit queries the actual number of embeddings
max_emb_cnt = img_embs_all.shape[0]
query_idx_all = query_idx_all[query_idx_all < max_emb_cnt]

# For a batch of query embeddings with shape (batch, 768)
query_img_embs = img_embs_all[query_idx_all]  # e.g., shape (3, 768)
query_name_embs = name_embs_all[query_idx_all]
query_descr_embs = descr_embs_all[query_idx_all]

In [50]:
# Find top-k matches for descr
top_k_descr, scores_descr = find_top_k_similar(
    query_descr_embs, descr_embs_all, k=30, metric='euclidean-inverse',
    exclude_indices=query_idx_all
)

print("Top-k image indices per query (shape):")
print(top_k_descr.shape)

# print("Corresponding similarity scores:")
# print(scores_descr)

Top-k image indices per query (shape):
(6, 30)


In [51]:
# Get all scores for images
_, scores_img = find_top_k_similar(
    query_img_embs, img_embs_all, k=None, metric='cosine',
    exclude_indices=query_idx_all
)

print("Image similarity scores (shape):")
print(scores_img.shape)

Image similarity scores (shape):
(6, 48)


In [52]:
# Get all scores for names
_, scores_name = find_top_k_similar(
    query_name_embs, name_embs_all, k=None, metric='cosine',
    exclude_indices=query_idx_all
)

print("Name similarity scores (shape):")
print(scores_name.shape)

Name similarity scores (shape):
(6, 48)


In [53]:
# Get combined scores of image + name

# Define weights (they should sum to 1)
weight_name = 0.7  # weight for name scores
weight_img = 1 - weight_name   # weight for image scores

# Compute the linear combination
combined_scores = weight_name * scores_name + weight_img * scores_img

print("Combined scores shape:", combined_scores.shape)

Combined scores shape: (6, 48)


In [54]:
# Filter combined scores by top-k from description

filtered_combined_scores = np.take_along_axis(combined_scores, top_k_descr, axis=1)
filtered_combined_scores.shape

(6, 30)

In [55]:
# Take top-k within the filtered subset
FINAL_TOP_K = 5

# Compute the relative indices of top-k candidates from the filtered combined scores.
relative_top_k = np.argsort(-filtered_combined_scores, axis=1)[:, :FINAL_TOP_K]
relative_top_k.shape

(6, 5)

In [56]:
# Map back relative final top-k to original final top-k
final_scores = np.take_along_axis(filtered_combined_scores, relative_top_k, axis=1)
final_top_k = np.take_along_axis(top_k_descr, relative_top_k, axis=1)
print(final_top_k.shape, final_scores.shape)
print(final_top_k)
print(final_scores)

(6, 5) (6, 5)
[[ 0  2  3  4  5]
 [ 7  9 10 11 12]
 [ 6  7  8  9 10]
 [ 7  9 10 11 12]
 [ 9 10 11 12 13]
 [ 3  5  6  7  9]]
[[0.98533237 0.9757167  0.9734     0.9684659  0.96020365]
 [0.97292763 0.97076315 0.9707185  0.9705864  0.9704597 ]
 [0.75158215 0.7397786  0.652728   0.6320817  0.62593603]
 [0.98445165 0.9772097  0.97649187 0.9720524  0.9679757 ]
 [0.97400707 0.97213286 0.97202116 0.96628004 0.9623976 ]
 [0.9813541  0.97896326 0.9784416  0.9777859  0.9756112 ]]


In [None]:
results = []
for query_idx, top_k_indices_per_query, scores_per_query in zip(
    query_idx_all, final_top_k, final_scores
):
    query_sku = source_df.loc[query_idx]['sku']
    top_k_skus = source_df.loc[top_k_indices_per_query]['sku']

    top_k_sku_results = {
        f'Top-{rank_idx + 1} SKU': sku
        for rank_idx, sku in enumerate(top_k_skus)
    }

    top_k_score_results = {
        f'Top-{rank_idx + 1} Score': score
        for rank_idx, score in enumerate(scores_per_query.tolist())
    }
    results.append({
        'SKU Query': query_sku,
        **top_k_sku_results,
        **top_k_score_results
    })

results = pd.DataFrame(results)
results.head()

Unnamed: 0,SKU Query,Top-1 SKU,Top-2 SKU,Top-3 SKU,Top-4 SKU,Top-5 SKU,Top-1 Score,Top-2 Score,Top-3 Score,Top-4 Score,Top-5 Score
0,491279127,936454663,844750071,1737112880,216810859,861723214,0.985332,0.975717,0.9734,0.968466,0.960204
1,491270369,438166622,861593242,490685952,861605997,1713026634,0.972928,0.970763,0.970719,0.970586,0.97046
2,922230517,268682139,438166622,1712946829,861593242,490685952,0.751582,0.739779,0.652728,0.632082,0.625936
3,491273791,438166622,861593242,490685952,861605997,1713026634,0.984452,0.97721,0.976492,0.972052,0.967976
4,508611672,861593242,490685952,861605997,1713026634,1422373846,0.974007,0.972133,0.972021,0.96628,0.962398


In [79]:
used_entries = img_embs_all.shape[0]

file_name = (
    f'Карты_мира_Озон_'
    f'всего={used_entries}_'
    f'top-{FINAL_TOP_K}_'
    f'Seller={QUERY_SELLER}_'
    f"модель={model_config['MODEL_CKPT']}_"
    f"image-weight={round(weight_img, 2)}"
    '.csv'
)

file_path = Path(DATA_PATH) / file_name

print("Output to:")
print(file_path)

results.to_csv(file_path)

Output to:
data/Карты_мира_Озон_всего=48_top-5_Seller=ИНТЕРТРЕЙД_модель=siamese_contrastive.pt_image-weight=0.3.csv


In [None]:
# results.head()

Unnamed: 0,SKU Query,Top-1 SKU,Top-2 SKU,Top-3 SKU,Top-4 SKU,Top-5 SKU,Top-1 Score,Top-2 Score,Top-3 Score,Top-4 Score,Top-5 Score
0,491279127,936454663,844750071,1737112880,216810859,861723214,0.985332,0.975717,0.9734,0.968466,0.960204
1,491270369,438166622,861593242,490685952,861605997,1713026634,0.972928,0.970763,0.970719,0.970586,0.97046
2,922230517,268682139,438166622,1712946829,861593242,490685952,0.751582,0.739779,0.652728,0.632082,0.625936
3,491273791,438166622,861593242,490685952,861605997,1713026634,0.984452,0.97721,0.976492,0.972052,0.967976
4,508611672,861593242,490685952,861605997,1713026634,1422373846,0.974007,0.972133,0.972021,0.96628,0.962398
5,922231521,1737112880,861723214,268682139,438166622,861593242,0.981354,0.978963,0.978442,0.977786,0.975611


# Merge to original df

In [64]:
source_df[['sku', 'url']].head()

Unnamed: 0,SKU,URL
0,936454663,https://www.ozon.ru/context/detail/id/936454663/
1,491279127,https://www.ozon.ru/context/detail/id/491279127/
2,844750071,https://www.ozon.ru/context/detail/id/844750071/
3,1737112880,https://www.ozon.ru/context/detail/id/1737112880/
4,216810859,https://www.ozon.ru/context/detail/id/216810859/


In [None]:
# # Read results manually

# # results_file_path = 'data/Карты_мира_Озон_всего=5703_top-5_Seller=ИНТЕРТРЕЙД_модель=siamese_contrastive_7k.pt_name-weight=0.7.csv'
# results_file_path = 'data/Карты_мира_Озон_всего=5703_top-5_Seller=ИНТЕРТРЕЙД_модель=siamese_contrastive.pt_name-weight=0.7.csv'

# results = pd.read_csv(results_file_path, index_col=0)

In [110]:
# Assume 'results' has columns: "SKU Query", "Top-1", "Top-2", ..., "Top-k"
k = 5

final_df = (
    source_df[['sku', 'url']]
    .rename(columns={'sku': 'SKU Query', 'url': 'Query URL'})
    .merge(results, on='SKU Query', how='right')
)

for i in range(1, k+1):
    sku_col = f"Top-{i} SKU"
    url_col = f"Top-{i} URL"
    final_df = final_df.merge(
        source_df[['sku', 'url']].rename(columns={'sku': sku_col, 'url': url_col}),
        on=sku_col,
        how='left'
    )

cols_order = [
    "SKU Query",
    "Top-1 SKU", "Top-2 SKU", "Top-3 SKU", "Top-4 SKU", "Top-5 SKU",
    "Top-1 Score", "Top-2 Score", "Top-3 Score", "Top-4 Score", "Top-5 Score",
    "Query URL",
    "Top-1 URL", "Top-2 URL", "Top-3 URL", "Top-4 URL", "Top-5 URL"
]
final_df = final_df[cols_order]

final_df.head()

Unnamed: 0,SKU Query,Top-1 SKU,Top-2 SKU,Top-3 SKU,Top-4 SKU,Top-5 SKU,Top-1 Score,Top-2 Score,Top-3 Score,Top-4 Score,Top-5 Score,Query URL,Top-1 URL,Top-2 URL,Top-3 URL,Top-4 URL,Top-5 URL
0,491279127,844750071,1737112880,216810859,861723214,268682139,0.976887,0.974613,0.970675,0.969757,0.966751,https://www.ozon.ru/context/detail/id/491279127/,https://www.ozon.ru/context/detail/id/844750071/,https://www.ozon.ru/context/detail/id/1737112880/,https://www.ozon.ru/context/detail/id/216810859/,https://www.ozon.ru/context/detail/id/861723214/,https://www.ozon.ru/context/detail/id/268682139/
1,491270369,861593242,490685952,861605997,1713026634,1422373846,0.971047,0.971002,0.970893,0.970808,0.970334,https://www.ozon.ru/context/detail/id/491270369/,https://www.ozon.ru/context/detail/id/861593242/,https://www.ozon.ru/context/detail/id/490685952/,https://www.ozon.ru/context/detail/id/861605997/,https://www.ozon.ru/context/detail/id/1713026634/,https://www.ozon.ru/context/detail/id/1422373846/
2,922230517,1422373846,499509545,950549378,844770867,536896417,0.932672,0.925902,0.925869,0.924192,0.92249,https://www.ozon.ru/context/detail/id/922230517/,https://www.ozon.ru/context/detail/id/1422373846/,https://www.ozon.ru/context/detail/id/499509545/,https://www.ozon.ru/context/detail/id/950549378/,https://www.ozon.ru/context/detail/id/844770867/,https://www.ozon.ru/context/detail/id/536896417/
3,491273791,499509545,950549378,844770867,536896417,1422369500,0.971599,0.971397,0.970715,0.96995,0.966699,https://www.ozon.ru/context/detail/id/491273791/,https://www.ozon.ru/context/detail/id/499509545/,https://www.ozon.ru/context/detail/id/950549378/,https://www.ozon.ru/context/detail/id/844770867/,https://www.ozon.ru/context/detail/id/536896417/,https://www.ozon.ru/context/detail/id/1422369500/
4,508611672,861605997,950549378,844770867,951285479,1422369500,0.974237,0.971716,0.969571,0.96552,0.964504,https://www.ozon.ru/context/detail/id/508611672/,https://www.ozon.ru/context/detail/id/861605997/,https://www.ozon.ru/context/detail/id/950549378/,https://www.ozon.ru/context/detail/id/844770867/,https://www.ozon.ru/context/detail/id/951285479/,https://www.ozon.ru/context/detail/id/1422369500/


In [111]:
# import re
# url_cols = [col for col in final_df.columns if re.search('url', col)]
# final_df[url_cols]

In [112]:
final_df_file_name = Path(results_file_path).stem + '_URL-included.csv'
print(final_df_file_name)

final_df.to_csv(Path(DATA_PATH) / final_df_file_name)

Карты_мира_Озон_всего=5703_top-5_Seller=ИНТЕРТРЕЙД_модель=siamese_contrastive.pt_name-weight=0.7_URL-included.csv


In [114]:
final_df.loc[18]

SKU Query                                              507113963
Top-1 SKU                                             1706808786
Top-2 SKU                                             1550235173
Top-3 SKU                                             1709679383
Top-4 SKU                                              553013273
Top-5 SKU                                             1233218843
Top-1 Score                                             0.726337
Top-2 Score                                             0.722856
Top-3 Score                                             0.722841
Top-4 Score                                             0.721007
Top-5 Score                                             0.720955
Query URL       https://www.ozon.ru/context/detail/id/507113963/
Top-1 URL      https://www.ozon.ru/context/detail/id/1706808786/
Top-2 URL      https://www.ozon.ru/context/detail/id/1550235173/
Top-3 URL      https://www.ozon.ru/context/detail/id/1709679383/
Top-4 URL       https://w