# Parameters

In [38]:
# Upper bound on the number of batches that get embedded.
# The commented-out alternative (None) removes the bound.
# LIMIT_BATCHES = None
LIMIT_BATCHES = 6 # cpu smoke run

# Position in `model_configs` of the configuration to evaluate.
MODEL_CFG_IDX = 1

In [None]:
# Root folder (relative to the notebook) holding tables, images and checkpoints.
DATA_PATH = 'data/'

# Alternative dataset, currently disabled:
# TABLE_DATASET_FILE = 'tables_OZ_geo_5700/OZ_geo_5700_no_descriptions.csv'
# IMG_DATASET_NAME = 'images_OZ_geo_5700'

# CSV table with the product rows, relative to DATA_PATH.
TABLE_DATASET_FILE = 'tables_OZ_geo_5500/OZ_geo_5500.csv'
# Image folder name under DATA_PATH; the downloaded archive is "<name>.zip".
IMG_DATASET_NAME = 'images_OZ_geo_5500'

In [82]:
import torch

# Run on the GPU when one is available; the batch size follows the chosen
# device (large batches on CUDA, a small one for CPU runs).
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 768 if DEVICE == 'cuda' else 8

In [83]:
MODEL_TYPE = "siamese-contrastive"

# Candidate evaluation setups; MODEL_CFG_IDX selects one of them.
# Each entry provides:
#   MODEL_CKPT             - checkpoint file name
#   NAME_MODEL_NAME        - Hugging Face id of the model/tokenizer for product names
#   DESCRIPTION_MODEL_NAME - Hugging Face id of the model for descriptions
#   CONTRASTIVE_THRESHOLD  - threshold value kept with the setup
model_configs = [
    {
        'MODEL_CKPT': 'siamese_contrastive.pt',
        # alternative name model: 'DeepPavlov/distilrubert-tiny-cased-conversational-v1'
        'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
        # alternative description model: 'sergeyzh/rubert-tiny-turbo'
        'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
        'CONTRASTIVE_THRESHOLD': 0.3,
    },
    {
        'MODEL_CKPT': 'siamese_contrastive_7k.pt',
        # alternative name model: 'DeepPavlov/distilrubert-tiny-cased-conversational-v1'
        'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
        'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
        'CONTRASTIVE_THRESHOLD': 0.3,
    },
    # Disabled setup:
    # {
    #     'MODEL_CKPT': 'siamese_contrastive_1gpu.pt',
    #     # alternative name model: 'DeepPavlov/distilrubert-tiny-cased-conversational-v1'
    #     'NAME_MODEL_NAME': 'cointegrated/rubert-tiny',
    #     # alternative description model: 'sergeyzh/rubert-tiny-turbo'
    #     'DESCRIPTION_MODEL_NAME': 'cointegrated/rubert-tiny',
    #     'CONTRASTIVE_THRESHOLD': 0.3,
    # },
]

# Log into services

In [84]:
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [86]:
# Use tokens from .env

import os
from dotenv import load_dotenv

import huggingface_hub
import wandb

# Pull variables from a .env file into the process environment.
load_dotenv()

# Hugging Face access token; os.getenv yields None when the variable is unset.
# NOTE(review): with token=None the login call presumably falls back to an
# interactive prompt - confirm this is acceptable for unattended runs.
HF_TOKEN = os.getenv("HF_TOKEN")
huggingface_hub.login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Imports

In [87]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"


from timm import create_model
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
from torch import optim, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
# from torchinfo import summary
import transformers
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer,\
        get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer

import cv2

from PIL import Image
from tqdm.auto import tqdm

import json
from itertools import product

# import datasets
# from datasets import Dataset, concatenate_datasets
import argparse
import requests

from io import BytesIO
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import more_itertools

# Source code

### RuCLIPtiny

In [88]:
class RuCLIPtiny(nn.Module):
    """
    Small CLIP-style dual encoder.

    An image tower (timm ``convnext_tiny``) and a text tower (a Hugging Face
    model whose first-token state is linearly projected to 768 dimensions)
    are compared through scaled cosine similarity.
    """

    def __init__(self, name_model_name: str):
        """
        Build both towers.

        Args:
            name_model_name: identifier handed to ``AutoModel.from_pretrained``
                for the text tower.
        """
        super().__init__()

        # Image tower, created without pretrained weights (pretrained=False);
        # num_classes=0 is used so features are returned (768-dim per the
        # original note).
        self.visual = create_model(
            'convnext_tiny',
            pretrained=False,
            num_classes=0,
            in_chans=3,
        )

        # Text tower; the projection input width comes from the loaded config.
        self.transformer = AutoModel.from_pretrained(name_model_name)
        text_width = self.transformer.config.hidden_size
        self.final_ln = nn.Linear(text_width, 768)

        # Learnable log-scale for the logits, initialised to log(1 / 0.07).
        initial_log_scale = torch.log(torch.tensor(1/0.07))
        self.logit_scale = nn.Parameter(torch.ones([]) * initial_log_scale)

    @property
    def dtype(self):
        """dtype of the first stem layer's weight; images are cast to it."""
        first_stem_layer = self.visual.stem[0]
        return first_stem_layer.weight.dtype

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        """Cast ``image`` to the tower's dtype and return the image features."""
        image = image.type(self.dtype)
        return self.visual(image)

    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return the projected first-token (CLS) state of the text tower."""
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_state = outputs.last_hidden_state[:, 0, :]
        return self.final_ln(cls_state)

    def forward(self, image: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        """
        Return ``(logits_per_image, logits_per_text)``: the scaled cosine
        similarity of every image with every text, and its transpose.
        """
        image_features = self.encode_image(image)
        text_features = self.encode_text(input_ids, attention_mask)

        # L2-normalise both sides so the matrix product is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        scaled_image_features = self.logit_scale.exp() * image_features
        logits_per_image = scaled_image_features @ text_features.t()
        return logits_per_image, logits_per_image.t()


In [89]:
def get_transform():
    """
    Build the image preprocessing pipeline used by the datasets.

    Steps, in order: ``Resize(224)``, ``CenterCrop(224)``, conversion to RGB
    mode, ``ToTensor`` and normalisation with fixed per-channel mean / std.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    to_rgb = lambda image: image.convert("RGB")

    steps = [
        transforms.Resize(224),
        transforms.CenterCrop(224),
        to_rgb,
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)

class Tokenizers:
    """
    Pair of Hugging Face tokenizers - one for product names, one for
    descriptions - that encode text batches into fixed-length tensors.
    """

    def __init__(self, name_model_name: str, description_model_name: str):
        """Load both tokenizers with ``AutoTokenizer.from_pretrained``."""
        self.name_tokenizer = AutoTokenizer.from_pretrained(name_model_name)
        self.desc_tokenizer = AutoTokenizer.from_pretrained(description_model_name)

    @staticmethod
    def _encode(tokenizer, texts, max_len):
        """
        Encode ``texts`` with ``tokenizer``, padded/truncated to ``max_len``.

        The tokenizer is invoked through ``__call__``; ``batch_encode_plus``
        is deprecated in favour of it, and both methods now share one path.

        Returns:
            Tensor stacking ``input_ids`` and ``attention_mask`` along a new
            leading axis, i.e. (2, len(texts), max_len) for a list of strings.
        """
        tokenized = tokenizer(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])

    def tokenize_name(self, texts, max_len=77):
        """Encode a list of product names; see ``_encode`` for the layout."""
        return self._encode(self.name_tokenizer, texts, max_len)

    def tokenize_description(self, texts, max_len=77):
        """Encode a list of descriptions; see ``_encode`` for the layout."""
        return self._encode(self.desc_tokenizer, texts, max_len)



In [90]:
from transformers import AutoTokenizer
import torch

class NameTokenizer:
    """Hugging Face tokenizer wrapper that encodes product names to tensors."""

    def __init__(self, model_name: str):
        """Load the tokenizer with ``AutoTokenizer.from_pretrained``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(self, texts, max_len=77):
        """
        Encode ``texts``, padded/truncated to ``max_len`` tokens.

        The tokenizer is invoked through ``__call__`` (``batch_encode_plus`` is
        deprecated in favour of it), as ``DescriptionTokenizer`` already does.

        Returns:
            Tensor stacking ``input_ids`` and ``attention_mask`` along a new
            leading axis, i.e. (2, len(texts), max_len) for a list of strings.
        """
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])


class DescriptionTokenizer:
    """Hugging Face tokenizer wrapper that encodes descriptions to tensors."""

    def __init__(self, model_name: str):
        """Load the tokenizer with ``AutoTokenizer.from_pretrained``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(self, texts, max_len=77):
        """
        Encode ``texts``, padded/truncated to ``max_len`` tokens.

        Returns:
            Tensor stacking ``input_ids`` and ``attention_mask`` along a new
            leading axis.
        """
        encode_options = dict(
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        batch = self.tokenizer(texts, **encode_options)
        ids_and_mask = [batch["input_ids"], batch["attention_mask"]]
        return torch.stack(ids_and_mask)


In [91]:
class SiameseRuCLIPDataset(torch.utils.data.Dataset):
    """
    Dataset of product pairs. Every item holds, for both products of a pair,
    the transformed image, the tokenized name and the tokenized description,
    followed by the pair label.
    """

    def __init__(self, images_dir: str, name_model_name: str, description_model_name: str, df=None, labels=None, df_path=None):
        """
        Dataset requires the concrete models' names for tokenization.

        Args:
            images_dir: existing folder containing the image files.
            name_model_name: tokenizer id for product names.
            description_model_name: tokenizer id for descriptions.
            df: table of pairs; used when ``df_path`` is None.
            labels: indexable container with one label per row.
            df_path: optional CSV path; when given it is read instead of ``df``.
        """
        # Use the argument in the message: self.images_dir is not assigned yet,
        # so referring to it here would raise AttributeError instead.
        assert os.path.isdir(images_dir), f"Image dir does not exist: '{images_dir}'"

        self.df = pd.read_csv(df_path) if df_path is not None else df
        self.labels = labels
        self.images_dir = images_dir
        self.tokenizers = Tokenizers(name_model_name, description_model_name)
        self.transform = get_transform()
        self.max_len = 77

    def _load_image(self, file_name):
        """Read one image from ``images_dir`` and return it transformed."""
        path = os.path.join(self.images_dir, file_name)
        pixels = cv2.imread(path)
        if pixels is None:
            # cv2.imread reports a missing/unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image: '{path}'")
        pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)
        return self.transform(Image.fromarray(pixels))

    def __getitem__(self, idx):
        """
        Return ``(im_first, name_first, desc_first, im_second, name_second,
        desc_second, label)`` for the pair at row position ``idx``.
        """
        row = self.df.iloc[idx]
        # Tokenize names: index 0 / 1 on the middle axis selects first / second product.
        name_tokens = self.tokenizers.tokenize_name([str(row.name_first), str(row.name_second)], max_len=self.max_len)
        name_first = name_tokens[:, 0, :]  # [input_ids, attention_mask]
        name_second = name_tokens[:, 1, :]
        # Tokenize descriptions with the same length limit as the names.
        desc_tokens = self.tokenizers.tokenize_description(
            [str(row.description_first), str(row.description_second)], max_len=self.max_len
        )
        desc_first = desc_tokens[:, 0, :]
        desc_second = desc_tokens[:, 1, :]
        # Process images. The second image comes from the second product's file
        # (assumes an `image_name_second` column, parallel to the other
        # *_first / *_second columns).
        im_first = self._load_image(row.image_name_first)
        im_second = self._load_image(row.image_name_second)
        label = self.labels[idx]
        return im_first, name_first, desc_first, im_second, name_second, desc_second, label

    def __len__(self):
        """Number of pairs (rows of the table)."""
        return len(self.df)

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from PIL import Image
import cv2

class RuCLIPDataset(torch.utils.data.Dataset):
    """Dataset yielding one ``(image, name)`` pair per table row."""

    def __init__(self, images_dir: str, name_model_name: str, df=None, df_path=None):
        """
        Args:
            images_dir: existing folder containing the image files.
            name_model_name: tokenizer id for product names.
            df: table with ``name`` and ``image_name`` columns; used when
                ``df_path`` is None.
            df_path: optional CSV path; when given it is read instead of ``df``.
        """
        assert os.path.isdir(images_dir), f"Image dir does not exist: '{images_dir}'"

        if df_path is not None:
            df = pd.read_csv(df_path)
        self.df = df
        self.images_dir = images_dir
        self.name_tokenizer = NameTokenizer(model_name=name_model_name)
        self.transform = get_transform()
        self.max_len = 77

    def __getitem__(self, idx):
        """
        Return ``(image, name)`` for row position ``idx``; ``name`` stacks
        input ids and attention mask into a (2, max_len) tensor.
        """
        record = self.df.iloc[idx]

        # Name: a single-text batch, so position 0 on the middle axis is taken.
        # ``record['name']`` is used because ``record.name`` is the row label.
        tokens = self.name_tokenizer.tokenize([str(record['name'])], max_len=self.max_len)
        input_ids, attention_mask = tokens[0, 0, :], tokens[1, 0, :]
        name = torch.stack([input_ids, attention_mask])

        # Image: OpenCV reads BGR, so convert to RGB before wrapping in PIL.
        image_bgr = cv2.imread(os.path.join(self.images_dir, record.image_name))
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        image = self.transform(Image.fromarray(image_rgb))

        return image, name

    def __len__(self):
        """Number of rows in the table."""
        return len(self.df)


### SiameseRuCLIP

In [93]:
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """
    Average the token states over the positions kept by ``attention_mask``.

    Positions whose mask value is zero are zeroed before summing over dim 1;
    the sum is divided by the per-row mask total.
    """
    ignored_positions = ~attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(ignored_positions, 0.0).sum(dim=1)
    kept_counts = attention_mask.sum(dim=1)[..., None]
    return summed / kept_counts

class SiameseRuCLIP(nn.Module):
    """
    Siamese embedding model: image, name and description features of a product
    are concatenated and passed through an MLP head; ``forward`` embeds both
    products of a pair with the same weights.
    """

    def __init__(self,
                 device: str,
                 name_model_name: str,
                 description_model_name: str,
                 models_dir: str = None,
                 preload_ruclip: bool = False,
                 preload_model_name: str = None):
        """
        Args:
            device: device specifier accepted by ``torch.device``.
            name_model_name: model name for the text (name) branch.
            description_model_name: model name for the description branch.
            models_dir: directory containing saved checkpoints; only read
                when ``preload_ruclip`` is true.
            preload_ruclip: load RuCLIPtiny weights from
                ``models_dir / preload_model_name`` and switch it to eval mode.
            preload_model_name: checkpoint file name used for the preload.
        """
        super().__init__()
        target_device = torch.device(device)

        # Image + name encoder, optionally restored from a checkpoint.
        ruclip = RuCLIPtiny(name_model_name)
        if preload_ruclip:
            checkpoint_path = os.path.join(models_dir, preload_model_name)
            ruclip_state = torch.load(
                checkpoint_path,
                weights_only=True,
                map_location=target_device,
            )
            ruclip.load_state_dict(ruclip_state)
            ruclip.eval()
        self.ruclip = ruclip.to(target_device)

        # Description encoder.
        description_encoder = AutoModel.from_pretrained(description_model_name)
        self.description_transformer = description_encoder.to(target_device)

        # Width of the concatenated (image, name, description) feature vector.
        self.hidden_dim = sum((
            self.ruclip.visual.num_features,
            self.ruclip.final_ln.out_features,
            self.description_transformer.config.hidden_size,
        ))

        # MLP head: hidden_dim -> hidden_dim // 2 -> hidden_dim // 4.
        half_dim = self.hidden_dim // 2
        quarter_dim = self.hidden_dim // 4
        head_layers = [
            nn.Linear(self.hidden_dim, half_dim),
            nn.ReLU(),
            nn.Linear(half_dim, quarter_dim),
        ]
        self.head = nn.Sequential(*head_layers).to(target_device)

    def encode_image(self, image):
        """Image features from the RuCLIPtiny image tower."""
        return self.ruclip.encode_image(image)

    def encode_name(self, name):
        """Name features; ``name[:, 0]`` holds input ids, ``name[:, 1]`` the mask."""
        input_ids, attention_mask = name[:, 0, :], name[:, 1, :]
        return self.ruclip.encode_text(input_ids, attention_mask)

    def encode_description(self, desc):
        """Mask-averaged token states of the description encoder."""
        input_ids = desc[:, 0, :]
        attention_mask = desc[:, 1, :]
        token_states = self.description_transformer(input_ids, attention_mask).last_hidden_state
        return average_pool(token_states, attention_mask)

    def get_final_embedding(self, im, name, desc):
        """Concatenate the three feature vectors and apply the MLP head."""
        parts = [
            self.encode_image(im),
            self.encode_name(name),
            self.encode_description(desc),
        ]
        return self.head(torch.cat(parts, dim=1))

    def forward(self, im1, name1, desc1, im2, name2, desc2):
        """Return the final embeddings of the first and the second product."""
        first = self.get_final_embedding(im1, name1, desc1)
        second = self.get_final_embedding(im2, name2, desc2)
        return first, second

# Evaluation loop

## Run evaluation

In [95]:
# Download models' weights & text/image datasets

from huggingface_hub import snapshot_download
from pathlib import Path

REPO_ID = "INDEEPA/clip-siamese"
# Checkpoint folder, created up-front.
# NOTE(review): LOCAL_DIR itself is not passed to snapshot_download below,
# which targets the literal 'data' folder.
LOCAL_DIR = Path("data/train_results")
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

# Fetch only what this notebook reads: checkpoints matching the pattern,
# the table file and the zipped image folder.
snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir='data',
    allow_patterns=[
        "train_results/siamese_contrastive*.pt",
        TABLE_DATASET_FILE,
        f"{IMG_DATASET_NAME}.zip"
    ],
)

# Unpack the image archive into data/ (-o: overwrite, -q: quiet).
# NOTE(review): the recorded output of this cell shows unzip rejecting the
# archive ("End-of-central-directory signature not found"); presumably later
# cells ran against an images folder that already existed - verify the zip.
!unzip -o -q data/{IMG_DATASET_NAME}.zip -d data/

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[data/images_OZ_geo_5500.zip]
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of data/images_OZ_geo_5500.zip or
        data/images_OZ_geo_5500.zip.zip, and cannot find data/images_OZ_geo_5500.zip.ZIP, period.


In [99]:
# Load data

# Join paths with os.path.join so they stay valid whether or not DATA_PATH
# ends with a path separator (plain string concatenation requires one).
source_df = pd.read_csv(os.path.join(DATA_PATH, TABLE_DATASET_FILE))
images_dir = os.path.join(DATA_PATH, IMG_DATASET_NAME)

def load_data(model_config):
    """
    Build the evaluation DataLoader over every row of ``source_df``.

    Args:
        model_config: dict with a 'NAME_MODEL_NAME' entry naming the tokenizer.

    Returns:
        DataLoader yielding ``(images, names)`` batches of size BATCH_SIZE.
        No shuffling is requested, so batches follow the row order that later
        cells rely on when indexing the embedding matrix.
    """
    test_ds = RuCLIPDataset(
        images_dir,
        model_config['NAME_MODEL_NAME'],
        df=source_df,
    )
    return DataLoader(test_ds, batch_size=BATCH_SIZE)

model_config = model_configs[MODEL_CFG_IDX]  # choose a particular config for debugging
test_dl = load_data(model_config)

# Smoke check: pull a single batch and move it to the target device.
images, names = next(iter(test_dl))
images = images.to(DEVICE)
names = names.to(DEVICE)

In [100]:
# Load model

from pathlib import Path

def load_model(model_config):
    """
    Build a SiameseRuCLIP model and restore its weights from a checkpoint.

    Args:
        model_config: dict with 'MODEL_CKPT' (file name under
            DATA_PATH/train_results), 'NAME_MODEL_NAME' and
            'DESCRIPTION_MODEL_NAME'.

    Returns:
        The model on DEVICE with the checkpoint's state dict loaded.
    """
    ckpt_name = model_config['MODEL_CKPT']
    model_ckpt_path = Path(DATA_PATH) / 'train_results' / ckpt_name
    # The checkpoint is downloaded from a remote repo: weights_only=True limits
    # unpickling to tensors and plain containers instead of arbitrary objects,
    # matching the checkpoint load inside SiameseRuCLIP.
    std = torch.load(model_ckpt_path, map_location=DEVICE, weights_only=True)

    # Initialize the model using the configuration.
    model = SiameseRuCLIP(
        name_model_name=model_config["NAME_MODEL_NAME"],
        description_model_name=model_config["DESCRIPTION_MODEL_NAME"],
        device=DEVICE,
    )

    model.load_state_dict(std)
    return model

model = load_model(model_config)
# final_emb = model.get_final_embedding(images, names, names)

In [None]:
import numpy as np
import torch

def compute_embeddings(model, dataloader, device=DEVICE, limit_batches=None):
    """
    Compute the final (head) embedding for the samples yielded by ``dataloader``.

    Args:
        model: model implementing ``eval()`` and
            ``get_final_embedding(images, names, descriptions)``.
        dataloader: iterable yielding ``(images, names)`` batches.
        device: device the batches are moved to before the forward pass.
        limit_batches: maximum number of batches to process; None processes
            every batch.

    Returns:
        np.ndarray: embeddings of all processed batches, concatenated along
        axis 0 in iteration order.

    Raises:
        ValueError: if ``limit_batches`` is negative or no batch was processed.
    """
    from itertools import islice  # local import keeps the cell self-contained

    if limit_batches is not None and limit_batches < 0:
        raise ValueError(f"limit_batches must be non-negative, got {limit_batches}")

    # Progress-bar length: the loader length when it has one, capped by the limit.
    try:
        total = len(dataloader)
    except TypeError:
        total = None

    batches = dataloader
    if limit_batches is not None:
        # islice stops before fetching the batch after the limit, so no batch
        # is loaded only to be thrown away.
        batches = islice(dataloader, limit_batches)
        total = limit_batches if total is None else min(total, limit_batches)

    all_final_embeddings = []

    model.eval()
    with torch.no_grad():
        for images, names in tqdm(batches, total=total):
            images = images.to(device)
            names = names.to(device)

            # The loader yields no descriptions: the name tokens are passed
            # for the description input as well.
            final_emb = model.get_final_embedding(images, names, names)

            all_final_embeddings.append(final_emb.cpu().numpy())

    if not all_final_embeddings:
        raise ValueError("No batches were processed; cannot build an embedding matrix.")

    return np.concatenate(all_final_embeddings, axis=0)

final_embs_all = compute_embeddings(
    model, test_dl,
    limit_batches=LIMIT_BATCHES
)

final_embs_all.shape

  0%|          | 0/696 [00:00<?, ?it/s]

(48, 462)

In [102]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_top_k_similar(query_embeddings, embedding_matrix, k=5, metric='cosine', exclude_indices=None):
    """
    Find the most similar candidates for every query embedding, optionally
    excluding some candidates from the search.

    Args:
        query_embeddings (np.ndarray): Query embeddings, shape (batch, D).
        embedding_matrix (np.ndarray): Candidate embeddings, shape (N, D).
        k (int or None): Number of matches to return per query, or None to
            return every valid candidate sorted by similarity.
        metric (str): 'cosine' or 'euclidean'.
        exclude_indices (list, np.ndarray, or boolean mask, optional):
            Candidates to leave out. Integer indices are converted to a boolean
            mask; a boolean mask must have shape (N,). An empty list excludes
            nothing.

    Returns:
        top_k (np.ndarray): Candidate indices per query, best first, shape
            (batch, M). M is k capped at the number of non-excluded candidates
            (all of them when k is None), so excluded candidates never appear.
        scores (np.ndarray): Matching scores, shape (batch, M): cosine
            similarity, or the negated Euclidean distance so that higher is
            always better.

    Raises:
        ValueError: for an unsupported ``metric``, an ``exclude_indices`` of the
            wrong type, or a boolean mask whose shape is not (N,).
    """
    n_candidates = embedding_matrix.shape[0]

    # Normalise exclude_indices into a boolean mask over candidates (or None).
    mask = None
    if exclude_indices is not None:
        if not isinstance(exclude_indices, (list, np.ndarray)):
            raise ValueError("exclude_indices must be a list, np.ndarray, or boolean mask.")
        exclude_indices = np.asarray(exclude_indices)
        if exclude_indices.dtype == bool:
            if exclude_indices.shape != (n_candidates,):
                raise ValueError(
                    f"Boolean exclude mask must have shape ({n_candidates},), "
                    f"got {exclude_indices.shape}."
                )
            mask = exclude_indices
        else:
            mask = np.zeros(n_candidates, dtype=bool)
            # An empty list becomes a float array, which cannot be used as an
            # index; it simply excludes nothing.
            if exclude_indices.size:
                mask[exclude_indices] = True

    # Score matrix of shape (batch, N) where higher always means closer.
    if metric == 'cosine':
        score_matrix = cosine_similarity(query_embeddings, embedding_matrix)
    elif metric == 'euclidean':
        distances = np.linalg.norm(
            query_embeddings[:, None, :] - embedding_matrix[None, :, :], axis=2
        )
        score_matrix = -distances
    else:
        raise ValueError("Unsupported metric: choose 'cosine' or 'euclidean'")

    n_valid = n_candidates
    if mask is not None:
        # -inf pushes excluded candidates to the end of every ranking.
        score_matrix[:, mask] = -np.inf
        n_valid -= int(mask.sum())

    ranking = np.argsort(-score_matrix, axis=1)

    # Keep valid candidates only: the excluded ones occupy the tail of each row.
    limit = n_valid if k is None else min(k, n_valid)
    top_k = ranking[:, :limit]
    scores = np.take_along_axis(score_matrix, top_k, axis=1)

    return top_k, scores


In [103]:
# Select the query rows: every product sold by QUERY_SELLER.

QUERY_SELLER = 'ИНТЕРТРЕЙД'
# Index labels of the seller's rows. They are used below as row positions in
# final_embs_all, which assumes source_df keeps its default 0..N-1 index.
query_idx_all = source_df[source_df['Seller'] == QUERY_SELLER].index.to_numpy()

# Keep only the queries that actually have an embedding
# (fewer rows are embedded when LIMIT_BATCHES is set).
max_emb_cnt = final_embs_all.shape[0]
query_idx_all = query_idx_all[query_idx_all < max_emb_cnt]

# Gather the query embeddings: shape (num_queries, embedding_dim).
query_final_embs = final_embs_all[query_idx_all]  # (8, 462) in the recorded run

In [104]:
# Number of nearest neighbours reported per query.
FINAL_TOP_K = 5

# Rank the embedded items by cosine similarity of their final embeddings.
# All query rows (the seller's own products) are excluded as candidates.
top_k_final, scores_final = find_top_k_similar(
    query_final_embs, final_embs_all, k=FINAL_TOP_K, metric='cosine',
    exclude_indices=query_idx_all
)

# The label says "image indices"; the values are row indices into final_embs_all.
print("Top-k image indices per query (shape):")
print(top_k_final.shape)

# print("Corresponding similarity scores:")
# print(scores_final)

Top-k image indices per query (shape):
(8, 5)


In [105]:
# Build one row per query: its SKU plus the SKUs and scores of its matches.
sku_column = source_df['SKU']

result_rows = []
for query_idx, top_k_indices_per_query, scores_per_query in zip(
    query_idx_all, top_k_final, scores_final
):
    # query_idx is an index label (taken from source_df.index); a single .loc
    # call replaces the chained row-then-column lookup.
    query_sku = source_df.loc[query_idx, 'SKU']
    # Top-k entries are row positions in the embedding matrix, so they are
    # resolved positionally rather than by label.
    top_k_skus = sku_column.iloc[top_k_indices_per_query]

    top_k_sku_results = {
        f'Top-{rank_idx + 1} SKU': sku
        for rank_idx, sku in enumerate(top_k_skus)
    }

    top_k_score_results = {
        f'Top-{rank_idx + 1} Score': score
        for rank_idx, score in enumerate(scores_per_query.tolist())
    }
    result_rows.append({
        'Query SKU': query_sku,
        **top_k_sku_results,
        **top_k_score_results
    })

# `results` is only ever the DataFrame; the row list has its own name.
results = pd.DataFrame(result_rows)
results.head()

Unnamed: 0,Query SKU,Top-1 SKU,Top-2 SKU,Top-3 SKU,Top-4 SKU,Top-5 SKU,Top-1 Score,Top-2 Score,Top-3 Score,Top-4 Score,Top-5 Score
0,1871769771,1947327095,1947160810,1953623209,1949798600,1957134593,0.90107,0.505567,0.494488,0.480659,0.470711
1,1679550303,1949798600,1953623209,1941834406,1957134593,1947160810,0.798799,0.758761,0.75816,0.741988,0.734876
2,1200553001,1947327095,1953623209,1949798600,1947160810,1941834406,0.925983,0.421462,0.41819,0.401565,0.349902
3,922231521,1947327095,1953623209,1949798600,1947160810,1941834406,0.938452,0.665332,0.663334,0.650396,0.620969
4,922230517,1947327095,1949798600,1953623209,1947160810,1941834406,0.903337,0.537118,0.536721,0.503074,0.463418


In [143]:
# Number of embedded rows, recorded in the file name.
used_entries = final_embs_all.shape[0]

# The file name encodes the run settings: entry count, top-k, seller,
# checkpoint and the kind of embedding used.
file_name = (
    f'Карты_мира_Озон_'
    f'всего={used_entries}_'
    f'top-{FINAL_TOP_K}_'
    f'Seller={QUERY_SELLER}_'
    f"модель={model_config['MODEL_CKPT']}_"
    f"эмбеддинги=final"
    '.csv'
)

# Results go to a 'test_results' folder next to the source table.
results_file_path = (
    Path(DATA_PATH) / Path(TABLE_DATASET_FILE).parent /
    Path('test_results') / file_name
)
# to_csv does not create missing folders, so make sure the target exists.
results_file_path.parent.mkdir(parents=True, exist_ok=True)
print("Output to:")
print(results_file_path)

results.to_csv(results_file_path)

Output to:
data/tables_OZ_geo_5500/test_results/Карты_мира_Озон_всего=48_top-5_Seller=ИНТЕРТРЕЙД_модель=siamese_contrastive_7k.pt_эмбеддинги=final.csv


# Merge to original df

In [122]:
# Preview the two columns used below to attach product URLs to SKUs.
source_df[['SKU', 'URL']].head()

Unnamed: 0,SKU,URL
0,1871769771,https://www.ozon.ru/product/karty-mira-i-rossi...
1,1679550303,https://www.ozon.ru/product/shema-liniy-skoros...
2,1200553001,https://www.ozon.ru/product/politicheskaya-kar...
3,922231521,https://www.ozon.ru/product/politicheskaya-kar...
4,922230517,https://www.ozon.ru/product/politicheskaya-kar...


In [124]:
# `results` has the columns "Query SKU", "Top-i SKU" and "Top-i Score" for
# i = 1..FINAL_TOP_K. The rank count is taken from FINAL_TOP_K instead of a
# hard-coded 5, so this cell follows the retrieval settings.
k = FINAL_TOP_K

sku_url = source_df[['SKU', 'URL']]

# Attach the URL of each query product; how='right' keeps every row of `results`.
final_df = (
    sku_url
    .rename(columns={'SKU': 'Query SKU', 'URL': 'Query URL'})
    .merge(results, on='Query SKU', how='right')
)

# Attach the URL of every ranked match, one merge per rank.
for i in range(1, k+1):
    sku_col = f"Top-{i} SKU"
    url_col = f"Top-{i} URL"
    final_df = final_df.merge(
        sku_url.rename(columns={'SKU': sku_col, 'URL': url_col}),
        on=sku_col,
        how='left'
    )

# Column layout: query SKU, match SKUs, match scores, query URL, match URLs.
ranks = range(1, k + 1)
cols_order = (
    ["Query SKU"]
    + [f"Top-{i} SKU" for i in ranks]
    + [f"Top-{i} Score" for i in ranks]
    + ["Query URL"]
    + [f"Top-{i} URL" for i in ranks]
)
final_df = final_df[cols_order]

final_df.head()

Unnamed: 0,Query SKU,Top-1 SKU,Top-2 SKU,Top-3 SKU,Top-4 SKU,Top-5 SKU,Top-1 Score,Top-2 Score,Top-3 Score,Top-4 Score,Top-5 Score,Query URL,Top-1 URL,Top-2 URL,Top-3 URL,Top-4 URL,Top-5 URL
0,1871769771,1947327095,1947160810,1953623209,1949798600,1957134593,0.90107,0.505567,0.494488,0.480659,0.470711,https://www.ozon.ru/product/karty-mira-i-rossi...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/nastennaya-karta-a...
1,1679550303,1949798600,1953623209,1941834406,1957134593,1947160810,0.798799,0.758761,0.75816,0.741988,0.734876,https://www.ozon.ru/product/shema-liniy-skoros...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/roskartografiya-is...,https://www.ozon.ru/product/nastennaya-karta-a...,https://www.ozon.ru/product/interaktivnaya-kar...
2,1200553001,1947327095,1953623209,1949798600,1947160810,1941834406,0.925983,0.421462,0.41819,0.401565,0.349902,https://www.ozon.ru/product/politicheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/roskartografiya-is...
3,922231521,1947327095,1953623209,1949798600,1947160810,1941834406,0.938452,0.665332,0.663334,0.650396,0.620969,https://www.ozon.ru/product/politicheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/roskartografiya-is...
4,922230517,1947327095,1949798600,1953623209,1947160810,1941834406,0.903337,0.537118,0.536721,0.503074,0.463418,https://www.ozon.ru/product/politicheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/roskartografiya-is...


In [125]:
# import re
# url_cols = [col for col in final_df.columns if re.search('URL', col)]
# final_df[url_cols]

In [141]:
# (The output path is defined and printed by the next cell. Referencing it
# here, before that definition, raises NameError under Restart & Run All.)

PosixPath('data/tables_OZ_geo_5500/test_results/Карты_мира_Озон_всего=48_top-5_Seller=ИНТЕРТРЕЙД_модель=siamese_contrastive_7k.pt_эмбеддинги=final_URL-included.csv')

In [142]:
# Save next to the plain results file, with a suffix marking the URL columns.
# results_file_path already lives inside 'test_results', so only the file name
# changes; appending another 'test_results' segment would nest the folder twice.
results_path = Path(results_file_path)
final_df_file_path = results_path.with_name(results_path.stem + '_URL-included.csv')
final_df_file_path.parent.mkdir(parents=True, exist_ok=True)
print(final_df_file_path)

final_df.to_csv(final_df_file_path)

data/tables_OZ_geo_5500/test_results/Карты_мира_Озон_всего=48_top-5_Seller=ИНТЕРТРЕЙД_модель=siamese_contrastive_7k.pt_эмбеддинги=final_URL-included.csv


In [113]:
# Preview the merged table (SKUs, scores and URLs per query).
final_df.head()

Unnamed: 0,Query SKU,Top-1 SKU,Top-2 SKU,Top-3 SKU,Top-4 SKU,Top-5 SKU,Top-1 Score,Top-2 Score,Top-3 Score,Top-4 Score,Top-5 Score,Query URL,Top-1 URL,Top-2 URL,Top-3 URL,Top-4 URL,Top-5 URL
0,1871769771,1947327095,1947160810,1953623209,1949798600,1957134593,0.90107,0.505567,0.494488,0.480659,0.470711,https://www.ozon.ru/product/karty-mira-i-rossi...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/nastennaya-karta-a...
1,1679550303,1949798600,1953623209,1941834406,1957134593,1947160810,0.798799,0.758761,0.75816,0.741988,0.734876,https://www.ozon.ru/product/shema-liniy-skoros...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/roskartografiya-is...,https://www.ozon.ru/product/nastennaya-karta-a...,https://www.ozon.ru/product/interaktivnaya-kar...
2,1200553001,1947327095,1953623209,1949798600,1947160810,1941834406,0.925983,0.421462,0.41819,0.401565,0.349902,https://www.ozon.ru/product/politicheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/roskartografiya-is...
3,922231521,1947327095,1953623209,1949798600,1947160810,1941834406,0.938452,0.665332,0.663334,0.650396,0.620969,https://www.ozon.ru/product/politicheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/roskartografiya-is...
4,922230517,1947327095,1949798600,1953623209,1947160810,1941834406,0.903337,0.537118,0.536721,0.503074,0.463418,https://www.ozon.ru/product/politicheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/uchebnaya-topograf...,https://www.ozon.ru/product/istoricheskaya-kar...,https://www.ozon.ru/product/interaktivnaya-kar...,https://www.ozon.ru/product/roskartografiya-is...
