# Parameters

In [1]:
# Root directory for downloaded tables, images and cached embeddings.
DATA_PATH = 'data'

# Name of the image archive (downloaded below as `<IMG_DATASET_NAME>.zip`).
IMG_DATASET_NAME = 'images_OZ_geo_5500'
# Sub-directory that holds the table files, plus a CSV path inside it.
TABLE_DATASET_DIR = 'tables_OZ_geo_5500'
TABLE_DATASET_FILE = 'tables_OZ_geo_5500/OZ_geo_5500.csv'
# Excel tables requested from the dataset repository in the download cell.
TABLE_DATASET_FILES= [
    'Ozon_Crawler_Latest_info2025-04-07-12-57-51.xlsx',
    'Карты мира_озон.xlsx'
]

In [2]:
# Seller whose products are used as queries in the similarity search.
QUERY_SELLER = 'ИНТЕРТРЕЙД'

# SUBSET_QUERY_SKU = 2
# SUBSET_NONQUERY_SKU = 6

# Optional sample sizes for query / non-query rows; None keeps every row.
SUBSET_QUERY_SKU = None
SUBSET_NONQUERY_SKU = None

# Number of neighbours kept per query (per modality and after merging modalities).
TOP_K = 50

In [3]:
import torch

# Model name passed to ruclip.load().
CLIP_MODEL = 'ruclip-vit-base-patch32-384'

# SBERT_BATCH_SIZE = 768 if torch.cuda.is_available() else 8
# Batch size for sbert.encode(); large on GPU, small on CPU.
SBERT_BATCH_SIZE = 512 if torch.cuda.is_available() else 8 # use a smaller value for larger TOP_K

# Batch size for the ruCLIP predictor and for the image/name batching loop.
RUCLIP_BATCH_SIZE = 512 if torch.cuda.is_available() else 8

# Device string handed to both models.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Imports

In [4]:
import pandas as pd

import requests
import os

import joblib
import xgboost as xgb
from datetime import date, timedelta
import numpy as np

import torch
from sentence_transformers import SentenceTransformer, util
from typing import List, Tuple
from PIL import Image
from io import BytesIO
import math

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

# import optuna

from pathlib import Path

In [5]:
try:
    import ruclip
except ModuleNotFoundError:
    !pip install git+https://github.com/tony-pitchblack/ru-clip.git#egg=ruclip
    import ruclip

In [6]:
import ruclip

# Load the ruCLIP model/processor pair and the SBERT sentence encoder onto DEVICE.
clip, processor = ruclip.load(CLIP_MODEL, device=DEVICE)
sbert = SentenceTransformer('all-distilroberta-v1', device=DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

# Download data

In [7]:
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [8]:
# Use tokens from .env

import os
from dotenv import load_dotenv

import huggingface_hub

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Authenticate against the Hugging Face Hub with the token read from the environment.
# os.getenv returns None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
huggingface_hub.login(token=HF_TOKEN)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [9]:
# Download models' weights & text/image datasets

from huggingface_hub import snapshot_download
from pathlib import Path

REPO_ID = "INDEEPA/clip-siamese"
LOCAL_DIR = Path("data/train_results")
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir='data',
    allow_patterns=[
        # "train_results/siamese_fitted*.pt",
        *[str(Path(TABLE_DATASET_DIR) / file_name) for file_name in TABLE_DATASET_FILES],
        f"{IMG_DATASET_NAME}.zip"
    ],
)

!unzip -o -q data/{IMG_DATASET_NAME}.zip -d data/

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

# Prepare data

In [10]:
# Crawler export: used below for product descriptions and cover-image URLs.
# DATA_PATH / TABLE_DATASET_DIR come from the parameters cell; they are not
# re-assigned here so that changing them at the top actually takes effect.
file_path = (
    Path(DATA_PATH) /
    TABLE_DATASET_DIR /
    'Ozon_Crawler_Latest_info2025-04-07-12-57-51.xlsx'
)

descr_source_df = pd.read_excel(file_path)
descr_source_df.columns.tolist()

['Sku (Sku)',
 'Фото (CoverImage)',
 'Название товара (ProductName)',
 'Продавец (SellerName)',
 'Бренд (BrandName)',
 'Название категории (CategoryName)',
 'Цена соинвест (DiscountPrice)',
 'Цена по карте (OzonCardPrice)',
 'Сток FBO (StockFbm)',
 'Сток FBS (StockFbs)',
 'Ошибка загрузки (CrawlerError)',
 'Валюта (Currency)',
 'Цена до скидки (BasePrice)',
 'Рейтинг товара (AvgRating)',
 'Количество отзывов (Reviews)',
 'Описание (Description)',
 'Rich-контент (RichContent)',
 'Ссылка на карточку',
 'SellerProductId (SellerProductId)']

In [11]:
import re

def extract_and_convert(col_name):
    """Turn a column header into a lower-case snake_case identifier.

    If the header contains a parenthesised part, e.g. ``'Фото (CoverImage)'``,
    only the text inside the first pair of parentheses is used; otherwise the
    whole header is used. An underscore is inserted before every ASCII
    upper-case letter except at the very start, then the result is lower-cased.
    """
    found = re.search(r'\(([^)]+)\)', col_name)
    camel = found.group(1) if found else col_name
    return re.sub(r'(?<!^)(?=[A-Z])', '_', camel).lower()

# Rename every column in place; pandas applies the callable to each label.
descr_source_df.rename(columns=extract_and_convert, inplace=True)

# Show the resulting column names
print("Renamed columns:")
descr_source_df.columns.tolist()

Renamed columns:


['sku',
 'cover_image',
 'product_name',
 'seller_name',
 'brand_name',
 'category_name',
 'discount_price',
 'ozon_card_price',
 'stock_fbm',
 'stock_fbs',
 'crawler_error',
 'currency',
 'base_price',
 'avg_rating',
 'reviews',
 'description',
 'rich_content',
 'ссылка на карточку',
 'seller_product_id']

In [12]:
# Marketplace analytics export: one row per SKU with sales / price / rating fields.
# DATA_PATH / TABLE_DATASET_DIR come from the parameters cell; they are not
# re-assigned here so that changing them at the top actually takes effect.
file_path = (
    Path(DATA_PATH) /
    TABLE_DATASET_DIR /
    'Карты мира_озон.xlsx'
)

source_df = pd.read_excel(file_path)
source_df.columns.tolist()

['SKU',
 'Name',
 'Category',
 'Схема',
 'Brand',
 'Niche',
 'Seller',
 'Balance',
 'Balance FBS',
 'Warehouses count',
 'Comments',
 'Final price',
 'Max price',
 'Min price',
 'Average price',
 'Median price',
 'Цена с Ozon картой',
 'Sales',
 'Revenue',
 'Revenue potential',
 'Revenue average',
 'Lost profit',
 'Lost profit percent',
 'URL',
 'Thumb',
 'Pics Count',
 'Has Video',
 'First Date',
 'Days in website',
 'Days in stock',
 'Days with sales',
 'Average if in stock',
 'Rating',
 'FBS',
 'Base price',
 'Category Position',
 'Categories Last Count',
 'Sales Per Day Average',
 'Turnover',
 'Frozen stocks',
 'Frozen stocks cost',
 'Frozen stocks percent']

In [13]:
# Per-product fields; each appears twice in the paired layout, once with the
# `_first` suffix and once with `_second`. Trailing comments keep the original
# notes next to the corresponding `_second` field.
_per_product_fields = [
    'balance',  # Balance
    'sales',
    'rating',  # AvgRating
    'final_price',  # DiscountPrice,
    'comments',  # Reviews
    'description',
    'name',  # ProductName
    'options',
    'sku',
    'has_video',
    'photo_count',
]

# Full column list of the paired dataset: both product sides, pair-level
# equality flags, similarity features and the label.
all_required_cols = (
    [f'{field}_first' for field in _per_product_fields]
    + [f'{field}_second' for field in _per_product_fields]
    # 'image_url_first' / 'image_url_second' are intentionally left out
    + [
        'iseq_vendor',  # 0
        'iseq_color',  # 0
        'iseq_brand',  # BrandName
        'iseq_supp',  # 0
        'are_related',  # 0
    ]
    + ['desc_sim', 'opt_sim', 'name_sim', 'img_sim']
    + ['label']
)

In [14]:
# Columns kept from the analytics table (names after lower-casing and
# replacing spaces with underscores).
required_cols = [
    'balance',
    'sales',
    'final_price',
    'rating',
    'comments',
    # 'description',
    'name',
    # 'options'
    'sku',
    'has_video',
    'pics_count',
    'seller',
    'url'
]

# Normalise the headers, keep the required columns, and rename
# `pics_count` to `photo_count`.
new_source_df_all = (
    source_df
    .rename(columns=lambda col: col.lower().replace(" ", "_"))
    [required_cols]
    .rename(columns={'pics_count': 'photo_count'})
)

new_source_df_all.head(1)

Unnamed: 0,balance,sales,final_price,rating,comments,name,sku,has_video,photo_count,seller,url
0,346,156,1811,4.8,5227,Карта мира географическая политическая интерак...,936454663,0,4,GooDaY,https://www.ozon.ru/context/detail/id/936454663/


In [15]:
# Extract image id from URL

# Pull the numeric file stem out of URLs that end in "/<digits>.jpg".
# Series.str.extract yields NaN for a URL that does not match, where
# `re.search(...).group(1)` would raise AttributeError on the first such row.
descr_source_df['image_id'] = (
    descr_source_df['cover_image']
    .dropna()
    .astype(str)
    .str.extract(r'/(\d+)\.jpg$', expand=False)
)

# Rows without a cover image or with an unrecognised URL have no image id; drop them.
descr_source_df = descr_source_df.dropna(subset=['image_id'])
descr_source_df[['image_id', 'sku']]

Unnamed: 0,image_id,sku
0,7323783851,1871769771
1,7394308097,1679550303
2,7299023048,1200553001
3,7388534766,922231521
4,7295079927,922230517
...,...,...
5560,6008538837,166584090
5561,6008438667,166451882
5562,7439544697,154409524
5563,7098349497,147896031


In [16]:
# Attach description and image id by SKU (default inner join), then add an
# `options` column that is a copy of `name`.
descr_cols = descr_source_df[['sku', 'description', 'image_id']]
new_source_df_all = (
    new_source_df_all
    .merge(descr_cols, on='sku')
    .assign(options=lambda frame: frame['name'])
)
new_source_df_all.columns.tolist()

['balance',
 'sales',
 'final_price',
 'rating',
 'comments',
 'name',
 'sku',
 'has_video',
 'photo_count',
 'seller',
 'url',
 'description',
 'image_id',
 'options']

In [17]:
# Where the description is missing, fall back to the product name.
new_source_df_all = new_source_df_all.assign(
    description=lambda frame: frame['description'].fillna(frame['name'])
)

In [18]:
# Take a subset: all query sku and some non-query sku

# Fixed seed for the optional sampling: the embedding cache file names encode
# only the number of query / non-query SKUs, so an unseeded sample would pair
# cached embeddings with a different set of rows on the next run.
SUBSET_SEED = 42

is_query_seller = new_source_df_all.seller == QUERY_SELLER

query_df = new_source_df_all[is_query_seller]
if SUBSET_QUERY_SKU is not None:
    query_df = query_df.sample(n=SUBSET_QUERY_SKU, random_state=SUBSET_SEED)

nonquery_df = new_source_df_all[~is_query_seller]
if SUBSET_NONQUERY_SKU is not None:
    nonquery_df = nonquery_df.sample(n=SUBSET_NONQUERY_SKU, random_state=SUBSET_SEED)

# Query rows first, then non-query rows, on a fresh 0..n-1 index.
new_source_df = pd.concat([
    query_df,
    nonquery_df
]).reset_index(drop=True)

len(new_source_df), len(new_source_df_all)

(5562, 5562)

# Find top-k embeddings

In [19]:
from typing import Tuple, List, Any
from PIL import Image  # Assumes Pillow is installed

def get_sku_image_offline(sku_or_image_id, img_dataset_dir):
    """Return the image stored on disk for an SKU / image id, or None.

    Looks for ``<id>.jpg`` and then ``<id>.webp`` inside ``img_dataset_dir``.
    A file that exists but cannot be read or decoded is reported with
    ``print`` and the next extension is tried. ``None`` is returned when no
    candidate could be loaded.
    """
    candidate_paths = (
        os.path.join(img_dataset_dir, f"{sku_or_image_id}{ext}")
        for ext in ('.jpg', '.webp')
    )
    for img_path in candidate_paths:
        if not os.path.exists(img_path):
            continue
        try:
            with open(img_path, 'rb') as f:
                raw_bytes = f.read()
            image = Image.open(BytesIO(raw_bytes))
            # Force decoding now so that a corrupt file is detected here.
            image.load()
        except Exception as e:
            print(f"Error loading {img_path}: {e}")
        else:
            return image
    return None

def get_image_and_name(
    row,
    image_id_col: str,
    name_col: str,
    offline: bool = True,
    img_dataset_dir: str = '../data/images_7k'
) -> Tuple[Any, Any]:
    """
    Fetch the image and the product name for one DataFrame row.

    Args:
        row: The DataFrame row.
        image_id_col: Column holding the image identifier (converted with int()).
        name_col: Column holding the product name.
        offline: If True, load from disk via get_sku_image_offline;
            otherwise call get_sku_image.
        img_dataset_dir: Image directory used in offline mode.

    Returns:
        A tuple (image, name); image is None when offline loading found nothing.
    """
    image_id = int(row[image_id_col])
    image = (
        get_sku_image_offline(image_id, img_dataset_dir)
        if offline
        else get_sku_image(image_id)
    )
    return image, row[name_col]

def get_images_names(
    df,
    image_id_col: str = 'sku',
    name_col: str = 'name',
    offline: bool = True,
    img_dataset_dir: str = '../data/images_7k'
) -> Tuple[List[Image.Image], List[Any], List[int]]:
    """
    Collect images and names for every row of a one-product-per-row DataFrame.

    Args:
        df: DataFrame containing one product per row.
        image_id_col: Column name for the image identifier.
        name_col: Column name for the product name.
        offline: Whether to load images using the offline function.
        img_dataset_dir: Directory for offline images.

    Returns:
        A tuple (images, names, problems):
         - images: the images that were loaded,
         - names: the names of those same rows, in the same order,
         - problems: index labels of rows whose image could not be loaded
           (such rows contribute nothing to images / names).
    """
    images, names, problems = [], [], []
    for idx, row in df.iterrows():
        img, prod_name = get_image_and_name(row, image_id_col, name_col, offline, img_dataset_dir)
        if img is None:
            problems.append(idx)
            continue
        images.append(img)
        names.append(prod_name)
    return images, names, problems

In [20]:
# Paths for caching embeddings
from pathlib import Path

# Directory holding the cached embedding arrays.
emb_prefix = Path(DATA_PATH) / 'embeddings_OZ_geo_5500'
emb_prefix.mkdir(parents=True, exist_ok=True)  # Ensure directory exists

# The cache file names encode only the model name and the number of distinct
# query / non-query SKUs.
n_query = query_df.sku.nunique()
n_nonquery = nonquery_df.sku.nunique()

# NOTE(review): the names file uses "nquery-" where the images file uses
# "nonquery-"; renaming would orphan caches already on disk, so it is left as is.
images_embs_file_name = f'{CLIP_MODEL}_images_latents_query-{n_query}_nonquery-{n_nonquery}.npy'
names_embs_file_name = f'{CLIP_MODEL}_names_latents_query-{n_query}_nquery-{n_nonquery}.npy'

In [21]:
# Compute embeddings if not cached
from tqdm import tqdm

# Recompute both embedding arrays unless both cache files already exist.
if not os.path.isfile(emb_prefix / images_embs_file_name) \
or not os.path.isfile(emb_prefix / names_embs_file_name):
    templates = ['{}', 'это {}', 'на картинке {}', 'товар {}']
    predictor = ruclip.Predictor(
        clip, processor, DEVICE,
        bs = RUCLIP_BATCH_SIZE,
        templates=templates
    )

    images_latents = []
    names_latents = []
    problems_ids = []

    def get_batches(df, batch_size):
        """Yield consecutive row slices of ``df`` with at most ``batch_size`` rows."""
        for start in range(0, len(df), batch_size):
            yield df.iloc[start:start+batch_size]

    # Ceil division: a trailing partial batch is still a batch.
    total_batches = math.ceil(len(new_source_df) / RUCLIP_BATCH_SIZE)
    with torch.no_grad():
        batches = get_batches(new_source_df, batch_size=RUCLIP_BATCH_SIZE)
        # Passing `total` lets tqdm draw a real progress bar for the generator.
        for batch_idx, df_batch in enumerate(tqdm(batches, total=total_batches)):
            print(f'\nBatch {batch_idx+1} / {total_batches}')
            images_batch, names_batch, problems_ids_batch = get_images_names(
                df=df_batch,
                image_id_col='image_id',
                name_col='name',
                img_dataset_dir='data/images_OZ_geo_5500',
                offline=True
            )
            problems_ids.extend(problems_ids_batch)

            # Nothing could be loaded for this batch: there is nothing to encode.
            if not images_batch:
                continue

            images_latents_batch = predictor.get_image_latents(images_batch).detach().cpu()
            name_latents_batch = predictor.get_text_latents(names_batch).detach().cpu()

            images_latents.append(images_latents_batch)
            names_latents.append(name_latents_batch)

    images_latents = torch.cat(images_latents).numpy()
    names_latents = torch.cat(names_latents).numpy()

    if problems_ids:
        # Rows whose image failed to load produce no embedding, so row i of the
        # latent arrays no longer corresponds to row i of new_source_df.
        print(
            f"WARNING: {len(problems_ids)} rows have no image embedding "
            f"(index labels: {problems_ids[:10]}{'...' if len(problems_ids) > 10 else ''}); "
            "embedding rows are not aligned with new_source_df."
        )

    # Save image latents
    np.save(emb_prefix / images_embs_file_name, images_latents)

    # Save name latents
    np.save(emb_prefix / names_embs_file_name, names_latents)
else:
    # Load cached embeddings
    images_latents = np.load(emb_prefix / images_embs_file_name)
    names_latents = np.load(emb_prefix / names_embs_file_name)
    # The cache does not record which rows failed when it was written.
    problems_ids = []
    print("Loaded embeddings from cache.")

Loaded embeddings from cache.


# Find Top-k similar

In [22]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_top_k_similar(query_embeddings, embedding_matrix, k=5, metric='cosine', exclude_indices=None):
    """
    Find the most similar items for each query embedding, optionally excluding some items.

    Args:
        query_embeddings (np.ndarray): Array of query embeddings, shape (batch, D).
        embedding_matrix (np.ndarray): Array of all embeddings, shape (N, D).
        k (int or None): Number of top matches to return, or None to return every
            non-excluded candidate, sorted.
        metric (str): 'cosine' or 'euclidean'.
        exclude_indices (list, np.ndarray, or boolean mask, optional): Items to exclude.
            A list/array of integer indices is converted to a boolean mask; an empty
            list excludes nothing. A boolean mask must have shape (N,).

    Returns:
        top_k (np.ndarray): Indices of the best matches per query, best first,
            shape (batch, M) with M == min(k, number of non-excluded items)
            (M == number of non-excluded items if k is None).
        scores (np.ndarray): Matching scores, shape (batch, M): cosine similarity
            for 'cosine', negated Euclidean distance for 'euclidean' (higher is
            always better).

    Raises:
        ValueError: if ``exclude_indices`` has an unsupported type or ``metric``
            is not one of the supported names.
    """
    n_items = embedding_matrix.shape[0]

    # Normalise exclude_indices to a boolean mask over the N items (or None).
    mask = None
    if exclude_indices is not None:
        if not isinstance(exclude_indices, (list, np.ndarray)):
            raise ValueError("exclude_indices must be a list, np.ndarray, or boolean mask.")
        exclude_indices = np.array(exclude_indices)
        if exclude_indices.dtype == bool:
            mask = exclude_indices
        else:
            mask = np.zeros(n_items, dtype=bool)
            # np.array([]) is float64 and cannot be used as an index, so only
            # index with it when there is something to exclude.
            if exclude_indices.size:
                mask[exclude_indices] = True

    # Score matrix of shape (batch, N) where higher always means closer.
    if metric == 'cosine':
        all_scores = cosine_similarity(query_embeddings, embedding_matrix)
    elif metric == 'euclidean':
        all_scores = -np.linalg.norm(
            query_embeddings[:, None, :] - embedding_matrix[None, :, :], axis=2
        )
    else:
        raise ValueError("Unsupported metric: choose 'cosine' or 'euclidean'")

    n_valid = n_items
    if mask is not None:
        # -inf pushes excluded items to the very end of the descending sort.
        all_scores[:, mask] = -np.inf
        n_valid = n_items - int(np.count_nonzero(mask))

    sorted_idx = np.argsort(-all_scores, axis=1)

    # Never return more columns than there are non-excluded items; otherwise
    # excluded indices would reappear at the tail with -inf scores.
    limit = n_valid if k is None else min(k, n_valid)
    top_k = sorted_idx[:, :limit]
    scores = np.take_along_axis(all_scores, top_k, axis=1)

    return top_k, scores


In [23]:
# Select the query rows and their embeddings.
# Number of rows for which an embedding exists.
max_emb_cnt = images_latents.shape[0]

# Keep only as many rows as there are embeddings, then collect the index
# labels of the rows whose SKU belongs to the query seller.
truncated_df = new_source_df.iloc[:max_emb_cnt]
query_indices = truncated_df[
    truncated_df.sku.isin(query_df.sku)
].index.tolist()

# The index labels are used as row positions in the latent arrays;
# both results have shape (n_queries, D).
query_images_embs = images_latents[query_indices]  # image embeddings of the query rows
query_names_embs = names_latents[query_indices]  # name embeddings of the query rows

query_images_embs.shape, images_latents.shape

((23, 512), (5562, 512))

In [24]:
# Find top-k matches for images.
# The query rows themselves are excluded from the candidate pool.
top_k_images, scores_images = find_top_k_similar(
    query_images_embs, images_latents,
    k=TOP_K,
    metric='cosine',
    exclude_indices=query_indices
)

print("Top-k image indices per query (shape):")
print(top_k_images.shape)

print("Corresponding similarity scores:")
print(scores_images)

Top-k image indices per query (shape):
(23, 50)
Corresponding similarity scores:
[[1.0000005  0.7869836  0.7757062  ... 0.7461632  0.7458228  0.74504507]
 [0.9015658  0.76537764 0.7652058  ... 0.7114787  0.711029   0.71067816]
 [0.9580755  0.775892   0.7652441  ... 0.7231992  0.7204189  0.7204189 ]
 ...
 [0.89673996 0.787511   0.77549374 ... 0.7280019  0.7278451  0.72766733]
 [0.942389   0.8095961  0.8068207  ... 0.7467177  0.7467177  0.7462376 ]
 [0.9250959  0.7372247  0.7372247  ... 0.70169175 0.70169175 0.7016615 ]]


In [25]:
# Find top-k matches for name.
# The query rows themselves are excluded from the candidate pool.
top_k_name, scores_name = find_top_k_similar(
    query_names_embs, names_latents,
    k=TOP_K,
    metric='cosine',
    exclude_indices=query_indices
)

# These are name-based matches; the label previously said "image" (copy-paste slip).
print("Top-k name indices per query (shape):")
print(top_k_name.shape)

print("Corresponding similarity scores:")
print(scores_name)

Top-k image indices per query (shape):
(23, 50)
Corresponding similarity scores:
[[0.8884476  0.86875427 0.8675046  ... 0.81596476 0.81585777 0.81585777]
 [0.8748104  0.8432297  0.8377782  ... 0.77953565 0.77726555 0.77600586]
 [0.87612927 0.8745546  0.8695879  ... 0.8202366  0.8192959  0.8192959 ]
 ...
 [0.910686   0.9003049  0.9003049  ... 0.8269875  0.8236731  0.8236731 ]
 [0.8875911  0.8815263  0.88073075 ... 0.830874   0.830874   0.82847863]
 [0.90945125 0.9001359  0.8974946  ... 0.8304988  0.8296022  0.8296022 ]]


In [26]:
import numpy as np

def union_top_k_candidates(top_k_name, top_k_images, scores_name, scores_images, TOP_K_CANDIDATES=5):
    """
    Merge name-based and image-based candidate lists into one ranked list per query.

    For every query (row), the candidates of both modalities are pooled. A candidate
    that appears more than once keeps its highest score. The pooled candidates are
    sorted by score, best first (ties keep first-seen order), and cut to the first
    TOP_K_CANDIDATES entries.

    Args:
        top_k_name (np.ndarray): Candidate IDs from the name modality, shape (batch_size, num_candidates)
        top_k_images (np.ndarray): Candidate IDs from the image modality, shape (batch_size, num_candidates)
        scores_name (np.ndarray): Scores of the name candidates, same shape as top_k_name.
        scores_images (np.ndarray): Scores of the image candidates, same shape as top_k_images.
        TOP_K_CANDIDATES (int): Number of candidates to keep per query after merging.

    Returns:
        unique_candidates (np.ndarray): Merged candidate IDs, shape (batch_size, TOP_K_CANDIDATES)
        unique_scores (np.ndarray): Merged candidate scores, shape (batch_size, TOP_K_CANDIDATES)

    Note:
        No padding is done: every row is assumed to yield at least
        TOP_K_CANDIDATES unique candidates.
    """
    merged_candidates, merged_scores = [], []

    for row in range(top_k_name.shape[0]):
        pooled_ids = np.concatenate([top_k_name[row], top_k_images[row]])
        pooled_scores = np.concatenate([scores_name[row], scores_images[row]])

        # Best score seen so far for each candidate (dict keeps first-seen order).
        best = {}
        for cand, score in zip(pooled_ids, pooled_scores):
            if cand not in best or score > best[cand]:
                best[cand] = score

        ranked = sorted(best.items(), key=lambda pair: pair[1], reverse=True)[:TOP_K_CANDIDATES]
        merged_candidates.append([cand for cand, _ in ranked])
        merged_scores.append([score for _, score in ranked])

    return np.array(merged_candidates), np.array(merged_scores)

In [27]:
# Plain concatenation of both modalities (duplicates are kept).
# NOTE(review): not used by the merge call below — confirm whether later cells need it.
top_k_all = np.concatenate([top_k_name, top_k_images], axis=-1)
scores_all = np.concatenate([scores_name, scores_images], axis=-1)

# Merge the two modalities per query: duplicates collapse to their best score
# and the TOP_K best candidates are kept.
top_k_united_indices, scores_united = union_top_k_candidates(
    top_k_name, top_k_images, scores_name, scores_images,
    TOP_K_CANDIDATES=TOP_K
)

print("Merged top candidates shape:", top_k_united_indices.shape)
print("Merged scores shape:", scores_united.shape)
print(top_k_united_indices)
print(scores_united)

Merged top candidates shape: (23, 50)
Merged scores shape: (23, 50)
[[1861  968  236 ... 2883 5263 4873]
 [1861 3141 1603 ... 3378 2131   32]
 [1861 4853   73 ... 1498   34 2118]
 ...
 [ 415 3151 2945 ... 4157 3378 3969]
 [1861  962 3463 ... 1007 1178 2068]
 [1861  244 3071 ... 2537 1619  448]]
[[1.0000005  0.8884476  0.86875427 ... 0.8159648  0.81596476 0.81585777]
 [0.9015658  0.8748104  0.8432297  ... 0.7800399  0.77953565 0.77726555]
 [0.9580755  0.87612927 0.8745546  ... 0.8204999  0.8202366  0.8192959 ]
 ...
 [0.910686   0.9003049  0.9003049  ... 0.8274261  0.8269875  0.8236731 ]
 [0.942389   0.8875911  0.8815263  ... 0.830874   0.830874   0.830874  ]
 [0.9250959  0.90945125 0.9001359  ... 0.8306186  0.8304988  0.8296022 ]]


In [28]:
# Position (within query_indices) of the query to inspect.
CAND_IDX = 1

# Show the query's name/url, then the name/url of its merged top candidates.
display(truncated_df.loc[query_indices[CAND_IDX]][['name', 'url']])
truncated_df.iloc[top_k_united_indices[CAND_IDX]][['name', 'url']]

Unnamed: 0,1
name,Географическая карта России настенная 102х160 ...
url,https://www.ozon.ru/context/detail/id/491270369/


Unnamed: 0,name,url
1861,"Географическая карта 100 x 160 см, масштаб: 1...",https://www.ozon.ru/context/detail/id/1936547864/
3141,"Географическая карта России для детей ""Карта Н...",https://www.ozon.ru/context/detail/id/1604850472/
1603,Настенная карта России на английском языке с н...,https://www.ozon.ru/context/detail/id/601555675/
2567,"Карта России для детей Карта Нашей Родины , 10...",https://www.ozon.ru/context/detail/id/1446470826/
193,Физическая карта России настенная для детей 19...,https://www.ozon.ru/context/detail/id/1283826235/
4674,"Globen, Карта России для детей, Карта Нашей Ро...",https://www.ozon.ru/context/detail/id/1061703959/
2866,Настенная карта России для детей Карта нашей Р...,https://www.ozon.ru/context/detail/id/958863369/
785,"Интерактивная карта России для детей ""Карта На...",https://www.ozon.ru/context/detail/id/671211225/
62,Карта России. Новые границы. 156х101 см. Ламин...,https://www.ozon.ru/context/detail/id/178726257/
4466,настенная карта России (новые границы) 135 х 2...,https://www.ozon.ru/context/detail/id/536711168/


# Compute paired dataset

## Make pairs for query sku

In [29]:
# get pairs

import pandas as pd

def get_pairs(df, sku, ignore_sku_list=None):
    """
    Pair one target SKU with every other eligible row of ``df``.

    The result has one row per (target, other) combination:
      - *_first columns hold the target SKU row,
      - *_second columns hold the other row,
      - iseq_vendor, iseq_color, iseq_brand, iseq_supp, are_related are added, all 0.

    Parameters:
        df (pd.DataFrame): One product per row. Must contain the columns
            balance, sales, rating, final_price, comments, description, name,
            options, sku, has_video, photo_count, image_id and url.
        sku (int or str): SKU identifier of the target row.
        ignore_sku_list (iterable, optional): SKUs that must not appear on the
            ``_second`` side. Any iterable (list, tuple, array, Series) is
            accepted; None means no extra exclusions.

    Returns:
        pd.DataFrame: The paired rows, with a fixed column order.

    Raises:
        ValueError: if ``sku`` is not present in ``df``.
    """
    target_df = df[df['sku'] == sku]
    if target_df.empty:
        raise ValueError(f"SKU {sku} not found in the provided DataFrame")

    # Build the exclusion list explicitly: `[sku] + ignore_sku_list` would do
    # element-wise addition for a NumPy array and fail for a tuple.
    excluded_skus = [sku]
    if ignore_sku_list is not None:
        excluded_skus.extend(ignore_sku_list)
    rest_df = df[~df['sku'].isin(excluded_skus)]

    # Cartesian product of the target row(s) with all remaining rows; every
    # column exists on both sides, so each one receives a suffix.
    paired_df_all = pd.merge(
        target_df,
        rest_df,
        how='cross',
        suffixes=('_first', '_second')
    )

    # Add equality columns and set them all to 0
    eq_cols = ['iseq_vendor', 'iseq_color', 'iseq_brand', 'iseq_supp', 'are_related']
    for col in eq_cols:
        paired_df_all[col] = 0

    # Define desired final order of columns
    final_columns = [
        'balance_first', 'sales_first', 'rating_first', 'final_price_first',
        'comments_first', 'description_first', 'name_first', 'options_first',
        'sku_first', 'has_video_first', 'photo_count_first',

        'balance_second', 'sales_second', 'rating_second', 'final_price_second',
        'comments_second', 'description_second', 'name_second', 'options_second',
        'sku_second', 'has_video_second', 'photo_count_second',

        'iseq_vendor', 'iseq_color', 'iseq_brand', 'iseq_supp', 'are_related',

        'image_id_first', 'image_id_second',
        'url_first', 'url_second'
    ]

    return paired_df_all[final_columns]

In [30]:
# Build one frame of (query, candidate) pairs per query and concatenate once:
# growing a DataFrame with pd.concat inside the loop copies all earlier rows
# on every iteration, and seeding it with an empty frame can disturb dtypes.
query_skus = query_df.sku.tolist()
pair_frames = []
for query_idx, top_k_idx in zip(query_indices, top_k_united_indices):
    paired_df = get_pairs(
        truncated_df.loc[top_k_idx.tolist() + [query_idx]], # TODO: fix this crime
        sku=truncated_df.loc[query_idx].sku,
        ignore_sku_list=query_skus,
    )
    pair_frames.append(paired_df)
    # break

# pd.concat raises on an empty list, so fall back to an empty frame.
paired_df_all = pd.concat(pair_frames, ignore_index=True) if pair_frames else pd.DataFrame()

paired_df_all.shape

(1150, 31)

## Add embedding distances

In [31]:
# Compute description and option similarities

desc_first, opt_first = paired_df_all.description_first, paired_df_all.options_first
desc_second, opt_second = paired_df_all.description_second, paired_df_all.options_second

emb_first = sbert.encode(
    desc_first.tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=SBERT_BATCH_SIZE
)
emb_second = sbert.encode(
    desc_second.tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=SBERT_BATCH_SIZE
)
# Row-wise cosine similarity between the i-th "first" and the i-th "second"
# embedding. util.cos_sim would build the full N x N matrix only to read its
# diagonal; this computes just the N values that are needed.
desc_sim = torch.nn.functional.cosine_similarity(emb_first, emb_second, dim=1).cpu().numpy()

emb_first = sbert.encode(
    opt_first.tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=SBERT_BATCH_SIZE
)
emb_second = sbert.encode(
    opt_second.tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=SBERT_BATCH_SIZE
)
opt_sim = torch.nn.functional.cosine_similarity(emb_first, emb_second, dim=1).cpu().numpy()

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [32]:
def get_images_names_paired(
    df,
    image_id_col_first: str = 'sku_first',
    image_id_col_second: str = 'sku_second',
    name_col_first: str = 'name_first',
    name_col_second: str = 'name_second',
    offline: bool = True,
    img_dataset_dir: str = '../data/images_7k'
) -> Tuple[List[Image.Image], List[object], List[int]]:
    """
    Collect the images and names for both sides of every pair row in ``df``.

    Each row is passed to ``get_image_and_name`` twice, once with the "first"
    columns and once with the "second" columns. A row contributes to the
    output only when both calls return an image; otherwise its index label is
    recorded as a problem and nothing from that row is kept.

    Args:
        df: DataFrame with one pair per row.
        image_id_col_first: Column holding the image identifier of the first item.
        image_id_col_second: Column holding the image identifier of the second item.
        name_col_first: Column holding the name of the first item.
        name_col_second: Column holding the name of the second item.
        offline: Forwarded to ``get_image_and_name``.
        img_dataset_dir: Forwarded to ``get_image_and_name``.

    Returns:
        A tuple ``(images, names, problems)``:
            - images: flat list ``[first, second, first, second, ...]`` for the rows that loaded,
            - names: the names in the same order as ``images``,
            - problems: index labels of the rows where at least one image was ``None``.
    """
    images: list = []
    names: list = []
    problems: list = []

    # (image id column, name column) for the first and second item of a pair.
    sides = (
        (image_id_col_first, name_col_first),
        (image_id_col_second, name_col_second),
    )

    for row_label, row in df.iterrows():
        # Both sides are always loaded before deciding whether to keep the row.
        loaded = [
            get_image_and_name(row, id_col, name_col, offline, img_dataset_dir)
            for id_col, name_col in sides
        ]

        if any(img is None for img, _ in loaded):
            problems.append(row_label)
            continue

        for img, name in loaded:
            images.append(img)
            names.append(name)

    return images, names, problems

In [33]:
# # Example usage:
# images, names, problems_ids = get_images_names_paired(
#     paired_df,
#     image_id_col_first='image_id_first',
#     image_id_col_second='image_id_second',
#     name_col_first='name_first',  # Adjust these column names if needed
#     name_col_second='name_second',
#     img_dataset_dir='data/images_OZ_geo_5500'
# )

# print(f'Images loaded: {len(images)}')
# print(f'Images not loaded: {len(problems_ids)}')

In [34]:
# Drop the rows whose images failed to load, and the matching similarity entries.
# NOTE(review): `problems_ids` is not assigned by any active cell visible here
# (the only visible assignment is in the commented-out example) - confirm it
# is defined upstream, otherwise this cell fails on a fresh kernel.
# NOTE(review): this filters `paired_df`, while the later cells build their
# result from `paired_df_all` - confirm which frame is intended.
paired_df = paired_df.loc[~paired_df.index.isin(problems_ids)]

# np.delete interprets `problems_ids` as positions, whereas the frame filter
# above matches index labels; the two agree only for a 0..N-1 index.
desc_sim, opt_sim = (
    np.delete(desc_sim, problems_ids),
    np.delete(opt_sim, problems_ids),
)

In [35]:
# Cosine similarity between every query and each of its top-k candidates,
# collected in query-major order.
# NOTE(review): the resulting rows are assumed to line up one-to-one with the
# rows of `paired_df_all` - confirm the ordering produced by the pairing step.
name_sim, img_sim = [], []
for query_idx, top_k_idx in zip(query_indices, top_k_united_indices):
    first = names_latents[query_idx]
    for candidate_idx in top_k_idx:
        second = names_latents[candidate_idx]
        name_sim.append(util.cos_sim(first, second).cpu().numpy().squeeze())
        # FIXME(review): `img_sim` is computed from `names_latents`, exactly like
        # `name_sim`, so both columns hold identical values. It presumably should
        # use the image embeddings instead - confirm the variable that holds them.
        img_sim.append(util.cos_sim(first, second).cpu().numpy().squeeze())

print(len(name_sim), len(img_sim), sep='\n')

# One column per similarity: desc, opt, name, img.
scores = np.column_stack((desc_sim, opt_sim, name_sim, img_sim))

1150
1150


In [36]:
# Wrap the similarity matrix in a frame with one named column per score.
scores_df = pd.DataFrame(
    data=scores,
    columns=['desc_sim', 'opt_sim', 'name_sim', 'img_sim'],
)

# Remove any score columns already present on the pairs frame so that
# re-running this cell does not produce duplicated column names.
pairs_without_scores = paired_df_all.drop(columns=scores_df.columns, errors='ignore')

# Column-wise concat aligns the two frames on their index.
final_df = pd.concat([pairs_without_scores, scores_df], axis=1)
final_df.head(1)

Unnamed: 0,balance_first,sales_first,rating_first,final_price_first,comments_first,description_first,name_first,options_first,sku_first,has_video_first,...,iseq_supp,are_related,image_id_first,image_id_second,url_first,url_second,desc_sim,opt_sim,name_sim,img_sim
0,37,370,4.9,807,1719,Карта мира настенная — идеальный помощник для ...,"Карта МИРА настенная политическая,160х102 см, ...","Карта МИРА настенная политическая,160х102 см, ...",491279127,0,...,0,0,7295087132,7295087132,https://www.ozon.ru/context/detail/id/491279127/,https://www.ozon.ru/context/detail/id/1936547864/,1.0,0.587024,0.575828,0.575828


In [37]:
# Show every column of the final frame (pair attributes followed by the similarity scores).
list(final_df.columns)

['balance_first',
 'sales_first',
 'rating_first',
 'final_price_first',
 'comments_first',
 'description_first',
 'name_first',
 'options_first',
 'sku_first',
 'has_video_first',
 'photo_count_first',
 'balance_second',
 'sales_second',
 'rating_second',
 'final_price_second',
 'comments_second',
 'description_second',
 'name_second',
 'options_second',
 'sku_second',
 'has_video_second',
 'photo_count_second',
 'iseq_vendor',
 'iseq_color',
 'iseq_brand',
 'iseq_supp',
 'are_related',
 'image_id_first',
 'image_id_second',
 'url_first',
 'url_second',
 'desc_sim',
 'opt_sim',
 'name_sim',
 'img_sim']

# Save all files to HF

In [38]:
# A bare expression that is not the last statement of a cell is never displayed,
# so print the column count explicitly (same pattern as the `final_df` check).
print(len(paired_df_all.columns))
paired_df_all.columns.tolist()

['balance_first',
 'sales_first',
 'rating_first',
 'final_price_first',
 'comments_first',
 'description_first',
 'name_first',
 'options_first',
 'sku_first',
 'has_video_first',
 'photo_count_first',
 'balance_second',
 'sales_second',
 'rating_second',
 'final_price_second',
 'comments_second',
 'description_second',
 'name_second',
 'options_second',
 'sku_second',
 'has_video_second',
 'photo_count_second',
 'iseq_vendor',
 'iseq_color',
 'iseq_brand',
 'iseq_supp',
 'are_related',
 'image_id_first',
 'image_id_second',
 'url_first',
 'url_second']

In [39]:
# Column count of the final frame, followed by the column names themselves.
print(final_df.shape[1])
list(final_df.columns)

35


['balance_first',
 'sales_first',
 'rating_first',
 'final_price_first',
 'comments_first',
 'description_first',
 'name_first',
 'options_first',
 'sku_first',
 'has_video_first',
 'photo_count_first',
 'balance_second',
 'sales_second',
 'rating_second',
 'final_price_second',
 'comments_second',
 'description_second',
 'name_second',
 'options_second',
 'sku_second',
 'has_video_second',
 'photo_count_second',
 'iseq_vendor',
 'iseq_color',
 'iseq_brand',
 'iseq_supp',
 'are_related',
 'image_id_first',
 'image_id_second',
 'url_first',
 'url_second',
 'desc_sim',
 'opt_sim',
 'name_sim',
 'img_sim']

In [40]:
from pathlib import Path

# Distinct SKU counts, embedded in the output file name.
n_query = query_df.sku.nunique()
n_nonquery = nonquery_df.sku.nunique()

# Use the configured tables directory instead of repeating its literal name.
tables_prefix = Path(DATA_PATH) / TABLE_DATASET_DIR
tables_prefix.mkdir(parents=True, exist_ok=True)

# Paired data CSV
file_path_pairs = (
    tables_prefix /
    f'tabular_OZ_geo_5500_top-{TOP_K}_query-{n_query}_nonquery-{n_nonquery}_pairs.csv'
)

# index=False is the documented way to omit the index column.
paired_df_all.to_csv(file_path_pairs, index=False)

In [41]:
from pathlib import Path

# Distinct SKU counts, embedded in the output file name.
n_query = query_df.sku.nunique()
n_nonquery = nonquery_df.sku.nunique()

# Use the configured tables directory instead of repeating its literal name.
tables_prefix = Path(DATA_PATH) / TABLE_DATASET_DIR
tables_prefix.mkdir(parents=True, exist_ok=True)

# Embedded CSV
file_path_embedded = (
    tables_prefix /
    f'tabular_OZ_geo_5500_top-{TOP_K}_query-{n_query}_nonquery-{n_nonquery}_embedded.csv'
)

# index=False is the documented way to omit the index column.
final_df.to_csv(file_path_embedded, index=False)

In [42]:
from huggingface_hub import HfApi, login

# Push the whole local data directory to the Hugging Face dataset repo.
# Files matching the ignore patterns (.jpg and .webp) are left out of the upload.
api = HfApi()
api.upload_folder(
    repo_id="INDEEPA/clip-siamese",
    repo_type="dataset",
    folder_path=DATA_PATH,  # Path to the local directory
    ignore_patterns=['**/*.jpg', "**/*.webp"],
)

CommitInfo(commit_url='https://huggingface.co/datasets/INDEEPA/clip-siamese/commit/9d55687fd4b6127e28c61192dd4fcb1f286c2794', commit_message='Upload folder using huggingface_hub', commit_description='', oid='9d55687fd4b6127e28c61192dd4fcb1f286c2794', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/INDEEPA/clip-siamese', endpoint='https://huggingface.co', repo_type='dataset', repo_id='INDEEPA/clip-siamese'), pr_revision=None, pr_num=None)