# Imports

In [None]:
import pandas as pd

import requests
import os

import joblib
import xgboost as xgb
from datetime import date, timedelta
import numpy as np

import torch
from sentence_transformers import SentenceTransformer, util
from typing import List, Tuple
from PIL import Image
from io import BytesIO
import math

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

# import optuna

from pathlib import Path

In [None]:
try:
    import ruclip
except ModuleNotFoundError:
    !pip install git+https://github.com/tony-pitchblack/ru-clip.git#egg=ruclip
    import ruclip

In [None]:
import ruclip

# Text-only sentence encoder, used later for description/option similarities.
sbert = SentenceTransformer('all-distilroberta-v1')

# ruCLIP model together with its processor, used later for name/image similarities.
clip, processor = ruclip.load(
    'ruclip-vit-base-patch32-384'
)

# Parameters

In [None]:
# Names of the image archive and of the table files inside the dataset repository.
IMG_DATASET_NAME = 'images_OZ_geo_5500'
TABLE_DATASET_DIR = 'tables_OZ_geo_5500'
TABLE_DATASET_FILES = [
    'Ozon_Crawler_Latest_info2025-04-07-12-57-51.xlsx',
    'Карты мира_озон.xlsx',
]

# Number of pairs to sample later on; None keeps every pair
# (set a small integer such as 1 for a quick smoke run).
SUBSET_SIZE = None

# Local root directory for downloaded data.
DATA_PATH = 'data'

# Prefer the GPU whenever torch can see one.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Download data

In [None]:
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [None]:
# Use tokens from .env

import os
from dotenv import load_dotenv

import huggingface_hub
import wandb

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Authenticate against the Hugging Face Hub with the token read from the environment.
HF_TOKEN = os.environ.get("HF_TOKEN")
huggingface_hub.login(token=HF_TOKEN)


In [None]:
# Download models' weights & text/image datasets

from huggingface_hub import snapshot_download
from pathlib import Path

REPO_ID = "INDEEPA/clip-siamese"
LOCAL_DIR = Path("data/train_results")
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir='data',
    allow_patterns=[
        # "train_results/siamese_fitted*.pt",
        *[str(Path(TABLE_DATASET_DIR) / file_name) for file_name in TABLE_DATASET_FILES],
        f"{IMG_DATASET_NAME}.zip"
    ],
)

!unzip -n -q data/{IMG_DATASET_NAME}.zip -d data/

# Prepare data

In [None]:
DATA_PATH = 'data'

# Crawl export; later cells read its sku, description and cover_image columns.
file_path = Path(DATA_PATH).joinpath(
    'tables_OZ_geo_5500',
    'Ozon_Crawler_Latest_info2025-04-07-12-57-51.xlsx',
)

descr_source_df = pd.read_excel(file_path)
descr_source_df.columns.tolist()

In [None]:
import re

def extract_and_convert(col_name):
    """Derive a snake_case name from a column header.

    If the header contains a non-empty parenthesised part, only the text inside
    the first such pair of parentheses is used; otherwise the whole header is
    used. An underscore is inserted before every uppercase ASCII letter that is
    not at the very start, and the result is lower-cased.
    """
    parenthesised = re.search(r'\(([^)]+)\)', col_name)
    camel = parenthesised.group(1) if parenthesised else col_name
    return re.sub(r'(?<!^)(?=[A-Z])', '_', camel).lower()

# Normalise every header of the crawl table to snake_case, in place.
descr_source_df.rename(columns=extract_and_convert, inplace=True)

# Show the headers after renaming.
print("Renamed columns:")
descr_source_df.columns.tolist()

In [None]:
DATA_PATH = 'data'

# Main product table; later cells read seller, sku and the numeric feature columns from it.
file_path = Path(DATA_PATH).joinpath(
    'tables_OZ_geo_5500',
    'Карты мира_озон.xlsx',
)

source_df = pd.read_excel(file_path)
source_df.columns.tolist()

In [None]:
# Full column layout of the pair table: per-product fields for both sides,
# then the equality flags, the similarity scores and the label.
all_required_cols = [
    # Source-column hints kept from the original list:
    # balance <- Balance, rating <- AvgRating, final_price <- DiscountPrice,
    # comments <- Reviews, name <- ProductName.
    *[
        f'{field}_{side}'
        for side in ('first', 'second')
        for field in (
            'balance',
            'sales',
            'rating',
            'final_price',
            'comments',
            'description',
            'name',
            'options',
            'sku',
            'has_video',
            'photo_count',
        )
    ],

    # 'image_url_first',
    # 'image_url_second',

    # Equality flags (iseq_brand <- BrandName; the others were annotated with 0).
    'iseq_vendor',
    'iseq_color',
    'iseq_brand',
    'iseq_supp',
    'are_related',

    # Similarity scores.
    'desc_sim',
    'opt_sim',
    'name_sim',
    'img_sim',

    'label',
]

In [None]:
# Columns kept from the main table (names as they look after normalisation).
required_cols = [
    'balance',
    'sales',
    'final_price',
    'rating',
    'comments',
    # 'description',
    'name',
    # 'options'
    'sku',
    'has_video',
    'pics_count'
]

# Lower-case the headers and replace spaces with underscores, keep the
# required columns, and rename pics_count to photo_count.
new_source_df = (
    source_df
    .rename(columns=lambda col: col.lower().replace(" ", "_"))
    .loc[:, required_cols]
    .rename(columns={'pics_count': 'photo_count'})
)

new_source_df.head(1)

In [None]:
# Extract image id from URL

# str.extract yields NaN for URLs that do not end in "/<digits>.jpg" instead of
# raising AttributeError on a failed match; such rows are then dropped below
# together with the rows that have no cover image at all.
descr_source_df['image_id'] = (
    descr_source_df['cover_image']
    .dropna()
    .astype(str)
    .str.extract(r'/(\d+)\.jpg$', expand=False)
)

# Keep only rows for which an image id could be extracted.
descr_source_df = descr_source_df.dropna(subset=['image_id'])
descr_source_df[['image_id', 'sku']]

In [None]:
# Attach description and image id by SKU; merge's default inner join keeps
# only SKUs present in both tables.
new_source_df = pd.merge(
    new_source_df,
    descr_source_df.loc[:, ['sku', 'description', 'image_id']],
    on='sku',
)

# The options column is a copy of the product name.
new_source_df['options'] = new_source_df['name']
new_source_df.columns.tolist()

In [None]:
# Where the description is missing, fall back to the product name.
new_source_df['description'] = new_source_df['description'].where(
    new_source_df['description'].notna(),
    new_source_df['name'],
)

In [None]:
import pandas as pd

def get_pairs(sku, source=None):
    """
    Pair the row(s) of a target SKU with every other row of a product table.

    In the result:
      - *_first columns come from the target SKU row.
      - *_second columns come from each row whose SKU differs from the target.
      - Equality columns (iseq_vendor, iseq_color, iseq_brand, iseq_supp,
        are_related) are added and all set to 0.

    Parameters:
        sku (int or str): SKU identifier for the target row.
        source (pd.DataFrame, optional): Table to build pairs from. It must
            contain every per-product column listed in ``final_columns`` below
            (without suffix), including ``sku`` and ``image_id``. Defaults to
            the notebook-level ``new_source_df``.

    Returns:
        pd.DataFrame: One row per (target, other) combination, with columns in
        the fixed order given by ``final_columns``.

    Raises:
        ValueError: If no row of the table has the requested SKU.
    """
    # Reading a module-level name needs no `global` statement.
    table = new_source_df if source is None else source

    # Select the target row and the remaining rows
    is_target = table['sku'] == sku
    target_df = table[is_target]
    if target_df.empty:
        raise ValueError(f"SKU {sku} not found in new_source_df")
    rest_df = table[~is_target]

    # Cartesian product of the target row(s) with all other rows; every column
    # exists on both sides, so each one receives a suffix.
    paired_df_all = target_df.merge(
        rest_df,
        how='cross',
        suffixes=('_first', '_second')
    )

    # Add equality columns and set them all to 0
    eq_cols = ['iseq_vendor', 'iseq_color', 'iseq_brand', 'iseq_supp', 'are_related']
    paired_df_all = paired_df_all.assign(**{col: 0 for col in eq_cols})

    # Define desired final order of columns
    final_columns = [
        'balance_first', 'sales_first', 'rating_first', 'final_price_first',
        'comments_first', 'description_first', 'name_first', 'options_first',
        'sku_first', 'has_video_first', 'photo_count_first',

        'balance_second', 'sales_second', 'rating_second', 'final_price_second',
        'comments_second', 'description_second', 'name_second', 'options_second',
        'sku_second', 'has_video_second', 'photo_count_second',

        *eq_cols,

        'image_id_first', 'image_id_second'
    ]

    return paired_df_all[final_columns]

In [None]:
# SKUs of the query seller; every one of them gets paired with all other SKUs.
query_skus = source_df.loc[source_df['seller'] == 'ИНТЕРТРЕЙД', 'sku'].tolist()
len(query_skus)

In [None]:
# Build every per-SKU pair frame first and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on each iteration.
pair_frames = [get_pairs(sku) for sku in query_skus]

# pd.concat raises on an empty list, so keep an empty frame when there are no query SKUs.
paired_df_all = (
    pd.concat(pair_frames, ignore_index=True)
    if pair_frames
    else pd.DataFrame()
)

paired_df_all.shape

# Add embedding distances

In [None]:
# Take a subset (all pairs when SUBSET_SIZE is None); sampling is seeded, and
# the sampled frame keeps the index labels of paired_df_all.
if SUBSET_SIZE is None:
    subset_size = len(paired_df_all)
else:
    subset_size = SUBSET_SIZE

paired_df = paired_df_all.sample(n=subset_size, random_state=42)
len(paired_df), len(paired_df_all)

In [None]:
# Compute description and option similarities

BATCH_SIZE = 768 if torch.cuda.is_available() else 8


def _paired_text_similarity(first_texts, second_texts):
    """Return the cosine similarity between the i-th texts of two equally long lists.

    Both lists are embedded with the notebook-level `sbert` model. The
    similarity is computed row by row, which needs memory proportional to the
    number of pairs; building the full all-vs-all matrix and reading its
    diagonal gives the same values but needs memory quadratic in it.

    Returns:
        np.ndarray: 1-D array with one similarity per pair, in input order.
    """
    emb_first = sbert.encode(
        first_texts,
        convert_to_tensor=True,
        show_progress_bar=True,
        batch_size=BATCH_SIZE
    )
    emb_second = sbert.encode(
        second_texts,
        convert_to_tensor=True,
        show_progress_bar=True,
        batch_size=BATCH_SIZE
    )
    pair_sim = torch.nn.functional.cosine_similarity(emb_first, emb_second, dim=1)
    return pair_sim.cpu().numpy()


desc_first, opt_first = paired_df.description_first, paired_df.options_first
desc_second, opt_second = paired_df.description_second, paired_df.options_second

# One value per row of paired_df, in the row order of paired_df.
desc_sim = _paired_text_similarity(desc_first.tolist(), desc_second.tolist())
opt_sim = _paired_text_similarity(opt_first.tolist(), opt_second.tolist())

In [None]:
import os
from PIL import Image
from io import BytesIO

def get_sku_image_offline(
    sku_or_image_id,
    img_dataset_dir='data/images_7k'
):
    """
    Load the image stored under a given SKU / image id in a local directory.

    Candidate files are "<id>.jpg" and then "<id>.webp"; the first existing
    file that opens and decodes successfully is returned.

    Parameters:
        sku_or_image_id (int or str): Identifier used as the file name stem.
        img_dataset_dir (str): Directory path where images are stored.

    Returns:
        Image object if found and opened; otherwise, None. A file that exists
        but fails to load is reported with print() and the next extension is tried.
    """
    candidate_paths = (
        os.path.join(img_dataset_dir, f"{sku_or_image_id}{ext}")
        for ext in ('.jpg', '.webp')
    )

    for img_path in candidate_paths:
        if not os.path.exists(img_path):
            continue
        try:
            with open(img_path, 'rb') as fh:
                raw_bytes = fh.read()
            picture = Image.open(BytesIO(raw_bytes))
            # Force full decoding now so that broken files fail here.
            picture.load()
        except Exception as e:
            print(f"Error loading {img_path}: {e}")
        else:
            return picture
    return None

# get_sku_image_offline(
#     paired_df['image_id_first'].sample(1).item(),
#     'data/images_OZ_geo_5500'
# )

In [None]:
# Load images & names

def get_images_names(
    df,
    image_id_col_first = 'sku_first',
    image_id_col_second = 'sku_second',
    img_dataset_dir='../data/images_7k',
    offline=True,
) -> Tuple[List[Image.Image], List[object], List[object]]:
    """
    Load both images of every pair in `df` together with the product names.

    Parameters:
        df (pd.DataFrame): Pair table with the two image id columns plus
            `name_first` and `name_second`.
        image_id_col_first (str): Column holding the id of the first image.
        image_id_col_second (str): Column holding the id of the second image.
        img_dataset_dir (str): Directory used when `offline` is True.
        offline (bool): Load images from `img_dataset_dir` when True.

    Returns:
        (images, names, problems):
          - images: flat list [first_0, second_0, first_1, second_1, ...]
            containing only pairs for which both images loaded;
          - names: product names in the same interleaved order;
          - problems: index labels of the `df` rows for which at least one
            image could not be loaded.
    """
    images, names, problems = [], [], []
    for row_num, row in df.iterrows():
        first_id = int(row[image_id_col_first])
        second_id = int(row[image_id_col_second])

        if offline:
            img1 = get_sku_image_offline(first_id, img_dataset_dir)
            img2 = get_sku_image_offline(second_id, img_dataset_dir)
        else:
            # NOTE(review): get_sku_image is not defined in the visible part of
            # this notebook - confirm it exists before using offline=False.
            img1 = get_sku_image(first_id)
            img2 = get_sku_image(second_id)

        if img1 is None or img2 is None:
            # Record the row label (not its position) so the caller can drop it.
            problems.append(row_num)
            continue

        images.extend((img1, img2))
        names.extend((row.name_first, row.name_second))
    return images, names, problems

# Load both images of every pair from the unpacked image dataset;
# problems_ids receives the index labels of pairs with a missing image.
images, names, problems_ids = get_images_names(
    paired_df,
    img_dataset_dir='data/images_OZ_geo_5500',
    image_id_col_first='image_id_first',
    image_id_col_second='image_id_second',
)

print(f'Images loaded: {len(images)}')
print(f'Images not loaded: {len(problems_ids)}')

In [None]:
# Delete problematic ids
# problems_ids holds index *labels* of paired_df, while desc_sim / opt_sim are
# plain arrays ordered like the rows of paired_df. Because paired_df keeps the
# labels of the frame it was sampled from, a label is not a row position, so
# the arrays are filtered with a positional boolean mask instead of np.delete.
problem_mask = paired_df.index.isin(problems_ids)

paired_df = paired_df[~problem_mask]
desc_sim = desc_sim[~problem_mask]
opt_sim = opt_sim[~problem_mask]

In [None]:
classes = list(names)

# Prompt templates passed to the ruCLIP predictor.
templates = ['{}', 'это {}', 'на картинке {}', 'товар {}']

predictor = ruclip.Predictor(clip, processor, DEVICE, bs=8, templates=templates)
with torch.no_grad():
    text_latents = predictor.get_text_latents(classes)
    images_latents = predictor.get_image_latents(images)


def _pair_similarity(latents, start):
    """Cosine similarity between rows `start` and `start + 1` of `latents`."""
    return util.cos_sim(latents[start], latents[start + 1]).cpu().numpy().squeeze()


# names / images are interleaved (first, second, first, second, ...), so each
# pair starts at an even offset.
pair_starts = range(0, text_latents.shape[0], 2)
name_sim = [_pair_similarity(text_latents, start) for start in pair_starts]
img_sim = [_pair_similarity(images_latents, start) for start in pair_starts]

print(len(name_sim))
print(len(img_sim))

# One row per pair, columns in the order: desc, opt, name, img.
scores = np.c_[desc_sim, opt_sim, name_sim, img_sim]

In [None]:
score_columns = ['desc_sim', 'opt_sim', 'name_sim', 'img_sim']

# The rows of `scores` follow the row order of paired_df, but paired_df keeps
# the index labels of the frame it was sampled from. Give scores_df the same
# index so that concat(axis=1), which aligns on index labels, places each score
# next to its own pair instead of misaligning rows or adding all-NaN rows.
scores_df = pd.DataFrame(scores, columns=score_columns, index=paired_df.index)

final_df = pd.concat(
    [
        # Drop stale score columns, if any, so they are not duplicated.
        paired_df.drop(columns=score_columns, errors='ignore'),
        scores_df
    ],
    axis=1
)
final_df.head(1)

In [None]:
# Show the column layout of the assembled table.
final_df.columns.to_list()

In [None]:
# Write the assembled table (index included) next to the source tables.
file_path = Path(DATA_PATH).joinpath(
    'tables_OZ_geo_5500',
    'tabular_OZ_geo_5500.csv',
)

final_df.to_csv(file_path)