# Installs & tokens

In [1]:
%%capture
try:
    import mlflow
except ImportError:
    !pip install mlflow

In [2]:
%%capture
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [4]:
# Log into huggingface via Kaggle Secrets or .env

import os
from dotenv import load_dotenv
import huggingface_hub

try:
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
except ModuleNotFoundError:
    print("Not Kaggle environment. Skipping Kaggle secrets.")
    print("Trying to load HF_TOKEN from .env.")
    load_dotenv()
    HF_TOKEN = os.getenv("HF_TOKEN")
    print("Success!")

huggingface_hub.login(token=HF_TOKEN)

Not Kaggle environment. Skipping Kaggle secrets.
Trying to load HF_TOKEN from .env.
Success!


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Choose notebook parameters

In [5]:
import torch

## CHOOSE MODEL PARAMETERS #################################################

MODEL_NAME_POSTFIX='splitting-by-query'

NAME_MODEL_NAME = 'cointegrated/rubert-tiny' # 'DeepPavlov/distilrubert-tiny-cased-conversational-v1'
DESCRIPTION_MODEL_NAME = 'cointegrated/rubert-tiny'

DATA_PATH = 'data/'
RESULTS_DIR = 'train_results/'

# BATCH_SIZE=60 # uses 14.5GiB of 1 GPU
# NUM_WORKERS=2 # TODO: use multiple GPU, tune number of workers
# NUM_DEBUG_SAMPLES=None
# EPOCHS=10 # epochs > 8 => overfit; NOTE: can train for longer since we take best validation checkpoint anyway

BATCH_SIZE=1
NUM_WORKERS=0
NUM_DEBUG_SAMPLES=2
EPOCHS=2

PRELOAD_MODEL_NAME = 'cc12m_rubert_tiny_ep_1.pt' # preload ruclip
# PRELOAD_MODEL_NAME = None

POS_WEIGHT = 4.0 # TODO: infer from data

# USE_ALL_TRAIN_PAIRS = False
# MAX_SAMPLES_PER_EPOCH = None

USE_ALL_TRAIN_PAIRS = True
MAX_SAMPLES_PER_EPOCH = 2_500
# MAX_SAMPLES_PER_EPOCH = 2_500 * 12

DROPOUT = 0.5
# DROPOUT = None

# BEST_CKPT_METRIC = 'f1'
BEST_CKPT_METRIC = 'pos_acc'

VALIDATION_SPLIT=.05
TEST_SPLIT=.1
RANDOM_SEED=42
LR=9e-5
MOMENTUM=0.9
WEIGHT_DECAY=1e-2
CONTRASTIVE_MARGIN=1.5
CONTRASTIVE_THRESHOLD=0.3
SHEDULER_PATIENCE=3 # in epochs

DEVICE='cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
## CHOOSE DATA #########################################################

DATA_PATH=  'data/'
SOURCE_TABLE_NAME = 'tables_OZ_geo_5500/processed/OZ_geo_5500.csv'

# --- Load source_df and pairwise_mapping_df from Parquet ---
SOURCE_TABLE_NAME = 'tables_OZ_geo_5500/processed/OZ_geo_5500.csv'
PAIRWISE_TABLE_NAME = 'tables_OZ_geo_5500/processed/regex-pairwise-groups/regex-pairwise-groups_num-queries=20_patterns-dict-hash=6dbf9b3ef9568e60cd959f87be7e3b26.parquet'
IMG_DATASET_NAME = 'images_OZ_geo_5500'

In [7]:
## LOGGING PARAMS ######################################################################

# MLFLOW_URI = "http://176.56.185.96:5000"
# MLFLOW_URI = "http://localhost:5000"
MLFLOW_URI = None

MLFLOW_EXPERIMENT = "siamese/1fold"

TELEGRAM_TOKEN = None
# TELEGRAM_TOKEN = '' # set token to get notifications

# Definitions

In [32]:
# Imports
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import mlflow
from mlflow.models import infer_signature

from timm import create_model
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
from torch import optim, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision import transforms
from torchinfo import summary
# import transformers
# from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer,\
#         get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer

import cv2

from PIL import Image
from tqdm.auto import tqdm

# import json
# from itertools import product

# import datasets
# from datasets import Dataset, concatenate_datasets
# import argparse
import requests

# from io import BytesIO
# from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
# import more_itertools

from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics import f1_score
import mlflow
from torch.optim.lr_scheduler import ReduceLROnPlateau
import matplotlib.pyplot as plt
import tempfile

In [9]:
def make_tg_report(text, token=None) -> None:
    method = 'sendMessage'
    chat_id = 324956476
    _ = requests.post(
            url='https://api.telegram.org/bot{0}/{1}'.format(token, method),
            data={'chat_id': chat_id, 'text': text} 
        ).json()

In [10]:
class RuCLIPtiny(nn.Module):
    def __init__(self, name_model_name):
        super().__init__()
        self.visual = create_model('convnext_tiny',
                                   pretrained=False, # TODO: берём претрейн
                                   num_classes=0,
                                   in_chans=3)  # out 768

        self.transformer = AutoModel.from_pretrained(name_model_name)
        name_model_output_shape = self.transformer.config.hidden_size  # dynamically get hidden size
        self.final_ln = torch.nn.Linear(name_model_output_shape, 768)  # now uses the transformer hidden size
        self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

    @property
    def dtype(self):
        return self.visual.stem[0].weight.dtype

    def encode_image(self, image):
        return self.visual(image.type(self.dtype))

    def encode_text(self, input_ids, attention_mask):
        x = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        x = x.last_hidden_state[:, 0, :]
        x = self.final_ln(x)
        return x

    def forward(self, image, input_ids, attention_mask):
        image_features = self.encode_image(image)
        text_features = self.encode_text(input_ids, attention_mask)

        # normalized features
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        return logits_per_image, logits_per_text

In [11]:
def get_transform():
    return transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        _convert_image_to_rgb,
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]), ])

def _convert_image_to_rgb(image):
    return image.convert("RGB")

class Tokenizers:
    def __init__(self):
        self.name_tokenizer = AutoTokenizer.from_pretrained(NAME_MODEL_NAME)
        self.desc_tokenizer = AutoTokenizer.from_pretrained(DESCRIPTION_MODEL_NAME)

    def tokenize_name(self, texts, max_len=77):
        tokenized = self.name_tokenizer.batch_encode_plus(texts,
                                                     truncation=True,
                                                     add_special_tokens=True,
                                                     max_length=max_len,
                                                     padding='max_length',
                                                     return_attention_mask=True,
                                                     return_tensors='pt')
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])
    
    def tokenize_description(self, texts, max_len=77):
        tokenized = self.desc_tokenizer(texts,
                                        truncation=True,
                                        add_special_tokens=True,
                                        max_length=max_len,
                                        padding='max_length',
                                        return_attention_mask=True,
                                        return_tensors='pt')
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])

class SiameseRuCLIPDataset(torch.utils.data.Dataset):
    def __init__(self, df=None, labels=None, df_path=None, images_dir=DATA_PATH+'images/'):
        # loads data either from path using `df_path` or directly from `df` argument
        self.df = pd.read_csv(df_path) if df_path is not None else df
        self.labels = labels
        self.images_dir = images_dir
        self.tokenizers = Tokenizers()
        self.transform = get_transform()
        # 
        self.max_len = 77
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        name_tokens = self.tokenizers.tokenize_name([str(row.name_first), 
                                               str(row.name_second)], max_len=self.max_len)
        name_first = name_tokens[:, 0, :] # [input_ids, attention_mask]
        name_second = name_tokens[:, 1, :]
        desc_tokens = self.tokenizers.tokenize_description([str(row.description_first), 
                                               str(row.description_second)])
        desc_first = desc_tokens[:, 0, :] # [input_ids, attention_mask]
        desc_second = desc_tokens[:, 1, :]
        im_first = cv2.imread(os.path.join(self.images_dir, row.image_name_first))
        im_first = cv2.cvtColor(im_first, cv2.COLOR_BGR2RGB)
        im_first = Image.fromarray(im_first)
        im_first = self.transform(im_first)
        im_second = cv2.imread(os.path.join(self.images_dir, row.image_name_second))
        im_second = cv2.cvtColor(im_second, cv2.COLOR_BGR2RGB)
        im_second = Image.fromarray(im_second)
        im_second = self.transform(im_second)
        label = self.labels[idx]
        return im_first, name_first, desc_first, im_second, name_second, desc_second, label

    def __len__(self,):
        return len(self.df)

In [12]:
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

class SiameseRuCLIP(nn.Module):
    def __init__(self,
                 device: str,
                 name_model_name: str,
                 description_model_name: str,
                 preload_model_name: str = None,
                 models_dir: str = None,
                 dropout: float = None):
        """
        Initializes the SiameseRuCLIP model.
        Required parameters:
          - models_dir: directory containing saved checkpoints.
          - name_model_name: model name for text (name) branch.
          - description_model_name: model name for description branch.
        """
        super().__init__()
        device = torch.device(device)

        # Initialize RuCLIPtiny
        self.ruclip = RuCLIPtiny(name_model_name)
        if preload_model_name is not None:
            std = torch.load(
                os.path.join(models_dir, preload_model_name),
                weights_only=True,
                map_location=device
            )
            self.ruclip.load_state_dict(std)
            self.ruclip.eval()
        self.ruclip = self.ruclip.to(device)

        # Initialize the description transformer
        self.description_transformer = AutoModel.from_pretrained(description_model_name)
        self.description_transformer = self.description_transformer.to(device)

        # Determine dimensionality
        vision_dim = self.ruclip.visual.num_features
        name_dim = self.ruclip.final_ln.out_features
        desc_dim = self.description_transformer.config.hidden_size
        self.hidden_dim = vision_dim + name_dim + desc_dim
        self.dropout = dropout

        # Define MLP head with optional dropout
        layers = [
            nn.Linear(self.hidden_dim, self.hidden_dim // 2),
            nn.ReLU(),
            *( [nn.Dropout(self.dropout)] if self.dropout is not None else [] ),
            nn.Linear(self.hidden_dim // 2, self.hidden_dim // 4),
        ]
        self.head = nn.Sequential(*layers).to(device)


    def encode_image(self, image):
        return self.ruclip.encode_image(image)

    def encode_name(self, name):
        return self.ruclip.encode_text(name[:, 0, :], name[:, 1, :])

    def encode_description(self, desc):
        last_hidden_states = self.description_transformer(desc[:, 0, :], desc[:, 1, :]).last_hidden_state
        attention_mask = desc[:, 1, :]
        return average_pool(last_hidden_states, attention_mask)

    def get_final_embedding(self, im, name, desc):
        image_emb = self.encode_image(im)
        name_emb = self.encode_name(name)
        desc_emb = self.encode_description(desc)

        # Concatenate the embeddings and forward through the head
        combined_emb = torch.cat([image_emb, name_emb, desc_emb], dim=1)
        final_embedding = self.head(combined_emb)
        return final_embedding

    def forward(self, im1, name1, desc1, im2, name2, desc2):
        out1 = self.get_final_embedding(im1, name1, desc1)
        out2 = self.get_final_embedding(im2, name2, desc2)
        return out1, out2

In [13]:
# # old
# class ContrastiveLoss(torch.nn.Module):
#     def __init__(self, margin=2.0):
#         super(ContrastiveLoss, self).__init__()
#         self.margin = margin
        
#     def __name__(self,):
#         return 'ContrastiveLoss'

#     def forward(self, output1, output2, label):
#         euclidean_distance = F.pairwise_distance(output1, output2)
#         pos = (1-label) * torch.pow(euclidean_distance, 2)
#         neg = label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2)
#         loss_contrastive = torch.mean( pos + neg )
#         return loss_contrastive

In [14]:
class ContrastiveLoss(torch.nn.Module):
    def __init__(self, margin: float = 1.5, pos_weight: float = 4.0):
        super().__init__()
        self.margin      = margin
        self.pos_weight  = pos_weight

    def forward(self, output1, output2, label):
        d   = F.pairwise_distance(output1, output2)
        pos = (1 - label) * d.pow(2)                            # duplicates (label==0)
        neg = label * F.relu(self.margin - d).pow(2)            # different (label==1)
        return (self.pos_weight * pos + neg).mean()

In [15]:
# TODO: plot epoch after each train epoch in `train()`

from pathlib import Path

def plot_epoch(loss_history, filename="data/runs_artifacts/epoch_loss.png") -> None:
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    display.clear_output(wait=True)
    plt.figure(figsize=(6, 4))
    plt.title("Training loss")
    plt.xlabel("Iteration number")
    plt.ylabel("Loss")
    plt.plot(loss_history, 'b')
    plt.tight_layout()
    plt.savefig(filename)  # Save the plot to a file
    plt.show()

In [16]:
def evaluate_pair(output1, output2, target, threshold):
    euclidean_distance = F.pairwise_distance(output1, output2)
    # меньше границы, там где будет True — конкуренты
    cond = euclidean_distance < threshold
    pos_sum = 0
    neg_sum = 0
    pos_acc = 0
    neg_acc = 0

    for i in range(len(cond)):
        # 1 значит не конкуренты
        if target[i]:
            neg_sum+=1
            # 0 в cond значит дальше друг от друга чем threshold
            if not cond[i]:
                neg_acc+=1
        elif not target[i]:
            pos_sum+=1
            if cond[i]:
                pos_acc+=1

    return pos_acc, pos_sum, neg_acc, neg_sum

def predict(out1, out2, threshold=CONTRASTIVE_THRESHOLD):
    # вернёт 1 если похожи
    return F.pairwise_distance(out1, out2) < threshold

In [17]:
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
import mlflow
from copy import deepcopy
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pathlib import Path

def validation(model,
               criterion,
               data_loader,
               epoch,
               device='cpu',
               split_name='validation',
               threshold=None,
               margin=1.5,
               steps=200,
               metric='f1'):
    """
    Runs one pass over `data_loader`, returning:
      pos_acc, neg_acc, avg_acc, f1, avg_loss, best_thr

    threshold sweep: if threshold is None, tests `steps` values in [0, margin]
    and picks the one that maximises either:
      - F1              if metric=='f1'
      - positive accuracy if metric=='pos_acc'
    """
    assert metric in ('f1', 'pos_acc'), "metric must be 'f1' or 'pos_acc'"

    model.eval()
    total_loss = 0.0
    all_d, all_lbl = [], []

    with torch.no_grad():
        for batch in data_loader:
            im1, n1, d1, im2, n2, d2, lbl = [t.to(device) for t in batch]
            out1, out2 = model(im1, n1, d1, im2, n2, d2)
            total_loss += criterion(out1, out2, lbl).item()
            all_d.append(F.pairwise_distance(out1, out2).cpu())
            all_lbl.append(lbl.cpu())

    distances = torch.cat(all_d)
    labels    = torch.cat(all_lbl)               # 0 = duplicate (positive), 1 = different (negative)
    avg_loss  = total_loss / len(data_loader)

    # === threshold sweep ===
    if threshold is None:
        grid = np.linspace(0.0, margin, steps)
        best_val, best_thr = -1.0, 0.0
        y_true = (labels.numpy() == 0).astype(int)   # 1 = positive
        for t in grid:
            y_pred = (distances.numpy() < t).astype(int)
            if metric == 'f1':
                val = f1_score(y_true, y_pred, zero_division=0)
            else:  # metric == 'pos_acc'
                # positive accuracy = TP / P
                pos_mask = (y_true == 1)
                val = (y_pred[pos_mask] == 1).mean() if pos_mask.sum() > 0 else 0.0
            if val > best_val:
                best_val, best_thr = val, t
        threshold = best_thr
    else:
        best_thr = threshold

    # === final metrics at chosen threshold ===
    preds    = (distances < threshold).long()
    pos_mask = (labels == 0)
    neg_mask = (labels == 1)

    pos_acc = (preds[pos_mask] == 1).float().mean().item() if pos_mask.any() else 0.0
    neg_acc = (preds[neg_mask] == 0).float().mean().item() if neg_mask.any() else 0.0
    avg_acc = (pos_acc + neg_acc) / 2.0
    f1      = f1_score((labels.numpy() == 0).astype(int),
                       preds.numpy(), zero_division=0)

    # log to Telegram / console
    report = (f"[{split_name}] Epoch {epoch} – "
              f"loss: {avg_loss:.4f}, "
              f"P Acc: {pos_acc:.3f}, "
              f"N Acc: {neg_acc:.3f}, "
              f"Avg Acc: {avg_acc:.3f}, "
              f"F1: {f1:.3f}, "
              f"thr*: {threshold:.3f} "
              f"(optimised: {metric})")
    print(report)
    make_tg_report(report, TELEGRAM_TOKEN)

    # log to MLflow under the chosen metric
    if MLFLOW_URI and split_name == 'validation':
        if metric == 'f1':
            mlflow.log_metric("valid_f1_score", f1, step=epoch)
        else:
            mlflow.log_metric("valid_pos_accuracy", pos_acc, step=epoch)

    return pos_acc, neg_acc, avg_acc, f1, avg_loss, threshold


In [18]:
from time import perf_counter
from datetime import timedelta

def train(model,
          optimizer,
          criterion,
          epochs_num,
          train_loader,
          valid_loader=None,
          device='cpu',
          print_epoch=False,
          models_dir=None,
          metric='f1'):
    """
    Trains for `epochs_num` epochs, using `validation(..., metric=metric)` each epoch.
    Uses the same `metric` to step the LR scheduler and to pick the best checkpoint.

    Returns:
      train_losses, val_losses, best_valid_metric, best_weights, thr_history
    """
    assert metric in ('f1', 'pos_acc'), "metric must be 'f1' or 'pos_acc'"

    model.to(device)
    train_losses, val_losses, thr_history = [], [], []
    best_valid_metric, best_threshold = float('-inf'), None
    best_weights = None

    scheduler = ReduceLROnPlateau(
        optimizer,
        mode="max",
        factor=0.1,
        patience=SHEDULER_PATIENCE,
        threshold=1e-4,
        threshold_mode='rel'
    )

    if models_dir:
        Path(models_dir).mkdir(parents=True, exist_ok=True)

    for epoch in range(1, epochs_num + 1):
        # ---- training ----
        model.train()
        total_train_loss = 0.0
        for batch in train_loader:
            im1, n1, d1, im2, n2, d2, lbl = [t.to(device) for t in batch]
            optimizer.zero_grad()
            out1, out2 = model(im1, n1, d1, im2, n2, d2)
            loss = criterion(out1, out2, lbl)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        train_losses.append(total_train_loss / len(train_loader))

        # ---- validation & checkpointing ----
        if print_epoch and valid_loader is not None:
            pos_acc, neg_acc, avg_acc, f1_val, val_loss, val_thr = validation(
                model,
                criterion,
                valid_loader,
                epoch,
                device=device,
                split_name='validation',
                threshold=None,
                margin=CONTRASTIVE_MARGIN,
                steps=200,
                metric=metric
            )
            val_losses.append(val_loss)
            thr_history.append(val_thr)

            # pick the metric value to step & compare
            cur_metric = pos_acc if metric == 'pos_acc' else f1_val
            scheduler.step(cur_metric)

            # save checkpoint every epoch if requested
            if models_dir:
                torch.save(model.state_dict(),
                           Path(models_dir) / f"checkpoint_epoch_{epoch}.pt")

            # update best if improved
            if cur_metric > best_valid_metric:
                best_valid_metric = cur_metric
                best_threshold     = val_thr
                best_weights       = deepcopy(model.state_dict())

        print(f'Epoch {epoch} done.')

    print(f"Best validation {metric}: {best_valid_metric:.3f}  (thr={best_threshold:.3f})")
    return train_losses, val_losses, best_valid_metric, best_weights, thr_history

# Prepare data

## Download data from HF

In [19]:
# Download models' weights & text/image datasets

from huggingface_hub import snapshot_download
from pathlib import Path

REPO_ID = "INDEEPA/clip-siamese"
LOCAL_DIR = Path("data/train_results")
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir='data',
    allow_patterns=[
        "train_results/cc12m*.pt",
        SOURCE_TABLE_NAME, PAIRWISE_TABLE_NAME,
        f"{IMG_DATASET_NAME}.zip"
    ],
)

# The following shell command was removed for script compatibility:
# !unzip -n -q data/{IMG_DATASET_NAME}.zip -d data/

# If you need to unzip in Python, use:
# import zipfile
# with zipfile.ZipFile(f"data/{IMG_DATASET_NAME}.zip", 'r') as zip_ref:
#     zip_ref.extractall("data/")


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

'/home/anton/marketplace/clip-siamese/data'

In [24]:
source_df = pd.read_csv(DATA_PATH + SOURCE_TABLE_NAME)
source_df.columns.tolist()

['sku',
 'description',
 'image_url',
 'name',
 'category',
 'схема',
 'brand',
 'niche',
 'seller',
 'balance_fbo',
 'balance_fbs',
 'warehouses_count',
 'comments',
 'final_price',
 'max_price',
 'min_price',
 'average_price',
 'median_price',
 'membership_card_price',
 'sales',
 'revenue',
 'revenue_potential',
 'revenue_average',
 'lost_profit',
 'lost_profit_percent',
 'url',
 'thumb',
 'pics_count',
 'has_video',
 'first_date',
 'days_in_website',
 'days_in_stock',
 'days_with_sales',
 'average_if_in_stock',
 'rating',
 'fbs',
 'base_price',
 'category_position',
 'categories_last_count',
 'sales_per_day_average',
 'sales.1',
 'frozen_stocks',
 'frozen_stocks_cost',
 'frozen_stocks_percent',
 'balance',
 'image_name']

In [23]:
pairwise_mapping_df = pd.read_parquet(DATA_PATH + PAIRWISE_TABLE_NAME)
pairwise_mapping_df.columns.tolist()

['sku_query', 'sku_pos', 'sku_hard_neg', 'sku_soft_neg']

# Cluster soft negatives

In [42]:
# List embeddings files in repo
FILTER_STRING = 'name-and-description_embeddings'

from huggingface_hub import list_repo_files

emb_files = [name for name in list_repo_files("INDEEPA/clip-siamese", repo_type="dataset") if FILTER_STRING in name and "OZ_geo_5500" in name]
for file in emb_files:
    display(file)

'embeddings/OZ_geo_5500/OZ_geo_5500_name-and-description_embeddings_num-rows=2.parquet'

'embeddings/OZ_geo_5500/OZ_geo_5500_name-and-description_embeddings_num-rows=5562.parquet'

In [43]:
# Suggest the correct path to the embedding file based on the context and previous file saving logic
CHOSEN_EMBEDDING_FILE = 'OZ_geo_5500_name-and-description_embeddings_num-rows=5562.parquet'

In [47]:
from huggingface_hub import hf_hub_download
import pandas as pd

# Download the chosen embedding file from HuggingFace Hub to DATA_PATH
from pathlib import Path

downloaded_emb_file = hf_hub_download(
    repo_id="INDEEPA/clip-siamese",
    repo_type="dataset",
    filename=f'embeddings/OZ_geo_5500/{CHOSEN_EMBEDDING_FILE}',
    local_dir=DATA_PATH,
)

print(f"Downloaded embedding file to:\n{downloaded_emb_file}")
emb_table = pd.read_parquet(downloaded_emb_file)
emb_table.head()

Downloaded embedding file to:
data/embeddings/OZ_geo_5500/OZ_geo_5500_name-and-description_embeddings_num-rows=5562.parquet


Unnamed: 0,sku,name_desc_emb
0,1871769771,"[-0.020089346915483475, -0.05487045273184776, ..."
1,1679550303,"[-0.00418242160230875, -0.04088427498936653, 0..."
2,1200553001,"[-0.023978281766176224, -0.05447990819811821, ..."
3,922231521,"[-0.024106157943606377, -0.053567297756671906,..."
4,922230517,"[-0.02229023538529873, -0.05309479311108589, -..."


In [56]:
from sklearn.cluster import HDBSCAN
import numpy as np

# Prepare the embeddings as a numpy array
embeddings = np.stack(emb_table['name_desc_emb'].values)

# Run HDBSCAN clustering using sklearn's implementation
# Use coarser clustering: increase min_cluster_size and set min_samples for more robust, larger clusters
clusterer = HDBSCAN(
    min_samples=2,
    metric='cosine',
)

cluster_labels = clusterer.fit_predict(embeddings)

# Add cluster labels to the emb_table and assign to cluster_emb_table
cluster_emb_table = emb_table.copy()
cluster_emb_table['cluster_id'] = cluster_labels

# Print cluster label counts
print("Cluster label counts:")
display(cluster_emb_table['cluster_id'].value_counts().to_frame().T)

Cluster label counts:


cluster_id,-1,396,1,98,216,389,383,159,495,269,...,429,418,12,303,138,424,373,490,428,459
count,1244,64,47,42,38,30,29,27,26,25,...,5,5,5,5,5,5,5,5,5,5


In [57]:
# Print cluster ids with size > N
N = 20  # You can change N to any desired threshold
cluster_counts = cluster_emb_table['cluster_id'].value_counts()
large_clusters = cluster_counts[cluster_counts > N].to_frame()
print(f"Cluster IDs with size > {N}:")
display(large_clusters.T)


Cluster IDs with size > 20:


cluster_id,-1,396,1,98,216,389,383,159,495,269,...,19,492,102,198,427,163,181,384,239,116
count,1244,64,47,42,38,30,29,27,26,25,...,24,23,23,22,22,21,21,21,21,21


In [58]:
# Print SKUs for a given CLUSTER_ID
CLUSTER_ID = 396  # Change this to the desired cluster id

skus_in_cluster = cluster_emb_table.loc[cluster_emb_table['cluster_id'] == CLUSTER_ID, 'sku']
print(f"SKUs in cluster {CLUSTER_ID}:")
display(skus_in_cluster.tolist()[:10])

SKUs in cluster 396:


[1867702733,
 1855727917,
 1849927012,
 1849926941,
 1757684675,
 1747398740,
 1726148392,
 1726148384,
 1596943291,
 1596943084]

# Make pairwise dataset

In [84]:
def split_query_groups_adaptive(
    mapping_df: pd.DataFrame,
    test_size: float = 0.2,
    val_size: float = 0.05,
    random_state: int = 42,
    min_positives_for_3way: int = 6  # minimum positives (including query) for 3-way split
):
    """
    Adaptive splitting based on number of positives:
    - ≥6 positives: train/val/test split
    - 3-5 positives: train/test split (no validation)
    - <3 positives: all in test split only
    """
    rng = np.random.default_rng(random_state)
    split_rows = []

    for _, row in mapping_df.iterrows():
        q = row['sku_query']
        total_positives = len(row['sku_pos'])  # includes query SKU
        
        pos_without_query = set(row['sku_pos']) - {q}
        hard_neg = set(row['sku_hard_neg']) - {q}
        soft_neg = set(row['sku_soft_neg']) - {q}

        def split_list(lst, test_frac, val_frac=None):
            lst = np.array(list(lst))
            n = len(lst)
            if val_frac is None:  # 2-way split
                n_test = int(np.ceil(test_frac * n))
                idx = rng.permutation(n)
                test_idx = idx[:n_test]
                train_idx = idx[n_test:]
                return lst[train_idx].tolist(), [], lst[test_idx].tolist()
            else:  # 3-way split
                n_test = int(np.ceil(test_frac * n))
                n_val = int(np.ceil(val_frac * n))
                idx = rng.permutation(n)
                test_idx = idx[:n_test]
                val_idx = idx[n_test:n_test+n_val]
                train_idx = idx[n_test+n_val:]
                return lst[train_idx].tolist(), lst[val_idx].tolist(), lst[test_idx].tolist()

        if total_positives >= min_positives_for_3way:
            # 3-way split: train/val/test
            pos_train, pos_val, pos_test = split_list(pos_without_query, test_size, val_size)
            hard_train, hard_val, hard_test = split_list(hard_neg, test_size, val_size)
            soft_train, soft_val, soft_test = split_list(soft_neg, test_size, val_size)
            pos_test.append(q)
            
            splits_to_create = ['train', 'val', 'test']
            pos_lists = [pos_train, pos_val, pos_test]
            hard_lists = [hard_train, hard_val, hard_test]
            soft_lists = [soft_train, soft_val, soft_test]
            
        elif total_positives >= 3:
            # 2-way split: train/test only (no validation)
            pos_train, _, pos_test = split_list(pos_without_query, test_size)
            hard_train, _, hard_test = split_list(hard_neg, test_size)
            soft_train, _, soft_test = split_list(soft_neg, test_size)
            pos_test.append(q)
            
            splits_to_create = ['train', 'test']
            pos_lists = [pos_train, pos_test]
            hard_lists = [hard_train, hard_test]
            soft_lists = [soft_train, soft_test]
            
        else:
            # Too few positives: put everything in test split only
            splits_to_create = ['test']
            pos_lists = [list(pos_without_query) + [q]]
            hard_lists = [list(hard_neg)]
            soft_lists = [list(soft_neg)]

        # Create the split rows
        for split_name, pos_list, hard_list, soft_list in zip(
            splits_to_create, pos_lists, hard_lists, soft_lists
        ):
            split_rows.append({
                'sku_query': q,
                'split': split_name,
                'sku_pos': pos_list,
                'sku_hard_neg': hard_list,
                'sku_soft_neg': soft_list
            })

    split_df = pd.DataFrame(split_rows)
    split_dict = {
        split: split_df[split_df['split'] == split].reset_index(drop=True)
        for split in ['train', 'val', 'test'] if split in split_df['split'].values
    }
    return split_dict

In [85]:
splits_dataset = split_query_groups_adaptive(
    pairwise_mapping_df,
    test_size=0.1,
    val_size=0.1,
    random_state=42
)

pd.reset_option('display.max_colwidth')
splits_dataset['test'].head()

Unnamed: 0,sku_query,split,sku_pos,sku_hard_neg,sku_soft_neg
0,1871769771,test,"[467420540, 1871769771]","[1418084594, 1573142945, 1536520050, 1573135817]","[1899881468, 1290396077, 1597431764, 165269677..."
1,1200553001,test,"[945075396, 1436509994, 1436449707, 1438364324...","[1499532091, 963112482, 1422204647, 1122827873...","[1878150702, 1901123430, 1595672507, 679265327..."
2,922231521,test,"[1436509994, 1158222448, 1081199697, 490461399...","[1001260979, 1802254834, 1252814277, 805782980...","[1032263980, 879403681, 1816716304, 1630407222..."
3,922230517,test,"[600803111, 1125093440, 1726148392, 974286048,...","[564434635, 1449544071, 1333611366, 1294181877...","[1807617650, 1113350792, 1245721824, 620961901..."
4,922230183,test,"[1819952117, 1679157969, 914654189, 922230183]","[959054273, 601557360, 1705669581, 950215375, ...","[1634447035, 1706808534, 1438798026, 181995203..."


In [104]:
# Prepare a summary table for each split
import pandas as pd

summary_rows = []
per_query_stats = {}

for split_name, df in splits_dataset.items():
    num_rows = len(df)
    num_pos = df['sku_pos'].apply(lambda x: len(x) if isinstance(x, list) else 0).sum()
    num_hard = df['sku_hard_neg'].apply(lambda x: len(x) if isinstance(x, list) else 0).sum()
    num_soft = df['sku_soft_neg'].apply(lambda x: len(x) if isinstance(x, list) else 0).sum()
    unique_skus = set(df['sku_query'])
    for col in ['sku_pos', 'sku_hard_neg', 'sku_soft_neg']:
        unique_skus.update([sku for sublist in df[col] for sku in (sublist if isinstance(sublist, list) else [])])
    summary_rows.append({
        'split': split_name,
        '#queries': num_rows,
        '#pos': num_pos,
        '#hard_neg': num_hard,
        '#soft_neg': num_soft,
        '#total_sku': len(unique_skus)
    })

    # Per-query stats for each type
    per_query = pd.DataFrame({
        'pos': df['sku_pos'].apply(lambda x: len(x) if isinstance(x, list) else 0),
        'hard_neg': df['sku_hard_neg'].apply(lambda x: len(x) if isinstance(x, list) else 0),
        'soft_neg': df['sku_soft_neg'].apply(lambda x: len(x) if isinstance(x, list) else 0),
    })
    agg = per_query.agg(['mean', 'std', 'min', 'max']).T
    agg.index.name = 'type'
    agg.columns.name = 'agg'
    per_query_stats[split_name] = agg

summary_df = pd.DataFrame(summary_rows)
display(summary_df)

Unnamed: 0,split,#queries,#pos,#hard_neg,#soft_neg,#total_sku
0,train,18,934,3860,76902,5562
1,val,15,125,485,7754,4403
2,test,20,152,690,20338,5562


In [105]:
# Display per-query stats for each split
multiindex_tuples = []
values = []
for split_name, agg in per_query_stats.items():
    for t in agg.index:
        for a in agg.columns:
            multiindex_tuples.append((split_name, t, a))
            values.append(agg.loc[t, a])
multiindex = pd.MultiIndex.from_tuples(multiindex_tuples, names=['split', 'type', 'agg'])
per_query_multi_df = pd.Series(values, index=multiindex).unstack(['type', 'agg']).swaplevel(axis=1)
# The above gives columns as (agg, type), swap to (type, agg)
per_query_multi_df.columns = per_query_multi_df.columns.swaplevel(0,1)
per_query_multi_df = per_query_multi_df.sort_index(axis=1, level=0)

# Cast all columns which are numerical to int (if possible)
for col in per_query_multi_df.columns:
    # Only cast if dtype is numeric and all values are close to integer (to avoid ValueError)
    if pd.api.types.is_numeric_dtype(per_query_multi_df[col]):
        if np.allclose(per_query_multi_df[col].dropna() % 1, 0):
            per_query_multi_df[col] = per_query_multi_df[col].astype(int)

# Reorder columns so that for each type (pos, hard_neg, soft_neg), columns are in order: mean, std, min, max
ordered_types = ['pos', 'hard_neg', 'soft_neg']
ordered_aggs = ['mean', 'std', 'min', 'max']
per_query_multi_df = per_query_multi_df.loc[:, [(t, a) for t in ordered_types for a in ordered_aggs]]
# Sort per_query_multi_df by split: train, val, test
split_order = ['train', 'val', 'test']
per_query_multi_df = per_query_multi_df.reindex(split_order)

display(per_query_multi_df.astype(int))

type,pos,pos,pos,pos,hard_neg,hard_neg,hard_neg,hard_neg,soft_neg,soft_neg,soft_neg,soft_neg
agg,mean,std,min,max,mean,std,min,max,mean,std,min,max
split,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
train,51,65,2,216,214,258,4,755,4272,440,3583,4989
val,8,8,1,28,32,33,1,95,516,40,449,555
test,7,8,2,29,34,48,1,195,1016,1520,449,5558


# Run training

In [204]:
from sklearn.metrics import f1_score                 # ← new
import numpy as np
import torch.nn.functional as F

def best_threshold(distances: torch.Tensor,
                   labels:    torch.Tensor,
                   steps:     int = 200,
                   margin:    float = 1.5):
    """
    Sweep `steps` evenly-spaced thresholds between 0 and `margin`
    and return the one that maximises duplicate-class F1.
    Labels: 0 = duplicate (positive), 1 = different (negative).
    """
    d   = distances.detach().cpu().numpy()
    y   = labels.detach().cpu().numpy()
    thr = np.linspace(0.0, margin, steps)

    best_f1, best_thr = -1.0, 0.0
    for t in thr:
        y_pred = (d < t).astype(int)          # 1 = duplicate prediction
        f1     = f1_score(1 - y, y_pred)      # make 1 = positive for sklearn
        if f1 > best_f1:
            best_f1, best_thr = f1, t
    return best_thr, best_f1


In [175]:

from torch.utils.data import DataLoader, WeightedRandomSampler

def _run():
    images_dir = os.path.join(DATA_PATH, IMG_DATASET_NAME)

    # ---------- 1) build DataLoaders ----------
    splits  = {'train': actual_train_df,
               'validation': actual_val_df,
               'test': actual_test_df}
    loaders = {}

    for split_name, df in splits.items():
        labels = df["label"].values
        ds     = SiameseRuCLIPDataset(df.drop(columns="label"),
                                      labels,
                                      images_dir=images_dir)

        if split_name == "train":
            cls_cnt        = np.bincount(labels, minlength=2)
            cls_weights    = 1.0 / cls_cnt
            sample_weights = cls_weights[labels]
            total = len(sample_weights)
            max_n = MAX_SAMPLES_PER_EPOCH or total
            n_samples = min(total, max_n)

            sampler = WeightedRandomSampler(
                sample_weights,
                num_samples=n_samples,
                replacement=True
            )
            loaders[split_name] = DataLoader(
                ds,
                batch_size=BATCH_SIZE,
                sampler=sampler,
                num_workers=NUM_WORKERS
            )
        else:
            loaders[split_name] = DataLoader(
                ds,
                batch_size=BATCH_SIZE,
                shuffle=False,
                num_workers=NUM_WORKERS
            )

    train_loader = loaders['train']
    valid_loader = loaders['validation']
    test_loader  = loaders['test']

    # ---------- 2) model / optimiser ----------
    print("Loading model and optimizer…")
    model = SiameseRuCLIP(
        DEVICE, NAME_MODEL_NAME,
        DESCRIPTION_MODEL_NAME,
        PRELOAD_MODEL_NAME,
        DATA_PATH + RESULTS_DIR,
        dropout=DROPOUT
    ).to(DEVICE)

    criterion = ContrastiveLoss(
        margin=CONTRASTIVE_MARGIN,
        pos_weight=POS_WEIGHT
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY
    )
    print("Done.")

    # ---------- 3) training ----------
    with tempfile.TemporaryDirectory() as tmp_ckpt_dir:
        (train_losses, val_losses,
         best_metric_val, best_weights, thr_history) = train(
            model, optimizer, criterion,
            EPOCHS, train_loader, valid_loader,
            print_epoch=True, device=DEVICE,
            models_dir=tmp_ckpt_dir,
            metric=BEST_CKPT_METRIC
        )

    print(f"→ Best validation {BEST_CKPT_METRIC}: {best_metric_val:.3f}")

    # ---------- 4) loss curves ----------
    epochs_ax = list(range(1, len(train_losses) + 1))
    fig, ax = plt.subplots()
    ax.plot(epochs_ax, train_losses, label='Train Loss')
    ax.plot(epochs_ax, val_losses,   label='Val   Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Training & Validation Loss by Epoch')
    ax.legend()
    if MLFLOW_URI:
        mlflow.log_figure(fig, 'loss_by_epoch.png')
    display.clear_output(wait=True)
    display.display(fig)
    plt.close(fig)

    # ---------- 5) pick threshold for the *best* model ----------
    model.load_state_dict(best_weights)
    (_, _, _, _, _, best_thr) = validation(
        model, criterion, valid_loader,
        epoch='best', device=DEVICE,
        split_name='validation',
        threshold=None,
        metric=BEST_CKPT_METRIC
    )
    print(f"Chosen threshold from validation: {best_thr:.3f}")

    # ---------- 6) final TEST ----------
    (test_pos_acc, test_neg_acc,
     test_acc, test_f1,
     test_loss, _) = validation(
        model, criterion, test_loader,
        epoch='test', device=DEVICE,
        split_name='test',
        threshold=best_thr,
        metric=BEST_CKPT_METRIC
    )

    # pick out the right test-metric value
    test_metric = test_pos_acc if BEST_CKPT_METRIC == 'pos_acc' else test_f1
    print(f"Test {BEST_CKPT_METRIC}: {test_metric:.3f}")

    # ---------- 7) save checkpoint ----------
    filename = (
        f"siamese_contrastive_test-{BEST_CKPT_METRIC}={test_metric:.3f}"
        f"{'_' + MODEL_NAME_POSTFIX if MODEL_NAME_POSTFIX else ''}"
        f"{'_' + PRELOAD_MODEL_NAME  if PRELOAD_MODEL_NAME else ''}"
        f"_best-thr={best_thr:.3f}.pt"
    )
    final_path = Path(DATA_PATH + RESULTS_DIR) / filename
    final_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(best_weights, final_path)
    print(f"Saved best‐{BEST_CKPT_METRIC} checkpoint to {final_path}")

    if MLFLOW_URI:
        mlflow.log_metric("test_pos_accuracy", test_pos_acc)
        mlflow.log_metric("test_neg_accuracy", test_neg_acc)
        mlflow.log_metric("test_accuracy",     test_acc)
        mlflow.log_metric("test_f1_score",     test_f1)
        mlflow.end_run()

_run()

  cls_weights    = 1.0 / cls_cnt


Loading model and optimizer…
Done.


KeyboardInterrupt: 