# Log into services

In [1]:
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [2]:
# Use tokens from .env

import os
from dotenv import load_dotenv

import huggingface_hub
import wandb

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
huggingface_hub.login(token=HF_TOKEN)

WANDB_API_KEY = os.getenv("WANDB_API_KEY")
wandb.login(key=WANDB_API_KEY)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/anton/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtony-pitchblack[0m ([33moverfit1010[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Imports

In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


from timm import create_model
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
from torch import optim, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
# from torchinfo import summary
import transformers
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer,\
        get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer

import cv2

from PIL import Image
from tqdm.auto import tqdm

import json
from itertools import product

# import datasets
# from datasets import Dataset, concatenate_datasets
import argparse
import requests

from io import BytesIO
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score
import matplotlib.pyplot as plt
from IPython import display
import more_itertools

# Parameters

In [4]:
TABLE_DATASET_FILE = 'new_labeled.csv'
IMG_DATASET_NAME = 'images_7k'
BATCH_SIZE = 8

DATA_PATH = 'data/'
DEVICE='cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
model_configs = [
    dict(
        MODEL_CKPT = 'siamese_fitted_10epochs_bert_turbo.pt',
        NAME_MODEL_NAME = 'DeepPavlov/distilrubert-tiny-cased-conversational-v1',
        DESCRIPTION_MODEL_NAME = 'sergeyzh/rubert-tiny-turbo',
    ),
    
    dict(
        MODEL_CKPT = 'siamese_fitted_10epochs_bert_tiny.pt',
        NAME_MODEL_NAME = 'DeepPavlov/distilrubert-tiny-cased-conversational-v1',
        DESCRIPTION_MODEL_NAME = 'cointegrated/rubert-tiny',
    )
]


# Source code

### RuCLIPtiny

In [6]:
class RuCLIPtiny(nn.Module):
    def __init__(self, name_model_name: str):
        """
        Initializes the RuCLIPtiny module using the provided name model.
        """
        super().__init__()
        self.visual = create_model('convnext_tiny',
                                   pretrained=False,  # set True if you want pretrained weights
                                   num_classes=0,
                                   in_chans=3)       # output: e.g. 768-dim features
        
        self.transformer = AutoModel.from_pretrained(name_model_name)
        name_model_output_size = self.transformer.config.hidden_size  # inferred dynamically
        self.final_ln = nn.Linear(name_model_output_size, 768)         # project to 768 dims
        self.logit_scale = nn.Parameter(torch.ones([]) * torch.log(torch.tensor(1/0.07)))
    
    @property
    def dtype(self):
        return self.visual.stem[0].weight.dtype

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        return self.visual(image.type(self.dtype))

    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        x = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # use the CLS token (first token)
        x = x.last_hidden_state[:, 0, :]
        x = self.final_ln(x)
        return x

    def forward(self, image: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        image_features = self.encode_image(image)
        text_features = self.encode_text(input_ids, attention_mask)
        # Normalize features
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()
        return logits_per_image, logits_per_text


In [7]:
def get_transform():
    return transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        lambda image: image.convert("RGB"),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

class Tokenizers:
    def __init__(self, name_model_name: str, description_model_name: str):
        self.name_tokenizer = AutoTokenizer.from_pretrained(name_model_name)
        self.desc_tokenizer = AutoTokenizer.from_pretrained(description_model_name)

    def tokenize_name(self, texts, max_len=77):
        tokenized = self.name_tokenizer.batch_encode_plus(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])

    def tokenize_description(self, texts, max_len=77):
        tokenized = self.desc_tokenizer(
            texts,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return torch.stack([tokenized["input_ids"], tokenized["attention_mask"]])


In [8]:
class SiameseRuCLIPDataset(torch.utils.data.Dataset):
    def __init__(self, images_dir: str, name_model_name: str, description_model_name: str, df=None, labels=None, df_path=None):
        """
        Dataset requires the concrete models' names for tokenization.
        """
        self.df = pd.read_csv(df_path) if df_path is not None else df
        self.labels = labels
        self.images_dir = images_dir
        self.tokenizers = Tokenizers(name_model_name, description_model_name)
        self.transform = get_transform()
        self.max_len = 77

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Tokenize names
        name_tokens = self.tokenizers.tokenize_name([str(row.name_first), str(row.name_second)], max_len=self.max_len)
        name_first = name_tokens[:, 0, :]  # [input_ids, attention_mask]
        name_second = name_tokens[:, 1, :]
        # Tokenize descriptions
        desc_tokens = self.tokenizers.tokenize_description([str(row.description_first), str(row.description_second)])
        desc_first = desc_tokens[:, 0, :]
        desc_second = desc_tokens[:, 1, :]
        # Process images
        im_first = cv2.imread(os.path.join(self.images_dir, row.image_name_first))
        im_first = cv2.cvtColor(im_first, cv2.COLOR_BGR2RGB)
        im_first = Image.fromarray(im_first)
        im_first = self.transform(im_first)
        im_second = cv2.imread(os.path.join(self.images_dir, row.image_name_second))
        im_second = cv2.cvtColor(im_second, cv2.COLOR_BGR2RGB)
        im_second = Image.fromarray(im_second)
        im_second = self.transform(im_second)
        label = self.labels[idx]
        return im_first, name_first, desc_first, im_second, name_second, desc_second, label

    def __len__(self):
        return len(self.df)

### SiameseRuCLIP

In [9]:
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0
        )
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

class SiameseRuCLIP(nn.Module):
    def __init__(self,
                 device: str,
                 name_model_name: str,
                 description_model_name: str,
                 models_dir: str = None,
                 preload_ruclip: bool = None,
                 preload_model_name: str = None):
        """
        Initializes the SiameseRuCLIP model.
        Required parameters:
          - models_dir: directory containing saved checkpoints.
          - name_model_name: model name for text (name) branch.
          - description_model_name: model name for description branch.
        """
        super().__init__()
        # Initialize RuCLIPtiny with the provided name model.
        self.ruclip = RuCLIPtiny(name_model_name)
        if preload_ruclip:
            std = torch.load(os.path.join(models_dir, preload_model_name),
                             weights_only=True,
                             map_location=device)
            self.ruclip.load_state_dict(std)
            self.ruclip = self.ruclip.to(device)
            self.ruclip.eval()
        # Initialize description transformer with the provided description model.
        self.description_transformer = AutoModel.from_pretrained(description_model_name)
        
        # Infer dimensions automatically from inner modules.
        vision_dim = self.ruclip.visual.num_features            # e.g. 768 from ConvNeXt tiny
        name_dim = self.ruclip.final_ln.out_features              # e.g. 768 after projection
        desc_dim = self.description_transformer.config.hidden_size  # e.g. 312 for cointegrated/rubert-tiny
        per_product_dim = vision_dim + name_dim + desc_dim         # total per–product embedding, e.g., 768+768+312 = 1848
        head_input_dim = 2 * per_product_dim                      # two products concatenated
        
        self.hidden_dim = per_product_dim
        
        self.head = nn.Sequential(
            nn.Linear(head_input_dim, head_input_dim // 2),
            nn.ReLU(),
            nn.Linear(head_input_dim // 2, head_input_dim // 4),
            nn.ReLU(),
            nn.Linear(head_input_dim // 4, 2)
        )
    
    def encode_description(self, desc):
        # desc: [input_ids, attention_mask]
        out = self.description_transformer(desc[:, 0, :], desc[:, 1, :])
        last_hidden = out.last_hidden_state
        attention_mask = desc[:, 1, :]
        # Average pooling over token representations.
        return (last_hidden * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1, keepdim=True)
    
    def forward(self, im1, name1, desc1, im2, name2, desc2):
        image_emb1 = self.ruclip.encode_image(im1)
        image_emb2 = self.ruclip.encode_image(im2)
        name_emb1 = self.ruclip.encode_text(name1[:, 0, :], name1[:, 1, :])
        name_emb2 = self.ruclip.encode_text(name2[:, 0, :], name2[:, 1, :])
        desc_emb1 = self.encode_description(desc1)
        desc_emb2 = self.encode_description(desc2)
        first_emb = torch.cat([image_emb1, name_emb1, desc_emb1], dim=1)
        second_emb = torch.cat([image_emb2, name_emb2, desc_emb2], dim=1)
        x = torch.cat([first_emb, second_emb], dim=1)
        out = self.head(x)
        return out

# Evaluation loop

## Run evaluation

In [10]:
# Download models' weights & text/image datasets

from huggingface_hub import snapshot_download
from pathlib import Path

REPO_ID = "INDEEPA/clip-siamese"
LOCAL_DIR = Path("data/train_results")
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir='data',
    allow_patterns=[
        "train_results/siamese_fitted*.pt",
        TABLE_DATASET_FILE,
        f"{IMG_DATASET_NAME}.zip"
    ],
)

!unzip -o -q data/{IMG_DATASET_NAME}.zip -d data/

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# Load data

labeled = pd.read_csv(DATA_PATH + TABLE_DATASET_FILE)
images_dir = DATA_PATH + IMG_DATASET_NAME

y = labeled.label.values
X = labeled.drop(columns='label').copy()

def load_data(model_config):
    test_ds = SiameseRuCLIPDataset(
        images_dir,
        model_config['NAME_MODEL_NAME'], 
        model_config['DESCRIPTION_MODEL_NAME'], 
        X, y
    )
    test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)
    return test_dl

# model_config = model_configs[1]
# test_dl = load_data(model_config)

In [12]:
# Load model

from pathlib import Path

def load_model(model_config):
    ckpt_name = model_config['MODEL_CKPT']
    model_ckpt_path = Path(DATA_PATH) / 'train_results' / ckpt_name
    std = torch.load(model_ckpt_path, map_location=DEVICE)

    # Initialize the model using the configuration.
    model = SiameseRuCLIP(
        name_model_name=model_config["NAME_MODEL_NAME"],
        description_model_name=model_config["DESCRIPTION_MODEL_NAME"],
        device=DEVICE,
    )

    model.load_state_dict(std)
    return model

# model = load_model(model_config)

In [None]:
# Get evaluation score

def validation(model, valid_loader, score, device='cpu') -> float:
    correct_val = 0
    with torch.no_grad(): 
        model.eval()
        for data in tqdm(valid_loader):
            im1, name1, desc1, im2, name2, desc2, label = data 
            im1, name1, desc1, im2, name2, desc2 = im1.to(device), name1.to(device), desc1.to(device), im2.to(device), name2.to(device), desc2.to(device)
            out = model(im1, name1, desc1, im2, name2, desc2) 
            _, predicted = torch.max(out.data, -1)
            predicted = predicted.cpu().numpy()
            correct_val += score(label, predicted)
            # break
    return correct_val / len(valid_loader)

# test_score = validation(model, test_dl, f1_score)

In [14]:
import wandb

def log_to_wandb(model_config, test_score):
    wandb.init(
        project="product-matching",
        entity="overfit1010",
        name=f"test-{model_config['MODEL_CKPT']}",
        config={
            "table_dataset_file": TABLE_DATASET_FILE,
            "img_dataset_name": IMG_DATASET_NAME,
            "model_ckpt": model_config['MODEL_CKPT'],
            "name_model_name": model_config['NAME_MODEL_NAME'],
            "description_model_name": model_config['DESCRIPTION_MODEL_NAME'],
            "model_type": "siamese",
            "run_type": "test"
        }
    )

    # Optional: log summary metric
    wandb.summary["test.f1_score"] = test_score
    wandb.finish()

# log_to_wandb(model_config, test_score)

In [15]:
for model_config in model_configs:
    model = load_model(model_config)
    test_dl = load_data(model_config)
    test_score = validation(model, test_dl, f1_score)
    log_to_wandb(model_config, test_score)

  0%|          | 0/621 [00:00<?, ?it/s]

0,1
test.f1_score,0.00146


  0%|          | 0/621 [00:00<?, ?it/s]

0,1
test.f1_score,0.00161
