# Installs & tokens

In [27]:
%%capture
try:
    import dotenv
except ImportError:
    !pip install python-dotenv

In [28]:
# Log into huggingface via Kaggle Secrets or .env

import os
from dotenv import load_dotenv
import huggingface_hub

# Resolve the Hugging Face token: Kaggle Secrets when the `kaggle_secrets`
# module is importable, otherwise the HF_TOKEN variable from a local .env file.
try:
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
except ModuleNotFoundError:
    print("Not Kaggle environment. Skipping Kaggle secrets.")
    print("Trying to load HF_TOKEN from .env.")
    load_dotenv()
    HF_TOKEN = os.getenv("HF_TOKEN")
    # os.getenv returns None when the variable is absent; fail here instead of
    # reporting success and handing an empty token to login().
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN not found in .env file")
    print("Success!")

huggingface_hub.login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Not Kaggle environment. Skipping Kaggle secrets.
Trying to load HF_TOKEN from .env.
Success!


# Notebook parameters

In [29]:
# Local root directory for downloaded and generated files.
DATA_PATH = "data/"
# Location of the source CSV, relative to DATA_PATH (and to the dataset repo root).
SOURCE_TABLE_NAME = "tables_OZ_geo_5500/processed/OZ_geo_5500.csv"

In [30]:
import torch

EMBEDDING_MODEL_NAME = 'sergeyzh/LaBSE-ru-turbo'

# Pick the device once, then derive the remaining settings from it:
#   cuda -> batch size 512, NUM_EMBS = None (no row cap is applied downstream)
#   cpu  -> batch size 1, only the first 2 rows are embedded
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EMB_BATCH_SIZE, NUM_EMBS = (512, None) if DEVICE == 'cuda' else (1, 2)

# Download files

In [31]:
# Download the source table (CSV) from the Hugging Face dataset repo

from huggingface_hub import snapshot_download
from pathlib import Path

REPO_ID = "INDEEPA/clip-siamese"

# Fetch only the source table from the dataset repo.
# local_dir reuses DATA_PATH (instead of a second hard-coded 'data') so the
# download location and the path read by pd.read_csv cannot drift apart.
_ = snapshot_download(
    repo_id=REPO_ID,
    repo_type='dataset',
    local_dir=DATA_PATH,
    allow_patterns=[
        SOURCE_TABLE_NAME,
    ],
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
import pandas as pd
# Build the path with pathlib so it does not depend on DATA_PATH ending in a
# slash (plain string concatenation would yield e.g. 'datatables_...').
source_df = pd.read_csv(Path(DATA_PATH) / SOURCE_TABLE_NAME)
print(source_df.shape)
source_df[['sku', 'name', 'description']].head(1)

(5562, 46)


Unnamed: 0,sku,name,description
0,1871769771,"Карты МИРА и РОССИИ настенные политические,160...",Представляем вашему вниманию уникальный набор ...


In [33]:
# Build the text to embed: "<name>.\n<description>", with missing values
# replaced by empty strings before joining.

name_part = source_df['name'].fillna('')
description_part = source_df['description'].fillna('')
source_df['name_and_description'] = name_part + '.\n' + description_part

display(source_df[['sku', 'name_and_description']].head())

Unnamed: 0,sku,name_and_description
0,1871769771,"Карты МИРА и РОССИИ настенные политические,160..."
1,1679550303,Схема линий скоростного транспорта Москвы (Мет...
2,1200553001,"Политическая карта МИРА 160х109 см, Карта мира..."
3,922231521,"Политическая карта МИРА настенная, 100х70см, ш..."
4,922230517,"Политическая карта МИРА настенная, 160х102см, ..."


# Compute embeddings

In [34]:
# --- Compute and save embeddings for all SKUs in source_df ---
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from pathlib import Path
import torch

# Work on a copy of the source table; when NUM_EMBS is set, keep only the
# first NUM_EMBS rows.
all_skus_df = source_df.copy() if NUM_EMBS is None else source_df.copy().head(NUM_EMBS)

# Texts to encode, one per SKU, in row order.
emb_table = all_skus_df.loc[:, ['sku', 'name_and_description']].copy().reset_index(drop=True)
candidate_texts = emb_table['name_and_description'].astype(str).tolist()

model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)
embeddings = model.encode(candidate_texts, batch_size=EMB_BATCH_SIZE, show_progress_bar=True)

# Store each vector as a plain Python list when the object offers .tolist().
emb_table['name_desc_emb'] = [
    vector.tolist() if hasattr(vector, 'tolist') else vector
    for vector in embeddings
]
emb_table = emb_table.loc[:, ['sku', 'name_desc_emb']]
display(emb_table.head())

# Write the table to <DATA_PATH>/embeddings/OZ_geo_5500/<file_name>; the file
# name records the source table stem and the number of embedded rows.
file_dir = Path('embeddings/OZ_geo_5500')
file_name = f"{Path(SOURCE_TABLE_NAME).stem}_name-and-description_embeddings_num-rows={len(emb_table)}.parquet"
full_file_path = Path(DATA_PATH, file_dir, file_name)
full_file_path.parent.mkdir(parents=True, exist_ok=True)

emb_table.to_parquet(full_file_path, index=False)
print(f"Saved embeddings to:\n{file_dir / file_name}")

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,sku,name_desc_emb
0,1871769771,"[-0.020089328289031982, -0.05487040802836418, ..."
1,1679550303,"[-0.004182410426437855, -0.040884267538785934,..."


Saved embeddings to:
embeddings/OZ_geo_5500/OZ_geo_5500_name-and-description_embeddings_num-rows=2.parquet


In [35]:
# Upload embeddings to HF

import os
from dotenv import load_dotenv
from huggingface_hub import HfApi, login

# Reuse the token resolved at the top of the notebook (Kaggle Secrets or .env).
# Reading only .env here would raise on Kaggle, where no .env file is used.
# The environment/.env lookup is kept as a fallback for running this cell alone.
load_dotenv()
hf_token = globals().get("HF_TOKEN") or os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in Kaggle Secrets or .env file")

# Log into HuggingFace
login(token=hf_token)

# Upload the embeddings folder to the same dataset repo the source table came from.
api = HfApi()
api.upload_folder(
    folder_path=Path(DATA_PATH) / file_dir,  # local directory holding the parquet file
    path_in_repo=file_dir.as_posix(),  # repo paths use forward slashes on every OS
    repo_id=REPO_ID,
    repo_type="dataset",
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/INDEEPA/clip-siamese/commit/29c32eb5082a972001f154f7a356221ded363ce6', commit_message='Upload folder using huggingface_hub', commit_description='', oid='29c32eb5082a972001f154f7a356221ded363ce6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/INDEEPA/clip-siamese', endpoint='https://huggingface.co', repo_type='dataset', repo_id='INDEEPA/clip-siamese'), pr_revision=None, pr_num=None)