## Step 1: Pull and Run Qdrant Docker Image

In [None]:
!docker run -d --name qdrant-db -p 6333:6333 -p 6334:6334 qdrant/qdrant:latest

## Step 2: Install Required Python Packages

In [None]:
%pip install -U transformers FlagEmbedding accelerate
%pip install pandas

## Step 3: Load Products Dataset

In [13]:
import pandas as pd
import json

# Read the pipe-delimited product file into a DataFrame
products_df = pd.read_csv('products.csv', sep='|')

# One plain dict per row, so individual products can be serialised as JSON
products_json = products_df.to_dict(orient='records')

# Pretty-print the first record, then report how many records were loaded
first_product = products_json[0]
print(json.dumps(first_product, indent=2))
print(f"Total products: {len(products_json)}")

{
  "Id": "d2559c95-bd28-49d8-b53a-538c34a25bcb",
  "Name": "Saucony Men's Kinvara 13 Running Shoe",
  "Description": "When it comes to lightweight speed, nothing crushes the competition like the Kinvara. And this just so happens to be our lightest one yet. With more speed contouring and its signature flexible feel, it pushes you forward without holding anything back. These are the shoes you\u2019ll do big things in.",
  "Price": 600.93,
  "PriceCurrency": "USD",
  "SupplyAbility": 396,
  "MinimumOrder": 574
}
Total products: 751


## Step 4: Initialize the BGE-M3 Model

In [15]:
from FlagEmbedding import BGEM3FlagModel

# Build the embedder from the 'BAAI/bge-m3' checkpoint with the use_fp16
# option enabled; `model` is reused by the embedding cells that follow.
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

## Step 5: Calculate Embeddings for a Product

In [37]:
# Use the first row of products_df as the single worked example for the
# embedding walkthrough in this section
sample_product = products_df.iloc[0]

def create_product_text(product):
    """Build the text that gets embedded for one product.

    Args:
        product: Any mapping-like object (for example a DataFrame row or a
            dict) that can be indexed by the keys 'Name' and 'Description'.

    Returns:
        A two-line string: "Product: <Name>" followed by
        "Description: <Description>".
    """
    name = product['Name']
    description = product['Description']
    return f"Product: {name}\nDescription: {description}"

# Render the sample product as text and show exactly what will be embedded
product_text = create_product_text(sample_product)
print("\nFormatted product text:", product_text, sep="\n")

# Request the dense, sparse and ColBERT outputs in a single encode() call
output = model.encode(
    [product_text],
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True,
)

# encode() was given a one-item batch, so take entry 0 of each output
dense_vector, sparse_weights, colbert_vectors = (
    output['dense_vecs'][0],
    output['lexical_weights'][0],
    output['colbert_vecs'][0],
)


Formatted product text:
Product: Saucony Men's Kinvara 13 Running Shoe
Description: When it comes to lightweight speed, nothing crushes the competition like the Kinvara. And this just so happens to be our lightest one yet. With more speed contouring and its signature flexible feel, it pushes you forward without holding anything back. These are the shoes you’ll do big things in.


### Displaying dense vectors

In [None]:
# Report the dense embedding's shape and its first five components
print(
    "Dense vector information:",
    f"Shape: {dense_vector.shape}",
    f"First 5 elements: {dense_vector[:5]}",
    sep="\n",
)

Dense vector information:
Shape: (1024,)
First 5 elements: [-0.02016  0.02016 -0.0438   0.02206 -0.01973]


### Displaying sparse vectors

In [38]:
# Overview of the sparse output: how many token IDs carry a weight
print(
    "Sparse vector information:",
    f"Number of tokens with weights: {len(sparse_weights)}",
    sep="\n",
)

# Peek at the first ten (token ID, weight) pairs in their stored order
print("\nSample of token IDs and weights:")
sparse_items = list(sparse_weights.items())[:10]
for token_id, weight in sparse_items:
    print(f"  Token ID {token_id}: {weight:.4f}")

# Map token IDs to token text (the result is consumed as a token -> weight
# mapping via .items()), then rank the pairs from the largest weight down
token_map = model.convert_id_to_token([sparse_weights])
sorted_tokens = sorted(
    token_map.items(),
    key=lambda pair: float(pair[1]),
    reverse=True,
)

# Show the ten highest-weighted tokens
print("\nTop 10 tokens by weight:")
for token, weight in sorted_tokens[:10]:
    print(f"  {token}: {float(weight):.4f}")

Sparse vector information:
Number of tokens with weights: 57

Sample of token IDs and weights:
  Token ID 73111: 0.1993
  Token ID 12: 0.0215
  Token ID 6987: 0.1790
  Token ID 2271: 0.1504
  Token ID 53: 0.2017
  Token ID 1111: 0.1603
  Token ID 25: 0.0012
  Token ID 7: 0.0244
  Token ID 37029: 0.1849
  Token ID 7113: 0.3137

Top 10 tokens by weight:
  vara: 0.3137
  shoes: 0.2365
  13: 0.2229
  weight: 0.2102
  speed: 0.2094
  y: 0.2017
  Product: 0.1993
  Sho: 0.1898
  Kin: 0.1849
  Sau: 0.1790


### Displaying ColBERT vectors

In [39]:
# Summarize the ColBERT output: overall shape, number of rows, and row width
print(
    "ColBERT vectors information:",
    f"Shape: {colbert_vectors.shape}",
    f"Number of token vectors: {len(colbert_vectors)}",
    f"Vector dimension: {colbert_vectors.shape[1]}",
    sep="\n",
)

print("\nSample of first 5 token vectors (first 3 dimensions each):")
for i, token_vector in enumerate(colbert_vectors[:5]):
    head = token_vector[:3]
    # Prefer a plain list when the slice offers .tolist(); otherwise print it as-is
    values = head.tolist() if hasattr(head, 'tolist') else head
    print(f"  Vector {i}: {values}")

ColBERT vectors information:
Shape: (82, 1024)
Number of token vectors: 82
Vector dimension: 1024

Sample of first 5 token vectors (first 3 dimensions each):
  Vector 0: [-0.015277110040187836, -0.05877283960580826, 0.0036948397755622864]
  Vector 1: [-0.035025328397750854, -0.05892220884561539, -0.020987864583730698]
  Vector 2: [-0.00875769555568695, -0.03164244443178177, -0.007193501573055983]
  Vector 3: [-0.04171895608305931, -0.015494350343942642, -0.004048152826726437]
  Vector 4: [-0.052237991243600845, -0.007237554062157869, 0.0014343017246574163]
