## Step 1: Pull and Run Qdrant Docker Image

In [None]:
# Start Qdrant detached (-d) as a container named "qdrant-db", publishing
# container ports 6333 and 6334 on the same host ports. The client created
# later in this notebook connects to localhost:6333.
!docker run -d --name qdrant-db -p 6333:6333 -p 6334:6334 qdrant/qdrant:latest

## Step 2: Install Required Python Packages

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# FlagEmbedding supplies the BGEM3FlagModel class used in Step 4;
# qdrant_client supplies QdrantClient and models used from Step 6 onwards.
%pip install -U transformers FlagEmbedding accelerate
%pip install pandas
%pip install qdrant_client

## Step 3: Load Products Dataset

In [52]:
import json

import pandas as pd

# products.csv is pipe-delimited; load it, then flatten it to one dict per row.
products_df = pd.read_csv("products.csv", delimiter="|")
products_json = products_df.to_dict(orient="records")

# Show the first record as pretty-printed JSON, followed by the row count.
print(json.dumps(products_json[0], indent=2))
print(f"Total products: {len(products_json)}")

{
  "Id": "d2559c95-bd28-49d8-b53a-538c34a25bcb",
  "Name": "Saucony Men's Kinvara 13 Running Shoe",
  "Description": "When it comes to lightweight speed, nothing crushes the competition like the Kinvara. And this just so happens to be our lightest one yet. With more speed contouring and its signature flexible feel, it pushes you forward without holding anything back. These are the shoes you\u2019ll do big things in.",
  "Price": 600.93,
  "PriceCurrency": "USD",
  "SupplyAbility": 396,
  "MinimumOrder": 574
}
Total products: 751


## Step 4: Initialize the BGE-M3 model

In [3]:
from FlagEmbedding import BGEM3FlagModel

# Load the BAAI/bge-m3 checkpoint with the library's fp16 option switched on.
# The first run fetches the model files (see the "Fetching 30 files" progress
# output below), so this cell can take a while.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)



Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

## Step 5: Calculate Embeddings for a Single Product

In [53]:
# Let's take a single product as an example.
# iloc[0] selects the first row of products_df by position.
sample_product = products_df.iloc[0]

def create_product_text(product):
    """Build the text that is embedded for one product.

    Args:
        product: Mapping-like record (a dict or a pandas Series) with
            ``Name`` and ``Description`` entries.

    Returns:
        str: ``"Product: <name>"`` and ``"Description: <description>"``
        joined by a newline. A missing value (``None`` or float NaN)
        is rendered as an empty string.

    Raises:
        KeyError: If ``Name`` or ``Description`` is absent from ``product``.
    """
    def _clean(value):
        # Missing values (None, or the float NaN pandas uses for empty CSV
        # fields) would otherwise be formatted as the literal words
        # "None"/"nan" and end up inside the text that gets embedded.
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return value

    return f"Product: {_clean(product['Name'])}\nDescription: {_clean(product['Description'])}"

product_text = create_product_text(sample_product)
print("\nFormatted product text:")
print(product_text)

# Generate all three types of embeddings in a single encode() call.
# encode() is given a list of texts, so each output entry is indexed per text.
output = model.encode(
    [product_text], 
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True
)

# [0] picks the results for our single input text.
# dense_vecs: one vector for the whole text (shape (1024,) in the output below).
dense_vector = output['dense_vecs'][0]
# lexical_weights: mapping of token id -> weight (decoded to tokens further down).
sparse_weights = output['lexical_weights'][0]
# colbert_vecs: one vector per token ((82, 1024) for this text, see below).
colbert_vectors = output['colbert_vecs'][0]


Formatted product text:
Product: Saucony Men's Kinvara 13 Running Shoe
Description: When it comes to lightweight speed, nothing crushes the competition like the Kinvara. And this just so happens to be our lightest one yet. With more speed contouring and its signature flexible feel, it pushes you forward without holding anything back. These are the shoes you’ll do big things in.


### Displaying dense vectors

In [54]:
# Summarise the dense embedding: its shape and a peek at the leading values.
print(
    "Dense vector information:",
    f"Shape: {dense_vector.shape}",
    f"First 5 elements: {dense_vector[:5]}",
    sep="\n",
)

Dense vector information:
Shape: (1024,)
First 5 elements: [-0.02016  0.02016 -0.0438   0.02206 -0.01973]


### Displaying sparse vectors

In [55]:
print("Sparse vector information:")
print(f"Number of tokens with weights: {len(sparse_weights)}")

# Translate token ids into their text form so the weights are human-readable.
token_map = model.convert_id_to_token([sparse_weights])

# Rank (token, weight) pairs from heaviest to lightest and show the first ten.
print("\nTop 10 tokens by weight:")
heaviest_first = sorted(token_map.items(), key=lambda pair: float(pair[1]), reverse=True)
for token, weight in heaviest_first[:10]:
    print(f"  {token}: {float(weight):.4f}")

Sparse vector information:
Number of tokens with weights: 57

Top 10 tokens by weight:
  vara: 0.3137
  shoes: 0.2365
  13: 0.2229
  weight: 0.2102
  speed: 0.2094
  y: 0.2017
  Product: 0.1993
  Sho: 0.1898
  Kin: 0.1849
  Sau: 0.1790


### Displaying ColBERT vectors

In [56]:
print(f"ColBERT vectors: {colbert_vectors.shape} (tokens × dimensions)")
print("\nFirst 5 token vectors (first 3 dimensions each):")
# Slicing caps the preview at five rows even when the text has fewer tokens.
for i, token_vector in enumerate(colbert_vectors[:5]):
    print(f"  Token {i}: {token_vector[:3].tolist()}")

ColBERT vectors: (82, 1024) (tokens × dimensions)

First 5 token vectors (first 3 dimensions each):
  Token 0: [-0.015277110040187836, -0.05877283960580826, 0.0036948397755622864]
  Token 1: [-0.035025328397750854, -0.05892220884561539, -0.020987864583730698]
  Token 2: [-0.00875769555568695, -0.03164244443178177, -0.007193501573055983]
  Token 3: [-0.04171895608305931, -0.015494350343942642, -0.004048152826726437]
  Token 4: [-0.052237991243600845, -0.007237554062157869, 0.0014343017246574163]


## Step 5 (continued): Calculate Embeddings for All Products

In [None]:
from tqdm.notebook import tqdm

# Embed every catalogue entry, keeping each record next to its three representations.
all_product_embeddings = []

for product in tqdm(products_json):
    # One text per call, so index 0 of each output holds this product's result.
    output = model.encode(
        [create_product_text(product)],
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )
    all_product_embeddings.append(
        {
            "product": product,
            "dense_vector": output['dense_vecs'][0],
            "sparse_weights": output['lexical_weights'][0],
            "colbert_vectors": output['colbert_vecs'][0],
        }
    )

print(f"Generated embeddings for {len(all_product_embeddings)} products")

  0%|          | 0/751 [00:00<?, ?it/s]

Generated embeddings for 751 products


## Step 6: Create Qdrant collection for dense, sparse and multivectors

In [None]:
from qdrant_client import QdrantClient, models

client = QdrantClient("localhost", port=6333)  # Adjust host/port as needed

# create_collection() fails when the collection already exists, so guard the
# call. Re-running this cell against a still-running Qdrant container then
# leaves the existing collection (and its points) untouched instead of raising.
if client.collection_exists("products"):
    print("Collection 'products' already exists; skipping creation")
else:
    # Create collection with dense vectors, sparse vectors, and ColBERT multi-vectors
    client.create_collection(
        collection_name="products",
        vectors_config={
            # One 1024-dim vector per product (matches the dense shape printed above).
            "dense": models.VectorParams(
                size=1024,
                distance=models.Distance.COSINE
            ),
            # One 1024-dim vector per token, stored as a multi-vector and
            # compared with the MAX_SIM comparator.
            "colbert": models.VectorParams(
                size=1024,
                distance=models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM
                ),
            )
        },
        sparse_vectors_config={
            # Lexical weights from BGE-M3; the sparse index is kept on disk.
            "sparse": models.SparseVectorParams(
                index=models.SparseIndexParams(
                    on_disk=True
                )
            )
        },
    )

    print("Collection 'products' created successfully")

## Step 7: Insert embeddings into the Qdrant collection

### Function to create a Qdrant sparse vector from BGE-M3 output

In [27]:
def create_sparse_vector(sparse_data):
    """Convert a token-id -> weight mapping into a Qdrant sparse vector.

    Args:
        sparse_data: Mapping of token id to weight. Ids may be ints or
            numeric strings; weights must be convertible with ``float()``.

    Returns:
        models.SparseVector: Parallel ``indices`` / ``values`` lists holding
        only the entries whose weight is strictly positive. String keys that
        are not decimal numbers are dropped.
    """
    sparse_indices = []
    sparse_values = []

    for key, value in sparse_data.items():
        weight = float(value)
        # Only process positive values. Written as "not (w > 0)" so that NaN
        # weights are skipped too, exactly like zero and negative ones.
        if not (weight > 0):
            continue

        # Handle string keys
        if isinstance(key, str):
            # isdecimal() accepts exactly the digit characters int() can parse.
            # isdigit() additionally accepts characters such as "²", for which
            # int() raises ValueError.
            if not key.isdecimal():
                continue
            key = int(key)

        sparse_indices.append(key)
        sparse_values.append(weight)

    return models.SparseVector(
        indices=sparse_indices,
        values=sparse_values
    )

### Inserting products in collection

In [None]:
from qdrant_client import models
from tqdm import tqdm

# Upload helper: writes each embedded product to Qdrant as its own point
def insert_products_to_qdrant(product_embeddings):
    """Upsert every embedded product into the 'products' collection.

    Args:
        product_embeddings: Sequence of dicts with the keys ``product``,
            ``dense_vector``, ``colbert_vectors`` and ``sparse_weights``.
            ``product["Id"]`` becomes the point id and the whole product
            record becomes the payload.
    """
    for record in tqdm(product_embeddings):
        payload = record["product"]
        # The dict keys are the vector names declared on the collection.
        named_vectors = {
            "dense": record["dense_vector"],
            "colbert": record["colbert_vectors"],
            "sparse": create_sparse_vector(record["sparse_weights"]),
        }
        point = models.PointStruct(id=payload["Id"], payload=payload, vector=named_vectors)

        # One request per product.
        client.upsert(collection_name="products", points=[point])

    print(f"Successfully inserted {len(product_embeddings)} products into the 'products' collection")

insert_products_to_qdrant(all_product_embeddings)

100%|██████████| 751/751 [01:43<00:00,  7.27it/s]

Successfully inserted 751 products into the 'products' collection





## Step 8: Retrieve & re-rank based on search query

In [38]:
def search_products(search_query, limit=3, prefetch_limit=6):
    """Hybrid search: sparse and dense candidate retrieval, reranked with ColBERT.

    Args:
        search_query: Free-text query string.
        limit: Number of points returned after reranking.
        prefetch_limit: Number of candidates each prefetch (sparse, dense)
            may contribute.

    Returns:
        The response of ``client.query_points``; matches are under ``.points``.
    """
    encoded = model.encode(
        [search_query],
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )

    # Index 0 throughout: exactly one query text was encoded.
    sparse_prefetch = models.Prefetch(
        query=create_sparse_vector(encoded["lexical_weights"][0]),
        using="sparse",
        limit=prefetch_limit,
    )
    dense_prefetch = models.Prefetch(
        query=encoded["dense_vecs"][0],
        using="dense",
        limit=prefetch_limit,
    )

    # The prefetched candidates are scored against the query's ColBERT vectors.
    return client.query_points(
        "products",
        prefetch=[sparse_prefetch, dense_prefetch],
        query=encoded["colbert_vecs"][0],
        using="colbert",
        with_payload=True,
        limit=limit,
    )

### Function to display search results

In [46]:
def display_search_results(results):
    """Print a numbered, human-readable summary of a search response.

    Args:
        results: Object with a ``points`` sequence. Each point exposes
            ``payload`` (a dict with ``Name``, ``Price``, ``PriceCurrency``
            and optionally ``Description``) and a numeric ``score``.
    """
    hits = results.points
    print(f"Found {len(hits)} matching products")
    print("-" * 40)

    for rank, hit in enumerate(hits, start=1):
        info = hit.payload
        print(f"{rank}. {info['Name']} - Score: {hit.score:.2f}")
        print(f"   Price: {info['Price']} {info['PriceCurrency']}")

        # Cap long descriptions at 100 characters: 97 of text plus an ellipsis.
        blurb = info.get('Description', '')
        if blurb:
            if len(blurb) > 100:
                blurb = blurb[:97] + "..."
            print(f"   {blurb}")
        print()

### A few examples

In [47]:
# Descriptive query with no brand or model name; uses the default limit of 3.
result = search_products("running shoes for men")

display_search_results(result)


Found 3 matching products
----------------------------------------
1. adidas Men's Ultraboost Personal Best Running Shoe - Score: 3.76
   Price: 626.1 USD
   Men's shoes - low (non football).

2. adidas Men's Racer Tr21 Running Shoe - Score: 3.51
   Price: 413.1 EUR
   Everyday style with a running twist. These men's adidas sneakers have a Cloudfoam midsole for ste...

3. Under Armour Men's Charged Assert 9 Running Shoe - Score: 3.49
   Price: 434.94 USD
   These running shoes are built to help anyone go faster-Charged Cushioning® helps protect against ...



In [48]:
# Single-word, brand-only query.
result = search_products("nintendo")

display_search_results(result)

Found 3 matching products
----------------------------------------
1. Nintendo Switch OLED Model - Score: 2.41
   Price: 349.99 USD
   The Nintendo Switch OLED Model offers a vivid 7-inch OLED display and enhanced audio for portable...

2. NZND Case for At&t Motivate 3 (3rd Version)/ Motivate 2 (2nd)/ Cricket Icon 3/ Cricket Splendor with Tempered Glass Screen Protector, Ring Holder/Wrist Strap, Glitter Liquid Cute Case (Pink/Aqua) - Score: 2.10
   Price: 84.09 USD
   NZND Glitter Flowing Liquid Floating Gradient Colorful with Sparkling Bling Diamond, Durable Girl...

3. Samsung 980 PRO 1TB NVMe M.2 SSD - Score: 1.97
   Price: 149.99 USD
   The Samsung 980 PRO offers lightning-fast read and write speeds, ideal for high-performance gamin...



In [49]:
# Brand name combined with an accessory type.
result = search_products("xbox gamepad")

display_search_results(result)

Found 3 matching products
----------------------------------------
1. PowerA Advantage Wired Controller for Xbox Series X|S with Lumectra + RGB LED Strip - Black, gamepad, wired video game controller, gaming controller, works with Xbox One and Windows 10/11, Officially Licensed for Xbox - Score: 3.36
   Price: 473.73 EUR
   Illuminate the possibilities with the PowerA Advantage Wired Controller for Xbox Series X|S with ...

2. Microsoft Xbox Series X - Score: 2.79
   Price: 499.99 USD
   The Microsoft Xbox Series X offers powerful gaming performance with 4K resolution, lightning-fast...

3. Microsoft Xbox Series X - Score: 2.74
   Price: 499.99 USD
   The Xbox Series X is a powerful next-gen gaming console with 4K UHD resolution, fast loading spee...



In [51]:
# Brand name combined with a short model identifier ("s22").
result = search_products("samsung s22")

display_search_results(result)

Found 3 matching products
----------------------------------------
1. Samsung Galaxy S22 Ultra - Score: 2.96
   Price: 1199.99 USD
   The Samsung Galaxy S22 Ultra features a 108MP camera, 5G support, and a large AMOLED display, off...

2. Samsung Galaxy S22 5G - Score: 2.91
   Price: 799.99 USD
   The Samsung Galaxy S22 5G features a dynamic AMOLED display, advanced cameras, and 5G capabilitie...

3. Samsung Galaxy S21 Ultra | Factory Unlocked Android Cell Phone | US Version 5G Smartphone | Pro-Grade Camera, 8K Video, 108MP High Res | 128GB, Phantom Black (SM-G998UZKAXAA) (Renewed) - Score: 2.28
   Price: 348.47 USD
   Pro Grade Camera Zoom in close, take photos and videos like a pro, and capture incredible share-r...

