<a href="https://colab.research.google.com/github/vijaysrajan/buysellconnect/blob/main/milvusDB_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Simple Milvus Setup for Marketplace
# Run this in Google Colab

# Step 1: Install packages
!pip install pymilvus sentence-transformers


Collecting pymilvus
  Downloading pymilvus-2.6.1-py3-none-any.whl.metadata (6.5 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (9.4 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Downloading pymilvus-2.6.1-py3-none-any.whl (254 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.3/254.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl (55.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ujson-5.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson, milvus-

In [2]:
# Step 2: Import and setup
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
from sentence_transformers import SentenceTransformer
import numpy as np


In [3]:
# Step 3: Connect to Milvus (using Milvus Lite for Colab)
!pip install milvus[lite]
from milvus import default_server

default_server.start()
connections.connect("default", host="127.0.0.1", port=default_server.listen_port)
print("✅ Connected to Milvus")


Collecting milvus[lite]
  Downloading milvus-2.3.5-py3-none-manylinux2014_x86_64.whl.metadata (6.7 kB)
[0mDownloading milvus-2.3.5-py3-none-manylinux2014_x86_64.whl (57.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: milvus
Successfully installed milvus-2.3.5
✅ Connected to Milvus


In [5]:
# Step 4: Define schema
def create_schema():
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="role", dtype=DataType.VARCHAR, max_length=20),  # buyer, seller, etc.
        FieldSchema(name="category", dtype=DataType.VARCHAR, max_length=50),  # antiques, appliances, etc.
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=1000),
        FieldSchema(name="price", dtype=DataType.FLOAT),
        FieldSchema(name="location", dtype=DataType.VARCHAR, max_length=200),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384)
    ]
    return CollectionSchema(fields=fields, description="Marketplace listings")


In [6]:
# Step 5: Create collection
collection_name = "marketplace"
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

schema = create_schema()
collection = Collection(name=collection_name, schema=schema)
print("✅ Collection created")



✅ Collection created


In [7]:
# Step 6: Create partitions for your categories
categories = [
    "antiques", "appliances", "arts_crafts", "atvs_utvs_snowmobiles",
    "auto_parts", "baby_kids_items", "barter", "beauty_health",
    "bicycles", "boats", "books", "business", "cars_trucks",
    "cds_dvds_vhs", "cell_phones_mobiles", "clothing_accessories",
    "collectibles", "computers", "other_electronics", "farm_garden",
    "free", "furniture", "garage_sale", "general", "heavy_equipment",
    "household_items", "jewelry", "materials", "motorcycles",
    "music_instruments", "photo_video", "rvs_camp", "sporting",
    "tickets", "tools", "toys_games", "trailers", "video_gaming",
    "wanted", "tv_televisions"
]

for category in categories:
    if (collection.has_partition(category)):
        # If the partition already exists, you might want to handle this
        # case differently, e.g., skip creation or clear entities.
        # For this example, we'll just proceed to create if it doesn't exist.
        continue # No need to delete the partition itself

    collection.create_partition(category)

print(f"✅ Created {len(categories)} partitions")

✅ Created 40 partitions


In [8]:
# Step 7: Create indexes and load
# Vector index for similarity search
index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}
collection.create_index(field_name="embedding", index_params=index_params)

# Scalar index for fast category filtering
collection.create_index(field_name="category")
collection.create_index(field_name="role")

collection.load()


In [9]:
# Step 8: Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Setup complete!")

# =========================================
# BASIC OPERATIONS
# =========================================

# INSERT DATA
def insert_listing(role, category, title, description, price, location):
    # Generate embedding
    text = f"{title} {description}"
    embedding = model.encode(text).tolist()

    # Prepare data
    data = [
        [role],
        [category],
        [title],
        [description],
        [price],
        [location],
        [embedding]
    ]

    # Insert into specific partition
    result = collection.insert(data, partition_name=category)
    collection.flush()
    return result.primary_keys[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Setup complete!


In [11]:
# QUERY DATA
def search_listings(query_text, category=None, top_k=5, require_keywords=False):
    # Generate query embedding
    query_embedding = model.encode(query_text).tolist()

    # Search parameters
    search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}

    # Search in specific partition or all
    partition_names = [category] if category else None

    # Add keyword filtering if required
    keyword_filter = None
    if require_keywords:
        keywords = query_text.split()
        # Construct the boolean expression for keyword filtering
        # Using prefix matching (keyword%) as leading wildcards are not supported for VARCHAR in Milvus
        keyword_filter = " or ".join([f"title like '{keyword}%' or description like '{keyword}%'" for keyword in keywords])


    results = collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        partition_names=partition_names,
        output_fields=["role", "category", "title", "description", "price", "location"],
        expr=keyword_filter # Add the keyword filter expression here
    )

    return results[0]

In [25]:
# This function is no longer needed as its logic is now in search_listings
# def search_exact_keywords(query_text, role=None, category=None, top_k=5):
#     return search_listings(
#         query_text, role, category, top_k,
#         similarity_threshold=0.2,     # Lower threshold
#         require_keywords=True         # This is the key parameter!
#     )

In [12]:
# UPDATE DATA (delete + insert)
def update_listing(listing_id, **updates):
    #get the existing listing
    existing_records = collection.query(expr=f"id == {listing_id}", output_fields=[...])
    print (existing_records)
    # Delete old record
    collection.delete(f"id == {listing_id}")
    collection.flush()

    # Note: For real update, you'd need to get existing data first
    # This is a simplified example
    print(f"Listing {listing_id} marked for update")


In [14]:

# DELETE DATA
def delete_listing(listing_id):
    collection.delete(f"id == {listing_id}")
    collection.flush()
    print(f"Listing {listing_id} deleted")



In [22]:
def list_all_ids(collection, limit=1000):
  """Lists entity IDs in a Milvus collection with a specified limit."""
  # Query the collection, asking for the 'id' field for all entities
  # An empty expression "" means query all entities, but requires a limit
  results = collection.query(expr="", output_fields=["id"], limit=limit)
  # Extract the 'id' from each result entity
  ids = [entity['id'] for entity in results]
  return ids


In [23]:

#Clean the DB
# Example usage:
# You might need to adjust the limit based on the number of entities you have
all_listing_ids = list_all_ids(collection, limit=100) # Setting a limit of 100 for demonstration
print(f"First {len(all_listing_ids)} IDs in the collection:")
print(all_listing_ids)
for id in all_listing_ids:
  delete_listing(id)


# EXAMPLES
print("\n🎯 EXAMPLES:")


# Example 1: Insert some data
print("\n1. Inserting data...")
id1 = insert_listing("seller", "furniture", "Vintage Oak Table", "Beautiful dining table", 450.0, "San Francisco")
id2 = insert_listing("buyer", "computers", "Need Gaming Laptop", "Looking for RTX 3060+", 1500.0, "Austin")
id3 = insert_listing("seller", "bicycles", "Mountain Bike", "Trek 29er excellent condition", 850.0, "Denver")

print(f"Inserted 3 items with IDs: {id1}, {id2}, {id3}")


hindi_sentence_buyer_seeks_table = "मैं लकड़ी की मेज़ खरीदने की तलाश में हूँ।"
wooden_table = "wooden furniture table"

# Example 2: Search
print("\n2. Searching...")
results = search_listings( hindi_sentence_buyer_seeks_table,
                           # wooden_table
                           category="furniture")
for hit in results:
    entity = hit.entity
    print(f"Found: {entity.get('title')} - ${entity.get('price')} (score: {hit.score:.3f})")

# Example 3: Search across all categories
print("\n3. General search...")
results = search_listings("laptop computer gaming")
for hit in results:
    entity = hit.entity
    print(f"Found: {entity.get('title')} in {entity.get('category')} - ${entity.get('price')}")

# # Example 4: Get collection stats
# print(f"\n4. Stats: {collection.num_entities} total items in {len(collection.partitions)} partitions")

#example 5: ask for something not present
chair = "wooden chairs"
synonyms = ['seat', 'stool', 'bench' ,'armchair', 'recliner', 'sofa', 'couch','pew', 'settee', 'rocker', 'chaise longue', 'canapé', 'throne', 'lounge', 'ottoman', 'wing chair']
results = search_listings("wooden chairs", category="furniture", require_keywords=True)
for hit in results:
    entity = hit.entity
    print(f"Found: {entity.get('title')} - ${entity.get('price')} (score: {hit.score:.3f})")

for synonym in synonyms:
    partial_result = search_listings(synonym, category="furniture", require_keywords=True)
    print(f"Found: {entity.get('title')} - ${entity.get('price')} (score: {hit.score:.3f})")
    if len(partial_result) > 0:
        results.append(partial_result)


if len(results) == 0:
    print("No results found for " + chair)



print("\n✅ All examples completed!")

First 9 IDs in the collection:
[460670363209040724, 460670363209040726, 460670363209040728, 460670363209040730, 460670363209040732, 460670363209040734, 460670363209040736, 460670363209040738, 460670363209040740]
Listing 460670363209040724 deleted
Listing 460670363209040726 deleted
Listing 460670363209040728 deleted
Listing 460670363209040730 deleted
Listing 460670363209040732 deleted
Listing 460670363209040734 deleted
Listing 460670363209040736 deleted
Listing 460670363209040738 deleted
Listing 460670363209040740 deleted

🎯 EXAMPLES:

1. Inserting data...
Inserted 3 items with IDs: 460670363209040751, 460670363209040753, 460670363209040755

2. Searching...
Found: Vintage Oak Table - $450.0 (score: 0.058)

3. General search...
Found: Need Gaming Laptop in computers - $1500.0
Found: Vintage Oak Table in furniture - $450.0
Found: Mountain Bike in bicycles - $850.0
Found: Mountain Bike - $850.0 (score: 0.073)
Found: Mountain Bike - $850.0 (score: 0.073)
Found: Mountain Bike - $850.0 (score

First 6 IDs in the collection:
[460670363209040706, 460670363209040708, 460670363209040710, 460670363209040712, 460670363209040714, 460670363209040716]
Listing 460670363209040706 deleted
Listing 460670363209040708 deleted
Listing 460670363209040710 deleted
Listing 460670363209040712 deleted
Listing 460670363209040714 deleted
Listing 460670363209040716 deleted
