In [1]:
import chromadb
from chromadb.config import Settings

In [2]:
client = chromadb.PersistentClient(path="./chroma_db")

In [3]:
# Step 1: Create or get a collection
collection_name = "demo_documents"
collection = client.get_or_create_collection(
    name=collection_name,
    metadata={"description": "A collection for demo documents"}
)

In [4]:
# Step 2: Add documents with embeddings
# ChromaDB can auto-generate embeddings or you can provide your own
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Python is a versatile programming language used for web development, data science, and AI.",
    "Machine learning models require large amounts of data for training.",
    "Vector databases store high-dimensional vectors for similarity search.",
    "Natural language processing helps computers understand human language."
]


In [5]:
# Document IDs and metadata
ids = [f"doc_{i}" for i in range(len(documents))]
metadatas = [
    {"category": "example", "type": "sentence"},
    {"category": "programming", "type": "description"},
    {"category": "ml", "type": "fact"},
    {"category": "database", "type": "definition"},
    {"category": "nlp", "type": "description"}
]

# Add documents to collection
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

In [6]:
# Step 3: Query the collection

# Query 1: Find documents similar to a programming query
query1 = "coding and software development"
results1 = collection.query(
    query_texts=[query1],
    n_results=2
)

print(f"\nQuery: '{query1}'")
print("Results:")
for i, (doc, distance) in enumerate(zip(results1['documents'][0], results1['distances'][0])):
    print(f"  {i+1}. {doc} (distance: {distance:.3f})")


Query: 'coding and software development'
Results:
  1. Python is a versatile programming language used for web development, data science, and AI. (distance: 1.399)
  2. Natural language processing helps computers understand human language. (distance: 1.523)


In [7]:
# Query 2: Find documents about AI/ML
query2 = "artificial intelligence and data"
results2 = collection.query(
    query_texts=[query2],
    n_results=2
)

print(f"\nQuery: '{query2}'")
print("Results:")
for i, (doc, distance) in enumerate(zip(results2['documents'][0], results2['distances'][0])):
    print(f"  {i+1}. {doc} (distance: {distance:.3f})")


Query: 'artificial intelligence and data'
Results:
  1. Machine learning models require large amounts of data for training. (distance: 1.180)
  2. Natural language processing helps computers understand human language. (distance: 1.310)


In [8]:
# Step 4: Filter by metadata
print("\nüè∑Ô∏è  Filtering by metadata...")
results_filtered = collection.query(
    query_texts=["programming"],
    n_results=5,
    where={"category": "programming"}
)

print("Documents in 'programming' category:")
for doc in results_filtered['documents'][0]:
    print(f"  - {doc}")


üè∑Ô∏è  Filtering by metadata...
Documents in 'programming' category:
  - Python is a versatile programming language used for web development, data science, and AI.


In [9]:
# Step 5: Get collection info
print(f"\nüìä Collection Statistics:")
print(f"  Name: {collection.name}")
print(f"  Count: {collection.count()}")
print(f"  Metadata: {collection.metadata}")


üìä Collection Statistics:
  Name: demo_documents
  Count: 5
  Metadata: {'description': 'A collection for demo documents'}


In [10]:
# Step 6: Peek at stored data
print(f"\nüëÄ Peek at stored data:")
peek_results = collection.peek(limit=3)
for i, (id, doc, metadata) in enumerate(zip(
    peek_results['ids'], 
    peek_results['documents'], 
    peek_results['metadatas']
)):
    print(f"  {i+1}. ID: {id}")
    print(f"     Doc: {doc}")
    print(f"     Metadata: {metadata}")
    print()


üëÄ Peek at stored data:
  1. ID: doc_0
     Doc: The quick brown fox jumps over the lazy dog. This is an updated version.
     Metadata: {'type': 'sentence', 'category': 'example', 'updated': True}

  2. ID: doc_1
     Doc: Python is a versatile programming language used for web development, data science, and AI.
     Metadata: {'type': 'description', 'category': 'programming'}

  3. ID: doc_2
     Doc: Machine learning models require large amounts of data for training.
     Metadata: {'type': 'fact', 'category': 'ml'}



In [11]:
# Step 7: Update a document
print("üîÑ Updating a document...")
collection.update(
    ids=["doc_0"],
    documents=["The quick brown fox jumps over the lazy dog. This is an updated version."],
    metadatas=[{"category": "example", "type": "sentence", "updated": True}]
)

# Verify the update
updated_doc = collection.get(ids=["doc_0"])
print(f"Updated document: {updated_doc['documents'][0]}")

üîÑ Updating a document...
Updated document: The quick brown fox jumps over the lazy dog. This is an updated version.


In [12]:
# Step 8: Delete a document
print("\nüóëÔ∏è  Deleting a document...")
collection.delete(ids=["doc_4"])
print(f"Collection count after deletion: {collection.count()}")

print("\n" + "=" * 50)
print("‚ú® ChromaDB Demo Complete!")
print("\nüí° Key Features Demonstrated:")
print("  ‚Ä¢ Document storage with auto-embeddings")
print("  ‚Ä¢ Semantic similarity search")
print("  ‚Ä¢ Metadata filtering")
print("  ‚Ä¢ CRUD operations (Create, Read, Update, Delete)")
print("  ‚Ä¢ Collection management")


üóëÔ∏è  Deleting a document...
Collection count after deletion: 4

‚ú® ChromaDB Demo Complete!

üí° Key Features Demonstrated:
  ‚Ä¢ Document storage with auto-embeddings
  ‚Ä¢ Semantic similarity search
  ‚Ä¢ Metadata filtering
  ‚Ä¢ CRUD operations (Create, Read, Update, Delete)
  ‚Ä¢ Collection management
