# ChromaDB Vector Database Examples

This notebook demonstrates working with ChromaDB for storing, querying, and managing vector embeddings in a dedicated vector database optimized for similarity search operations.

In [None]:
// Import required packages for ChromaDB operations
#r "nuget: ChromaDB.Client, 1.0.0-preview.2"
#r "nuget: System.Text.Json, 9.0.0"
#r "nuget: Microsoft.Extensions.Http, 9.0.0"

using ChromaDB.Client;
using System.Text.Json;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Http;

// Resolve the ChromaDB host/port from the notebook configuration and build a client for that endpoint
var (host, port) = NotebookConfiguration.Chroma.GetApiUrl();
var chromaEndpoint = $"http://{host}:{port}";
var chromaClient = new ChromaClient(chromaEndpoint);

// Print the endpoint first, then query the server for its version and print that as well
Console.WriteLine($"üîó Connected to ChromaDB at: {chromaEndpoint}");
var chromaVersion = await chromaClient.GetVersionAsync();
Console.WriteLine($"üìä ChromaDB Version: {chromaVersion}");

## Collection Management

Create and manage collections for organizing different types of documents and embeddings.

In [None]:
// Create collections for different document types.
//
// Each collection pins its distance function to cosine through the "hnsw:space" metadata key.
// ChromaDB otherwise defaults to squared-L2 distance, while the search cells in this notebook
// convert results with `similarity = 1 - distance`, a formula that is only meaningful for
// cosine distance.
var techDocsCollection = await chromaClient.CreateCollectionAsync(
    name: "tech_documents", 
    metadata: new Dictionary<string, object>
    {
        ["description"] = "Technical documentation and articles",
        ["embedding_model"] = "sentence-transformers/all-MiniLM-L6-v2",
        ["created_date"] = DateTime.UtcNow.ToString("O"),
        ["hnsw:space"] = "cosine"
    });

var businessDocsCollection = await chromaClient.CreateCollectionAsync(
    name: "business_documents",
    metadata: new Dictionary<string, object>
    {
        ["description"] = "Business processes and policy documents", 
        ["embedding_model"] = "sentence-transformers/all-MiniLM-L6-v2",
        ["created_date"] = DateTime.UtcNow.ToString("O"),
        ["hnsw:space"] = "cosine"
    });

// Confirm what was created, including the server-assigned collection ids
Console.WriteLine("‚úÖ Created collections:");
Console.WriteLine($"  - {techDocsCollection.Name} (ID: {techDocsCollection.Id})");
Console.WriteLine($"  - {businessDocsCollection.Name} (ID: {businessDocsCollection.Id})");

// Enumerate every collection the server reports, showing each one's description when present
var collections = await chromaClient.ListCollectionsAsync();
Console.WriteLine($"\nüìö Total collections: {collections.Count()}");
foreach (var entry in collections)
{
    // Metadata may be absent; a missing "description" key falls back to a placeholder
    var description = entry.Metadata?.GetValueOrDefault("description", "No description");
    Console.WriteLine($"  - {entry.Name}: {description}");
}

## Document Storage

Add documents with embeddings and metadata to collections.

In [None]:
// Helper method to generate embeddings (simplified for demo).
//
// Returns a 384-element, unit-length vector of pseudo-random floats derived from `text`.
// The same text always yields the same vector; the values carry no semantic meaning.
//
// The PRNG seed comes from a SHA-256 digest of the UTF-8 text rather than
// string.GetHashCode(): .NET randomizes string hash codes per process, so a GetHashCode()
// seed would give the same text a different vector after every kernel restart, and
// embeddings already stored in the database would no longer match newly generated ones.
//
// Throws ArgumentNullException when `text` is null.
float[] GenerateEmbedding(string text)
{
    ArgumentNullException.ThrowIfNull(text);

    // Stable, process-independent seed: first four digest bytes read as a little-endian int
    var digest = System.Security.Cryptography.SHA256.HashData(System.Text.Encoding.UTF8.GetBytes(text));
    var seed = System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(digest);

    var random = new Random(seed);
    var embedding = Enumerable.Range(0, 384)
        .Select(_ => (float)(random.NextDouble() - 0.5))
        .ToArray();
    
    // Normalize to unit length
    var magnitude = Math.Sqrt(embedding.Sum(x => x * x));
    for (int i = 0; i < embedding.Length; i++)
    {
        embedding[i] /= (float)magnitude;
    }
    
    return embedding;
}

// Sample technical documents: an id, the text itself, and descriptive attributes per entry
var techDocuments = new[]
{
    new
    {
        id = "tech_001",
        content = "Machine learning algorithms require careful feature engineering and model selection.",
        category = "ML",
        difficulty = "Advanced",
        tags = new[] { "machine-learning", "algorithms", "features" }
    },
    new
    {
        id = "tech_002",
        content = "Database indexing strategies can significantly improve query performance in large datasets.",
        category = "Database",
        difficulty = "Intermediate",
        tags = new[] { "database", "performance", "indexing" }
    },
    new
    {
        id = "tech_003",
        content = "Microservices architecture enables scalable and maintainable distributed systems.",
        category = "Architecture",
        difficulty = "Advanced",
        tags = new[] { "microservices", "scalability", "architecture" }
    }
};

// Project the documents into the parallel lists that are passed to AddAsync below
var techIds = techDocuments.Select(item => item.id).ToList();
var techTexts = techDocuments.Select(item => item.content).ToList();
var techEmbeddings = techDocuments
    .Select(item => GenerateEmbedding(item.content).ToList())
    .ToList();
var techMetadata = techDocuments
    .Select(item => new Dictionary<string, object>
    {
        { "category", item.category },
        { "difficulty", item.difficulty },
        { "tags", JsonSerializer.Serialize(item.tags) },      // tag array stored as a JSON string
        { "word_count", item.content.Split(' ').Length },     // space-separated token count
        { "added_date", DateTime.UtcNow.ToString("O") }
    })
    .ToList();

// Store ids, texts, embeddings and metadata in the technical collection in one call
await techDocsCollection.AddAsync(
    ids: techIds,
    documents: techTexts,
    embeddings: techEmbeddings,
    metadatas: techMetadata);

Console.WriteLine($"‚úÖ Added {techDocuments.Length} technical documents");

// Sample business documents: an id, the text itself, owning department, priority and tags
var businessDocuments = new[]
{
    new
    {
        id = "biz_001",
        content = "Employee onboarding process includes documentation review and system access setup.",
        department = "HR",
        priority = "High",
        tags = new[] { "onboarding", "hr", "process" }
    },
    new
    {
        id = "biz_002",
        content = "Financial reporting requirements must comply with accounting standards and regulations.",
        department = "Finance",
        priority = "Critical",
        tags = new[] { "finance", "reporting", "compliance" }
    },
    new
    {
        id = "biz_003",
        content = "Customer support escalation procedures ensure timely resolution of complex issues.",
        department = "Support",
        priority = "Medium",
        tags = new[] { "support", "escalation", "customer-service" }
    }
};

// Project the documents into the parallel lists that are passed to AddAsync below
var bizIds = businessDocuments.Select(item => item.id).ToList();
var bizTexts = businessDocuments.Select(item => item.content).ToList();
var bizEmbeddings = businessDocuments
    .Select(item => GenerateEmbedding(item.content).ToList())
    .ToList();
var bizMetadata = businessDocuments
    .Select(item => new Dictionary<string, object>
    {
        { "department", item.department },
        { "priority", item.priority },
        { "tags", JsonSerializer.Serialize(item.tags) },      // tag array stored as a JSON string
        { "word_count", item.content.Split(' ').Length },     // space-separated token count
        { "added_date", DateTime.UtcNow.ToString("O") }
    })
    .ToList();

// Store ids, texts, embeddings and metadata in the business collection in one call
await businessDocsCollection.AddAsync(
    ids: bizIds,
    documents: bizTexts,
    embeddings: bizEmbeddings,
    metadatas: bizMetadata);

Console.WriteLine($"‚úÖ Added {businessDocuments.Length} business documents");

## Similarity Search

Perform vector similarity searches against a collection using a query embedding.

In [None]:
// Query the technical collection with an embedding of the search phrase
var techQuery = "performance optimization techniques";
var techQueryEmbedding = GenerateEmbedding(techQuery);

// A single query embedding is sent, so the hits are read from index 0 of each result list
var techResults = await techDocsCollection.QueryAsync(
    queryEmbeddings: new[] { techQueryEmbedding.ToList() },
    nResults: 5,
    include: new[] { "documents", "metadatas", "distances" });

Console.WriteLine($"üîç Technical search results for: '{techQuery}'\n");
foreach (var hit in Enumerable.Range(0, techResults.Documents[0].Count))
{
    var text = techResults.Documents[0][hit];
    var meta = techResults.Metadatas[0][hit];
    // A missing distance list is treated as distance 0; the score is reported as 1 - distance
    var score = 1 - (techResults.Distances?[0][hit] ?? 0);

    Console.WriteLine($"Score: {score:F3} | Category: {meta["category"]}");
    Console.WriteLine($"  {text}");
    Console.WriteLine($"  Tags: {meta["tags"]}");
    Console.WriteLine();
}

## Filtered Search

Combine similarity search with metadata filtering for precise results.

In [None]:
// Search with metadata filters
var businessQuery = "employee procedures";
var businessQueryEmbedding = GenerateEmbedding(businessQuery);

// Filter for high priority HR documents.
// ChromaDB accepts exactly one field or operator per level of a `where` filter, so the
// department and priority conditions are combined explicitly under "$and" instead of being
// placed side by side as two top-level keys (which the server rejects).
var filteredResults = await businessDocsCollection.QueryAsync(
    queryEmbeddings: new[] { businessQueryEmbedding.ToList() },
    nResults: 10,
    where: new Dictionary<string, object>
    {
        ["$and"] = new[]
        {
            new Dictionary<string, object> { ["department"] = "HR" },
            new Dictionary<string, object>
            {
                ["priority"] = new Dictionary<string, object>
                {
                    ["$in"] = new[] { "High", "Critical" }
                }
            }
        }
    },
    include: new[] { "documents", "metadatas", "distances" });

Console.WriteLine($"üéØ Filtered search results for: '{businessQuery}' (HR, High/Critical priority)\n");
if (filteredResults.Documents[0].Any())
{
    // One query embedding was sent, so all hits live at index 0 of each result list
    for (int i = 0; i < filteredResults.Documents[0].Count; i++)
    {
        var doc = filteredResults.Documents[0][i];
        var metadata = filteredResults.Metadatas[0][i];
        var distance = filteredResults.Distances?[0][i] ?? 0;
        var similarity = 1 - distance;
        
        Console.WriteLine($"Score: {similarity:F3} | Dept: {metadata["department"]} | Priority: {metadata["priority"]}");
        Console.WriteLine($"  {doc}");
        Console.WriteLine();
    }
}
else
{
    Console.WriteLine("No matching documents found with the specified filters.");
}

// Logical-OR filter: technical documents whose category is either Architecture or Database
var advancedTechResults = await techDocsCollection.QueryAsync(
    queryEmbeddings: new[] { GenerateEmbedding("distributed systems design").ToList() },
    nResults: 5,
    where: new Dictionary<string, object>
    {
        {
            "$or",
            new[]
            {
                new Dictionary<string, object> { { "category", "Architecture" } },
                new Dictionary<string, object> { { "category", "Database" } }
            }
        }
    },
    include: new[] { "documents", "metadatas", "distances" });

Console.WriteLine("üîß Advanced filtered search (Architecture OR Database):");
foreach (var hit in Enumerable.Range(0, advancedTechResults.Documents[0].Count))
{
    var text = advancedTechResults.Documents[0][hit];
    var meta = advancedTechResults.Metadatas[0][hit];
    var score = 1 - (advancedTechResults.Distances?[0][hit] ?? 0);

    // Show at most the first 50 characters of the document text
    var preview = text[..Math.Min(50, text.Length)];
    Console.WriteLine($"  {score:F3} | {meta["category"]} | {preview}...");
}

## Collection Analytics

Analyze collection statistics and document patterns.

In [None]:
// Document counts for each collection
var techCount = await techDocsCollection.CountAsync();
var businessCount = await businessDocsCollection.CountAsync();

Console.WriteLine("üìä Collection Statistics:\n");
Console.WriteLine($"Technical Documents: {techCount} documents");
Console.WriteLine($"Business Documents: {businessCount} documents");
Console.WriteLine($"Total Documents: {techCount + businessCount}");

// Fetch the technical documents together with their metadata for grouping
var allTechDocs = await techDocsCollection.GetAsync(
    include: new[] { "documents", "metadatas" });

// Group by the "category" metadata value, largest group first
Console.WriteLine("\nüè∑Ô∏è Technical Document Categories:");
var techCategories = allTechDocs.Metadatas
    .GroupBy(meta => meta["category"].ToString())
    .OrderByDescending(grp => grp.Count());

foreach (var bucket in techCategories)
{
    Console.WriteLine($"  {bucket.Key}: {bucket.Count()} documents");
}

// Group by the "difficulty" metadata value, largest group first
Console.WriteLine("\nüìà Difficulty Levels:");
var difficultyLevels = allTechDocs.Metadatas
    .GroupBy(meta => meta["difficulty"].ToString())
    .OrderByDescending(grp => grp.Count());

foreach (var bucket in difficultyLevels)
{
    Console.WriteLine($"  {bucket.Key}: {bucket.Count()} documents");
}

// Fetch the business documents together with their metadata for grouping
var allBizDocs = await businessDocsCollection.GetAsync(
    include: new[] { "documents", "metadatas" });

// Group by the "department" metadata value, largest group first
Console.WriteLine("\nüè¢ Business Document Departments:");
var departments = allBizDocs.Metadatas
    .GroupBy(meta => meta["department"].ToString())
    .OrderByDescending(grp => grp.Count());

foreach (var bucket in departments)
{
    Console.WriteLine($"  {bucket.Key}: {bucket.Count()} documents");
}

// Group by the "priority" metadata value and list the groups from most to least urgent;
// any value outside the known list sorts last (rank 4)
Console.WriteLine("\n‚ö° Priority Levels:");
var priorityOrder = new[] { "Critical", "High", "Medium", "Low" };
var priorities = allBizDocs.Metadatas
    .GroupBy(meta => meta["priority"].ToString())
    .OrderBy(grp =>
    {
        var rank = Array.IndexOf(priorityOrder, grp.Key);
        return rank < 0 ? 4 : rank;
    });

foreach (var bucket in priorities)
{
    Console.WriteLine($"  {bucket.Key}: {bucket.Count()} documents");
}

## Document Management

Update, delete, and manage documents in collections.

In [None]:
// Update document metadata.
// The text of tech_002 is not changed here, so its word count is recomputed from the source
// document with the same Split(' ') rule used at ingest. The previous hard-coded value of 12
// disagreed with the 11 that rule produces for this text.
var tech002WordCount = techDocuments
    .First(d => d.id == "tech_002")
    .content.Split(' ').Length;

await techDocsCollection.UpdateAsync(
    ids: new[] { "tech_002" },
    metadatas: new[] { new Dictionary<string, object>
    {
        ["category"] = "Database",
        ["difficulty"] = "Advanced", // Updated from Intermediate
        ["tags"] = JsonSerializer.Serialize(new[] { "database", "performance", "indexing", "optimization" }),
        ["word_count"] = tech002WordCount,
        ["updated_date"] = DateTime.UtcNow.ToString("O"),
        ["version"] = "2.0"
    }});

Console.WriteLine("‚úÖ Updated tech_002 document metadata");

// Retrieve updated document to verify changes
var updatedDoc = await techDocsCollection.GetAsync(
    ids: new[] { "tech_002" },
    include: new[] { "documents", "metadatas" });

if (updatedDoc.Metadatas.Any())
{
    var metadata = updatedDoc.Metadatas.First();
    Console.WriteLine("\nüìù Updated document metadata:");
    Console.WriteLine($"  Category: {metadata["category"]}");
    Console.WriteLine($"  Difficulty: {metadata["difficulty"]}");
    // "version" and "updated_date" only exist after an update, hence the fallback text
    Console.WriteLine($"  Version: {metadata.GetValueOrDefault("version", "N/A")}");
    Console.WriteLine($"  Updated: {metadata.GetValueOrDefault("updated_date", "N/A")}");
}

// Add a new document to existing collection.
// The text is held in one variable so the stored document, its embedding and its word count
// are all derived from the same string. The word count uses the same Split(' ') rule as the
// initial ingest; the previous hard-coded value of 11 was wrong for this 10-word sentence.
var newDocContent = "Container orchestration platforms like Kubernetes enable automated deployment and scaling.";

await techDocsCollection.AddAsync(
    ids: new[] { "tech_004" },
    documents: new[] { newDocContent },
    embeddings: new[] { GenerateEmbedding(newDocContent).ToList() },
    metadatas: new[] { new Dictionary<string, object>
    {
        ["category"] = "DevOps",
        ["difficulty"] = "Advanced",
        ["tags"] = JsonSerializer.Serialize(new[] { "kubernetes", "containers", "orchestration", "devops" }),
        ["word_count"] = newDocContent.Split(' ').Length,
        ["added_date"] = DateTime.UtcNow.ToString("O")
    }});

Console.WriteLine("‚úÖ Added new DevOps document to technical collection");

// Verify the new document count
var newTechCount = await techDocsCollection.CountAsync();
Console.WriteLine($"üìä Updated technical document count: {newTechCount}");

## Cross-Collection Search

Search across multiple collections for comprehensive results.

In [None]:
// Run the same query embedding against both collections
var searchQuery = "process optimization";
var queryEmbedding = GenerateEmbedding(searchQuery);

// Top 3 hits from the technical collection
var techSearchResults = await techDocsCollection.QueryAsync(
    queryEmbeddings: new[] { queryEmbedding.ToList() },
    nResults: 3,
    include: new[] { "documents", "metadatas", "distances" });

// Top 3 hits from the business collection
var bizSearchResults = await businessDocsCollection.QueryAsync(
    queryEmbeddings: new[] { queryEmbedding.ToList() },
    nResults: 3,
    include: new[] { "documents", "metadatas", "distances" });

Console.WriteLine($"üîç Cross-collection search for: '{searchQuery}'\n");

// Merged hit list; each entry remembers which collection it came from
var allResults = new List<(string source, string doc, Dictionary<string, object> metadata, double similarity)>();

// Technical hits (a missing distance list is treated as distance 0)
foreach (var hit in Enumerable.Range(0, techSearchResults.Documents[0].Count))
{
    var score = 1 - (techSearchResults.Distances?[0][hit] ?? 0);
    var text = techSearchResults.Documents[0][hit];
    var meta = techSearchResults.Metadatas[0][hit];
    allResults.Add(("Technical", text, meta, score));
}

// Business hits
foreach (var hit in Enumerable.Range(0, bizSearchResults.Documents[0].Count))
{
    var score = 1 - (bizSearchResults.Distances?[0][hit] ?? 0);
    var text = bizSearchResults.Documents[0][hit];
    var meta = bizSearchResults.Metadatas[0][hit];
    allResults.Add(("Business", text, meta, score));
}

// Rank the merged hits by score and keep the best five
var topResults = allResults.OrderByDescending(entry => entry.similarity).Take(5);

Console.WriteLine("üèÜ Top results across all collections:");
foreach (var result in topResults)
{
    Console.WriteLine($"Score: {result.similarity:F3} | Source: {result.source}");
    Console.WriteLine($"  {result.doc}");

    // The two collections carry different metadata keys, so the detail line depends on the source
    var details = result.source == "Technical"
        ? $"  Category: {result.metadata["category"]} | Difficulty: {result.metadata["difficulty"]}"
        : $"  Department: {result.metadata["department"]} | Priority: {result.metadata["priority"]}";
    Console.WriteLine(details);
    Console.WriteLine();
}

## Cleanup & Summary

Clean up collections and summarize the ChromaDB workflow.

In [None]:
// Closing banner followed by the list of steps this notebook walked through
Console.WriteLine("üéâ ChromaDB Vector Database Workflow Complete!\n");
foreach (var line in new[]
{
    "What we accomplished:",
    "‚úÖ Created and managed multiple document collections",
    "‚úÖ Stored documents with embeddings and rich metadata",
    "‚úÖ Performed similarity searches with vector embeddings",
    "‚úÖ Applied metadata filtering for precise results",
    "‚úÖ Analyzed collection statistics and patterns",
    "‚úÖ Updated and managed documents dynamically",
    "‚úÖ Executed cross-collection searches for comprehensive results"
})
{
    Console.WriteLine(line);
}

Console.WriteLine();

// Feature recap
foreach (var line in new[]
{
    "üîó Key ChromaDB Features Used:",
    "  - Vector similarity search with cosine distance",
    "  - Metadata filtering with complex query operators",
    "  - Document and embedding management",
    "  - Collection organization and analytics",
    "  - Real-time updates and modifications"
})
{
    Console.WriteLine(line);
}

// Re-count both collections so the summary reflects the documents added earlier
var finalTechCount = await techDocsCollection.CountAsync();
var finalBizCount = await businessDocsCollection.CountAsync();

Console.WriteLine("\nüìä Final Statistics:");
Console.WriteLine($"  Technical Documents: {finalTechCount}");
Console.WriteLine($"  Business Documents: {finalBizCount}");
Console.WriteLine($"  Total Documents: {finalTechCount + finalBizCount}");

// Nothing is deleted here; the delete calls are only printed as a hint for the reader
Console.WriteLine("\nüßπ To clean up collections, uncomment and run:");
Console.WriteLine("// await chromaClient.DeleteCollectionAsync(\"tech_documents\");");
Console.WriteLine("// await chromaClient.DeleteCollectionAsync(\"business_documents\");");