##### Vectorization Pipeline Components:
1. Chroma Integration
2. Semantic Document Preparation
3. Semantic Search Testing
4. Database Analytics

##### Usage Flow:
- Input: `results/classification/{llm}/output.csv` (one CSV per LLM), produced by the classification/loyalty_classifier_nb notebook
- Processing: Convert semantic classifications to embeddings
- Output: Searchable Chroma vector database (stored under `results/vectorization/{embedder}/loyalty_points_kb`) for RAG queries

##### Test Queries Included:
- "loyalty points calculation rules"
- "order processing workflow"
- "customer data integration"
- "payment service integration"
- "business rule patterns"

In [2]:
classification_results = "results/classification"
classification_data = "{llm}/output.csv"
vector_output = "results/vectorization/{embedder}/loyalty_points_kb"
vector_collection = "loyalty_code_semantics_{llm}"
# "all-MiniLM-L6-v2",      # Fast, good general purpose
# "all-mpnet-base-v2",     # Better quality, slower
# "multi-qa-MiniLM-L6-cos-v1"  # Optimized for Q&A

def setup(llm: str, embedder: str):
    config = { "llm": llm, "embedder": embedder }

    classification_csv = f"{classification_results}/{classification_data.format(**config)}"
    db_path = vector_output.format(**config)
    collection_name = vector_collection.format(**config)

    return classification_csv, db_path, collection_name

In [3]:
import json
from typing import Iterable, Optional

from vectorization.document_utils import prepare_documents_for_embedding, load_classification_data
from vectorization.semantic_vector_database import SemanticVectorDatabase

def run_vectorization_pipeline(
    db_path: str,
    classification_csv: str,
    collection_name: str,
    embedding_model: str,
    reset_db: bool = True,
    test_queries: Optional[Iterable[str]] = None,
    n_results: int = 3,
):
    """Create a vector database collection from classification results and smoke-test it.

    Steps, in order: open the database, create the collection, load the
    classification CSV, convert it to documents, add the documents to the
    collection, print the collection statistics as JSON, then run every
    test query against the collection.

    Args:
        db_path: Database location passed to ``SemanticVectorDatabase``.
        classification_csv: CSV path passed to ``load_classification_data``.
        collection_name: Name passed to ``create_collection``.
        embedding_model: Embedding model name passed to ``SemanticVectorDatabase``.
        reset_db: Forwarded as the second argument of ``create_collection``.
            Presumably drops an existing collection of the same name first;
            confirm in ``SemanticVectorDatabase``.
        test_queries: Queries to run after the documents are added. ``None``
            (the default) runs the five built-in queries; an empty iterable
            runs no queries.
        n_results: Number of results requested per query.

    Returns:
        None. The return values of ``semantic_search`` are discarded; the
        helper presumably reports its own results (confirm in the collection
        class).
    """
    if test_queries is None:
        # Built-in smoke-test queries, used when the caller supplies none.
        test_queries = [
            "loyalty points calculation rules",
            "order processing workflow",
            "customer data integration",
            "payment service integration",
            "business rule patterns",
        ]

    print("=== CodeSense Vector Database Creation ===")

    # Initialize database
    vector_db = SemanticVectorDatabase(db_path, embedding_model)

    # Create collection
    collection = vector_db.create_collection(collection_name, reset_db)

    # Prepare documents from classification data
    df = load_classification_data(classification_csv)
    documents = prepare_documents_for_embedding(df)

    # Add to collection
    collection.add_documents_to_collection(documents)

    # Get collection statistics; default=str keeps non-JSON-native values printable
    stats = collection.get_collection_stats_v1()
    print("\n=== Collection Statistics ===")
    print(json.dumps(stats, indent=2, default=str))

    # Test semantic search
    print("\n=== Testing Semantic Search ===")
    for query in test_queries:
        collection.semantic_search(query, n_results=n_results)

In [7]:
# Anthropic (claude4.0) classifications, embedded with all-MiniLM-L6-v2.
print("\n=== Running Vectorization with Anthropic set and  all-MiniLM-L6-v2 ===")
classification_csv, db_path, collection_name = setup(
    llm="claude4.0",
    embedder="all-MiniLM-L6-v2",
)
run_vectorization_pipeline(
    db_path=db_path,
    classification_csv=classification_csv,
    collection_name=collection_name,
    embedding_model="all-MiniLM-L6-v2",
)


=== Running Vectorization with Anthropic set and  all-MiniLM-L6-v2 ===
=== CodeSense Vector Database Creation ===
Initialized Chroma database at: results\vectorization\all-MiniLM-L6-v2\loyalty_points_kb
Using embedding model: all-MiniLM-L6-v2
Deleted existing collection: loyalty_code_semantics_claude4.0
Collection 'loyalty_code_semantics_claude4.0' ready with 0 documents
Loaded 33 classification records from results/classification/claude4.0/output.csv
Prepared 33 documents for embedding
Added batch 1: 33/33 documents
Successfully added 33 documents to collection

=== Collection Statistics ===
{
  "total_documents": 33,
  "projects": {
    "PlantBasedPizza.LoyaltyPoints.Api.csproj": 6,
    "PlantBasedPizza.LoyaltyPoints.Internal.csproj": 7,
    "PlantBasedPizza.LoyaltyPoints.Shared.csproj": 13,
    "PlantBasedPizza.LoyaltyPoints.Worker.csproj": 7
  },
  "file_types": {
    "cs": 21,
    "appsettings": 12
  },
  "llms": {
    "Anthropic-claude-sonnet-4-0": 33
  },
  "technical_patterns"

In [9]:
# Anthropic (claude4.0) classifications, embedded with all-mpnet-base-v2.
print("\n=== Running Vectorization with Anthropic set and  all-mpnet-base-v2 ===")
classification_csv, db_path, collection_name = setup(
    llm="claude4.0",
    embedder="all-mpnet-base-v2",
)
run_vectorization_pipeline(
    db_path=db_path,
    classification_csv=classification_csv,
    collection_name=collection_name,
    embedding_model="all-mpnet-base-v2",
)


=== Running Vectorization with Anthropic set and  all-mpnet-base-v2 ===
=== CodeSense Vector Database Creation ===
Initialized Chroma database at: results\vectorization\all-mpnet-base-v2\loyalty_points_kb
Using embedding model: all-mpnet-base-v2
Collection 'loyalty_code_semantics_claude4.0' ready with 0 documents
Loaded 33 classification records from results/classification/claude4.0/output.csv
Prepared 33 documents for embedding
Added batch 1: 33/33 documents
Successfully added 33 documents to collection

=== Collection Statistics ===
{
  "total_documents": 33,
  "projects": {
    "PlantBasedPizza.LoyaltyPoints.Api.csproj": 6,
    "PlantBasedPizza.LoyaltyPoints.Internal.csproj": 7,
    "PlantBasedPizza.LoyaltyPoints.Shared.csproj": 13,
    "PlantBasedPizza.LoyaltyPoints.Worker.csproj": 7
  },
  "file_types": {
    "cs": 21,
    "appsettings": 12
  },
  "llms": {
    "Anthropic-claude-sonnet-4-0": 33
  },
  "technical_patterns": {
    "Cross-cutting concern implementation using extensi

In [None]:
# Ollama (codellama) classifications, embedded with all-MiniLM-L6-v2.
print("\n=== Running Vectorization with Ollama set and  all-MiniLM-L6-v2 ===")
classification_csv, db_path, collection_name = setup(
    llm="codellama",
    embedder="all-MiniLM-L6-v2",
)
run_vectorization_pipeline(
    db_path=db_path,
    classification_csv=classification_csv,
    collection_name=collection_name,
    embedding_model="all-MiniLM-L6-v2",
)

In [None]:
# Ollama (codellama) classifications, embedded with all-mpnet-base-v2.
print("\n=== Running Vectorization with Ollama set and  all-mpnet-base-v2 ===")
classification_csv, db_path, collection_name = setup(
    llm="codellama",
    embedder="all-mpnet-base-v2",
)
run_vectorization_pipeline(
    db_path=db_path,
    classification_csv=classification_csv,
    collection_name=collection_name,
    embedding_model="all-mpnet-base-v2",
)

In [None]:
# OpenAI (gpt4.1) classifications, embedded with all-MiniLM-L6-v2.
print("\n=== Running Vectorization with OpenAI set and  all-MiniLM-L6-v2 ===")
classification_csv, db_path, collection_name = setup(
    llm="gpt4.1",
    embedder="all-MiniLM-L6-v2",
)
run_vectorization_pipeline(
    db_path=db_path,
    classification_csv=classification_csv,
    collection_name=collection_name,
    embedding_model="all-MiniLM-L6-v2",
)

In [None]:
# OpenAI (gpt4.1) classifications, embedded with all-mpnet-base-v2.
print("\n=== Running Vectorization with OpenAI set and  all-mpnet-base-v2 ===")
classification_csv, db_path, collection_name = setup(
    llm="gpt4.1",
    embedder="all-mpnet-base-v2",
)
run_vectorization_pipeline(
    db_path=db_path,
    classification_csv=classification_csv,
    collection_name=collection_name,
    embedding_model="all-mpnet-base-v2",
)