In [3]:

import chromadb
from chromadb.utils import embedding_functions
from pathlib import Path

# Configuration
# Directory handed to chromadb.PersistentClient as the on-disk store location.
CHROMA_PATH = "./enhanced_embeddings"
# Name of the collection opened with client.get_collection().
COLLECTION_NAME = "research_paper_sections"
# Model name passed to SentenceTransformerEmbeddingFunction.
MODEL_NAME = "all-MiniLM-L6-v2"


def retrieve_all_chunks():
    """Print every chunk in the ChromaDB collection, grouped by paper and section.

    Connects to the persistent store at ``CHROMA_PATH``, opens
    ``COLLECTION_NAME`` and, for each ``paper_id`` / ``section_category``
    found in the chunk metadata, prints the chunk ID, its metadata, a
    150-character text preview and the text length. A per-paper summary of
    chunk counts per section is printed at the end.

    Returns:
        None. Everything is written to stdout. The function returns early,
        after printing an error, when ``CHROMA_PATH`` does not exist or the
        collection cannot be opened.
    """
    print(f"Connecting to ChromaDB at: {CHROMA_PATH}")
    print("=" * 80)

    # Check the path up front so a typo is reported instead of silently
    # continuing against a store that is not there.
    if not Path(CHROMA_PATH).exists():
        print(f"❌ Error: ChromaDB path does not exist: {CHROMA_PATH}")
        return

    client = chromadb.PersistentClient(path=CHROMA_PATH)

    # The embedding function is best-effort: reading stored records does not
    # need it, so fall back to None when the model cannot be loaded.
    try:
        embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=MODEL_NAME
        )
    except Exception as e:
        print(f"Warning: Could not initialize SentenceTransformer: {e}")
        embed_func = None

    try:
        collection = client.get_collection(
            name=COLLECTION_NAME,
            embedding_function=embed_func
        )
    except Exception as e:
        print(f"❌ Error: Could not get collection '{COLLECTION_NAME}': {e}")
        print("\nAvailable collections:")
        for col in client.list_collections():
            print(f"  - {col.name}")
        return

    print(f"✅ Successfully connected to collection: '{COLLECTION_NAME}'")
    print(f"📊 Total documents in collection: {collection.count()}")
    print("=" * 80)

    # Embeddings are never displayed, so do not ask for them: fetching every
    # vector only costs memory and time.
    all_data = collection.get(include=['documents', 'metadatas'])
    chunk_ids = all_data['ids']

    print(f"\n📋 Retrieved {len(chunk_ids)} chunks\n")

    # papers[paper_id][section_name] -> list of chunk records. Dicts keep
    # insertion order, so papers and sections print in first-seen order.
    papers = {}
    for chunk_id, metadata, document in zip(
        chunk_ids, all_data['metadatas'], all_data['documents']
    ):
        # A record stored without metadata or text comes back as None.
        metadata = metadata or {}
        document = document or ""

        paper_id = metadata.get('paper_id', 'Unknown')
        section = metadata.get('section_category', 'Unknown')
        papers.setdefault(paper_id, {}).setdefault(section, []).append({
            'id': chunk_id,
            'metadata': metadata,
            'document': document
        })

    # Detailed listing.
    for paper_id, sections in papers.items():
        total_chunks = sum(len(items) for items in sections.values())
        print("=" * 80)
        print(f"📄 PAPER: {paper_id}")
        print(f"   Total Chunks: {total_chunks}")
        print("=" * 80)

        for section_name, section_chunks in sections.items():
            print(f"\n  📑 Section: {section_name}")
            print(f"     Chunks in this section: {len(section_chunks)}")
            print("     " + "-" * 70)

            for idx, chunk in enumerate(section_chunks, 1):
                metadata = chunk['metadata']
                doc_text = chunk['document']

                print(f"\n     Chunk {idx}:")
                print(f"       ID: {chunk['id']}")
                print(f"       Metadata: {metadata}")
                print(f"       Page Number: {metadata.get('page_number', 'N/A')}")
                print(f"       Chunk Number: {metadata.get('chunk_number', 'N/A')}")
                print(f"       Text Preview: {doc_text[:150]}...")
                print(f"       Text Length: {len(doc_text)} characters")

        print("\n")

    # Summary statistics, reusing the grouping built above.
    print("=" * 80)
    print("📊 SUMMARY STATISTICS")
    print("=" * 80)
    print(f"Total Papers: {len(papers)}")

    for paper_id, sections in papers.items():
        print(f"\n{paper_id}:")
        # Sort on the string form so mixed-type section keys cannot raise.
        for section in sorted(sections, key=str):
            print(f"  - {section}: {len(sections[section])} chunks")


def search_specific_section(paper_id: str, section_name: str):
    """Print the chunks of one paper that belong to one section.

    Opens ``COLLECTION_NAME`` in the persistent store at ``CHROMA_PATH`` and
    fetches the records whose metadata matches both ``paper_id`` and
    ``section_category``. For each match it prints the ID, the metadata and a
    200-character text preview.

    Args:
        paper_id: Value compared against the ``paper_id`` metadata field.
            NOTE(review): the filter presumably compares value and type, so
            pass the ID in the type it was stored with (e.g. ``1`` rather
            than ``"1"`` if ingestion stored integers) -- confirm.
        section_name: Value compared against the ``section_category``
            metadata field.

    Returns:
        None. Results are written to stdout.

    Raises:
        Exception: whatever ``client.get_collection`` or ``collection.get``
            raise is propagated unchanged (e.g. the collection is missing).
    """
    print(f"\n🔍 Searching for: Paper={paper_id}, Section={section_name}")
    print("=" * 80)

    client = chromadb.PersistentClient(path=CHROMA_PATH)

    # Best-effort: a metadata-filtered get() does not embed anything, so a
    # missing model must not abort the search. Catch Exception rather than
    # using a bare except so Ctrl-C / SystemExit still propagate, and report
    # why the fallback happened.
    try:
        embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=MODEL_NAME
        )
    except Exception as e:
        print(f"Warning: Could not initialize SentenceTransformer: {e}")
        embed_func = None

    collection = client.get_collection(
        name=COLLECTION_NAME,
        embedding_function=embed_func
    )

    # Both conditions must hold for a record to be returned.
    results = collection.get(
        where={
            "$and": [
                {"paper_id": paper_id},
                {"section_category": section_name}
            ]
        },
        include=['documents', 'metadatas']
    )

    print(f"✅ Found {len(results['ids'])} chunks")
    print("=" * 80)

    matches = zip(results['ids'], results['metadatas'], results['documents'])
    for position, (chunk_id, metadata, document) in enumerate(matches, 1):
        # A record stored without text comes back as None.
        document = document or ""

        print(f"\nChunk {position}:")
        print(f"  ID: {chunk_id}")
        print(f"  Metadata: {metadata}")
        print(f"  Text Preview: {document[:200]}...")
        print("-" * 80)


# Dump the whole collection when this cell is executed.
retrieve_all_chunks()


Connecting to ChromaDB at: ./enhanced_embeddings
✅ Successfully connected to collection: 'research_paper_sections'
📊 Total documents in collection: 132

📋 Retrieved 132 chunks

📄 PAPER: 1
   Total Chunks: 24

  📑 Section: Introduction
     Chunks in this section: 3
     ----------------------------------------------------------------------

     Chunk 1:
       ID: paper-1-0-Introduction
       Metadata: {'section_category': 'Introduction', 'page_number': 1, 'paper_id': 1, 'chunk_number': 0}
       Page Number: 1
       Chunk Number: 0
       Text Preview: I. INTRODUCTION Writing the computer code necessary for software to run is known as software programming. The terms used in the ﬁeld of computer techn...
       Text Length: 1248 characters

     Chunk 2:
       ID: paper-1-1-Introduction
       Metadata: {'paper_id': 1, 'page_number': 1, 'section_category': 'Introduction', 'chunk_number': 1}
       Page Number: 1
       Chunk Number: 1
       Text Preview: the entire