In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import TokenTextSplitter
from config import DOCUMENT_PATH, EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_PATH, TOP_K,MAX_TOKENS_GENERATE,MEMORY_LENGTH,DOCUMENT_PATH_PERSONAL
from llm_loader import load_llm
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import HierarchicalNodeParser, SemanticSplitterNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SimilarityPostprocessor
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

from llama_index.core import PromptTemplate
from llama_index.core.chat_engine import CondenseQuestionChatEngine
from llama_index.core.llms import ChatMessage, MessageRole

from llama_index.packs.fusion_retriever.hybrid_fusion.base import HybridFusionRetrieverPack
# Retriever imports
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever


import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

from helper import check_collection_exist
from datetime import datetime


In [2]:
llm = load_llm()
Settings.llm = llm  # Assign model globally in LlamaIndex

# Initialize and verify embedding model
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./embedding_cache"
)

# Explicitly set the embedding model
Settings.embed_model = embed_model

# Verify the embedding model
print("Current embedding model:", type(Settings.embed_model).__name__)

# Test embedding generation
test_text = "This is a test sentence."
test_embedding = Settings.embed_model.get_text_embedding(test_text)
print(f"Successfully generated embedding of length: {len(test_embedding)}")


llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

Current embedding model: HuggingFaceEmbedding
Successfully generated embedding of length: 384


In [3]:
## Configurations
DELETE_FLAG = True
COLLECTION_NAME = "financial_reports_2"
#DOCUMENT_PATH = "./data/source_files/"

## Save into Chroma DB

In [6]:
class ChromaCollectionManager:
    def __init__(self, collection_name: str, db_path: str = "./chroma_db"):
        """Initialize ChromaDB collection manager."""
        self.collection_name = collection_name
        self.client = chromadb.PersistentClient(path=db_path)
        
    def get_or_create_collection(self, delete_existing: bool = False) -> chromadb.Collection:
        """Get existing collection or create new one."""
        try:
            if delete_existing:
                self.delete_collection()
                
            collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"created_at": datetime.now().isoformat()}
            )
            print(f"Successfully connected to collection '{self.collection_name}'")
            return collection
            
        except Exception as e:
            print(f"Error managing collection: {str(e)}")
            raise

    def delete_collection(self) -> None:
        """Delete collection if it exists."""
        try:
            self.client.delete_collection(self.collection_name)
            print(f"Deleted existing collection '{self.collection_name}'")
        except Exception:
            print("No existing collection to delete")

    def add_documents(self, collection: chromadb.Collection, 
                     documents_path: str,
                     chunk_size: int = 1024,
                     chunk_overlap: int = 50) -> None:
        """Add documents to collection with chunking."""
        try:
            # Load and process documents
            documents = SimpleDirectoryReader(documents_path).load_data()
            splitter = SentenceSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
            nodes = splitter.get_nodes_from_documents(documents)

            # Prepare batches (ChromaDB recommends batching)
            batch_size = 100
            for i in range(0, len(nodes), batch_size):
                batch = nodes[i:i + batch_size]
                
                texts = [node.get_text() for node in batch]
                embeddings = [Settings.embed_model.get_text_embedding(text) for text in texts]
                ids = [f"doc_{i + idx}" for idx in range(len(batch))]
                
                collection.add(
                    ids=ids,
                    documents=texts,
                    embeddings=embeddings
                )
                
            print(f"Stored {len(nodes)} document chunks in collection '{self.collection_name}'")
            
        except Exception as e:
            print(f"Error adding documents: {str(e)}")
            raise

    def add_documents_semantic(self, collection: chromadb.Collection, 
                              documents_path: str,
                              chunk_size: int = 1024,
                              chunk_overlap: int = 50,
                              buffer_size: int = 1,
                              breakpoint_percentile: int = 95,
                              debug_mode: bool = False) -> None:
        """Add documents to collection using semantic chunking only.
        
        Args:
            collection: ChromaDB collection to add documents to
            documents_path: Path to documents to add
            chunk_size: Base chunk size before semantic splitting
            chunk_overlap: Amount of overlap between chunks
            buffer_size: Number of sentences to consider when calculating similarity
            breakpoint_percentile: Percentile threshold for creating chunk boundaries (higher = more chunks)
            debug_mode: Whether to print debug information
        """
        try:
            # Load documents
            documents = SimpleDirectoryReader(documents_path).load_data()
            print(f"Loaded {len(documents)} documents")
            
            # First use a basic sentence splitter to get initial chunks
            # This creates a baseline before semantic refinement
            basic_splitter = SentenceSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            base_nodes = basic_splitter.get_nodes_from_documents(documents)
            
            # Apply semantic splitting
            semantic_splitter = SemanticSplitterNodeParser.from_defaults(
                buffer_size=buffer_size,
                breakpoint_percentile_threshold=breakpoint_percentile,
                embed_model=Settings.embed_model,
                include_metadata=True,
                include_prev_next_rel=True
            )
            semantic_nodes = semantic_splitter.get_nodes_from_documents(base_nodes)
            
            print(f"Created {len(semantic_nodes)} semantic nodes from {len(base_nodes)} base nodes")
            
            # Debug information if requested
            if debug_mode:
                base_lengths = [len(n.get_text()) for n in base_nodes]
                semantic_lengths = [len(n.get_text()) for n in semantic_nodes]
                
                print("\n=== Semantic Splitting Statistics ===")
                print(f"Before: {len(base_nodes)} base nodes")
                print(f"After:  {len(semantic_nodes)} semantic nodes")
                print(f"Change: {len(semantic_nodes) - len(base_nodes)} additional nodes created")
                
                print(f"\nAvg length before: {sum(base_lengths)/len(base_lengths):.2f} chars")
                print(f"Avg length after:  {sum(semantic_lengths)/len(semantic_lengths):.2f} chars")
                print(f"Max length before: {max(base_lengths)}")
                print(f"Max length after:  {max(semantic_lengths)}")
                
                # Sample a node to inspect
                if semantic_nodes:
                    sample = semantic_nodes[0]
                    print("\n=== Sample Semantic Node ===")
                    print(f"Text preview: {sample.get_text()[:100]}...")
                    print("\nMetadata:")
                    for key, value in sample.metadata.items():
                        print(f"  {key}: {value}")
            
            # Store the nodes in batches
            self._store_nodes_in_batches(collection, semantic_nodes, "semantic")
            
        except Exception as e:
            print(f"Error adding documents with semantic chunking: {str(e)}")
            raise

    def add_documents_hierarchical(self, collection: chromadb.Collection, 
                                  documents_path: str,
                                  chunk_sizes: list = [1024, 512, 128],
                                  chunk_overlap: int = 50,
                                  debug_mode: bool = False) -> None:
        """Add documents to collection using hierarchical chunking only.
        
        Args:
            collection: ChromaDB collection to add documents to
            documents_path: Path to documents to add
            chunk_sizes: List of chunk sizes for different hierarchy levels
            chunk_overlap: Amount of overlap between chunks
            debug_mode: Whether to print debug information
        """
        try:
            # Load documents
            documents = SimpleDirectoryReader(documents_path).load_data()
            print(f"Loaded {len(documents)} documents")
            
            # Create hierarchical nodes
            h_node_parser = HierarchicalNodeParser.from_defaults(
                chunk_sizes=chunk_sizes,
                chunk_overlap=chunk_overlap
            )
            hierarchical_nodes = h_node_parser.get_nodes_from_documents(documents)
            
            print(f"Created {len(hierarchical_nodes)} hierarchical nodes")
            
            # Debug information if requested
            if debug_mode:
                # Create a hierarchy map
                hierarchy_map = {}
                for node in hierarchical_nodes:
                    level = node.metadata.get("level", 0)
                    if level not in hierarchy_map:
                        hierarchy_map[level] = 0
                    hierarchy_map[level] += 1
                
                # Display hierarchy statistics
                print("\n=== Hierarchical Structure ===")
                for level in sorted(hierarchy_map.keys()):
                    print(f"Level {level}: {hierarchy_map[level]} nodes")
                
                # Sample nodes from each level for inspection
                print("\n=== Sample Nodes by Level ===")
                for level in sorted(hierarchy_map.keys()):
                    sample_nodes = [n for n in hierarchical_nodes if n.metadata.get("level", 0) == level]
                    if sample_nodes:
                        sample = sample_nodes[0]
                        print(f"\nLEVEL {level} SAMPLE (token length: {len(sample.get_text().split())})")
                        # Print truncated text
                        sample_text = sample.get_text()
                        print(f"{sample_text[:100]}..." if len(sample_text) > 100 else sample_text)
                        
                        # Show metadata
                        print("\nMetadata:")
                        for key, value in sample.metadata.items():
                            # Truncate lengthy metadata values
                            if isinstance(value, list) and len(value) > 3:
                                print(f"  {key}: {value[:3]}... ({len(value)} items)")
                            else:
                                print(f"  {key}: {value}")
            
            # Store the nodes in batches
            self._store_nodes_in_batches(collection, hierarchical_nodes, "hierarchical")
            
        except Exception as e:
            print(f"Error adding documents with hierarchical chunking: {str(e)}")
            raise

    def add_documents_hierarchical_semantic(self, collection: chromadb.Collection, 
                                           documents_path: str,
                                           chunk_sizes: list = [1024, 512, 128],
                                           chunk_overlap: int = 50,
                                           breakpoint_percentile: int = 95,
                                           debug_mode: bool = False) -> None:
        """Add documents using both hierarchical and semantic chunking.
        
        Args:
            collection: ChromaDB collection to add documents to
            documents_path: Path to documents to add
            chunk_sizes: List of chunk sizes for different hierarchy levels
            chunk_overlap: Amount of overlap between chunks
            breakpoint_percentile: Percentile threshold for semantic splitting
            debug_mode: Whether to print debug information
        """
        try:
            # Load documents
            documents = SimpleDirectoryReader(documents_path).load_data()
            print(f"Loaded {len(documents)} documents")
            
            # Step 1: Create hierarchical nodes
            h_node_parser = HierarchicalNodeParser.from_defaults(
                chunk_sizes=chunk_sizes,
                chunk_overlap=chunk_overlap
            )
            hierarchical_nodes = h_node_parser.get_nodes_from_documents(documents)
            
            print(f"Created {len(hierarchical_nodes)} hierarchical nodes")
            
            # Debug hierarchical structure if requested
            if debug_mode:
                self._debug_hierarchical_nodes(hierarchical_nodes)
            
            # Step 2: Refine with semantic splitting
            s_node_parser = SemanticSplitterNodeParser.from_defaults(
                buffer_size=1,
                breakpoint_percentile_threshold=breakpoint_percentile,
                embed_model=Settings.embed_model,
                include_metadata=True,
                include_prev_next_rel=True
            )
            semantic_nodes = s_node_parser.get_nodes_from_documents(hierarchical_nodes)
            
            print(f"Further refined into {len(semantic_nodes)} semantic nodes")
            
            # Debug semantic impact if requested
            if debug_mode:
                self._debug_semantic_impact(hierarchical_nodes, semantic_nodes)
            
            # Store the nodes in batches
            self._store_nodes_in_batches(collection, semantic_nodes, "hierarchical-semantic")
            
        except Exception as e:
            print(f"Error adding documents with hierarchical-semantic chunking: {str(e)}")
            raise

    def _store_nodes_in_batches(self, collection, nodes, chunking_type="default", batch_size=100):
        """Helper method to store nodes in batches with appropriate metadata."""
        total_added = 0
        
        for i in range(0, len(nodes), batch_size):
            batch = nodes[i:i + batch_size]
            
            texts = [node.get_text() for node in batch]
            
            # Extract and enrich metadata
            metadatas = []
            for node in batch:
                # Start with basic metadata
                metadata = {
                    "document_id": node.id_,
                    "file_name": node.metadata.get("file_name", ""),
                    "chunking_type": chunking_type
                }
                
                # Add hierarchical metadata if available
                if "level" in node.metadata:
                    metadata["hierarchy_level"] = node.metadata["level"]
                if "parent_id" in node.metadata:
                    metadata["parent_id"] = node.metadata["parent_id"]
                if "child_ids" in node.metadata:
                    metadata["child_count"] = len(node.metadata["child_ids"])
                    metadata["child_ids"] = str(node.metadata["child_ids"])  # Convert list to string for Chroma
                
                # Add semantic relationship metadata if available
                if "next_node_id" in node.metadata:
                    metadata["next_node_id"] = node.metadata["next_node_id"]
                if "prev_node_id" in node.metadata:
                    metadata["prev_node_id"] = node.metadata["prev_node_id"]
                
                metadatas.append(metadata)
            
            # Generate embeddings
            embeddings = [Settings.embed_model.get_text_embedding(text) for text in texts]
            
            # Create unique IDs
            ids = [f"{chunking_type}_{total_added + idx}" for idx in range(len(batch))]
            
            # Add to collection
            collection.add(
                ids=ids,
                documents=texts,
                embeddings=embeddings,
                metadatas=metadatas
            )
            
            total_added += len(batch)
        
        print(f"Stored {total_added} {chunking_type} document chunks in collection '{self.collection_name}'")
    
    def _debug_hierarchical_nodes(self, nodes):
        """Helper method to display debug info for hierarchical nodes."""
        # Create a hierarchy map
        hierarchy_map = {}
        for node in nodes:
            level = node.metadata.get("level", 0)
            if level not in hierarchy_map:
                hierarchy_map[level] = 0
            hierarchy_map[level] += 1
        
        # Display hierarchy statistics
        print("\n=== Hierarchical Structure ===")
        for level in sorted(hierarchy_map.keys()):
            print(f"Level {level}: {hierarchy_map[level]} nodes")
        
        # Sample nodes from each level for inspection
        print("\n=== Sample Nodes by Level ===")
        for level in sorted(hierarchy_map.keys()):
            sample_nodes = [n for n in nodes if n.metadata.get("level", 0) == level]
            if sample_nodes:
                sample = sample_nodes[0]
                print(f"\nLEVEL {level} SAMPLE (token length: {len(sample.get_text().split())})")
                # Print truncated text
                sample_text = sample.get_text()
                print(f"{sample_text[:100]}..." if len(sample_text) > 100 else sample_text)
                
                # Show metadata
                print("\nMetadata:")
                for key, value in sample.metadata.items():
                    # Truncate lengthy metadata values
                    if isinstance(value, list) and len(value) > 3:
                        print(f"  {key}: {value[:3]}... ({len(value)} items)")
                    else:
                        print(f"  {key}: {value}")
    
    def _debug_semantic_impact(self, before_nodes, after_nodes):
        """Helper method to display debug info comparing before/after semantic splitting."""
        # Analyze text length changes
        before_lengths = [len(n.get_text()) for n in before_nodes]
        after_lengths = [len(n.get_text()) for n in after_nodes]
        
        print("\n=== Semantic Splitting Impact ===")
        print(f"Before: {len(before_nodes)} nodes")
        print(f"After:  {len(after_nodes)} nodes")
        print(f"Change: {len(after_nodes) - len(before_nodes)} additional nodes created")
        
        print(f"\nAvg length before: {sum(before_lengths)/len(before_lengths):.2f} chars")
        print(f"Avg length after:  {sum(after_lengths)/len(after_lengths):.2f} chars")
        print(f"Max length before: {max(before_lengths)}")
        print(f"Max length after:  {max(after_lengths)}")

    # def add_documents_hierarchical_semantic(self, collection: chromadb.Collection, 
    #                                     documents_path: str,
    #                                     chunk_sizes: list = [2048, 512, 128],
    #                                     chunk_overlap: int = 50,
    #                                     debug_mode: bool = True,
    #                                     semantic_enabled: bool = True) -> None:
    #     """Add documents to collection using hierarchical and optionally semantic chunking.
        
    #     Args:
    #         collection: ChromaDB collection to add documents to
    #         documents_path: Path to documents to add
    #         chunk_sizes: List of chunk sizes for different hierarchy levels
    #         chunk_overlap: Amount of overlap between chunks
    #         debug_mode: Whether to print debug information about the hierarchy
    #         semantic_enabled: Whether to enable semantic splitting after hierarchical parsing
    #     """
    #     try:
    #         # Verify embedding model before processing
    #         if not isinstance(Settings.embed_model, HuggingFaceEmbedding):
    #             raise ValueError(f"Expected HuggingFaceEmbedding, but got {type(Settings.embed_model).__name__}")
            
    #         # Load documents
    #         documents = SimpleDirectoryReader(documents_path).load_data()
    #         print(f"Loaded {len(documents)} documents")
            
    #         # Step 1: Create hierarchical nodes
    #         h_node_parser = HierarchicalNodeParser.from_defaults(
    #             chunk_sizes=chunk_sizes,
    #             chunk_overlap=chunk_overlap
    #         )
    #         hierarchical_nodes = h_node_parser.get_nodes_from_documents(documents)
            
    #         print(f"Created {len(hierarchical_nodes)} hierarchical nodes")
            
    #         # Debug: Visualize the hierarchical structure
    #         if debug_mode:
    #             # Create a hierarchy map
    #             hierarchy_map = {}
    #             for node in hierarchical_nodes:
    #                 level = node.metadata.get("level", 0)
    #                 if level not in hierarchy_map:
    #                     hierarchy_map[level] = 0
    #                 hierarchy_map[level] += 1
                
    #             # Display hierarchy statistics
    #             print("\n=== Hierarchical Structure ===")
    #             for level in sorted(hierarchy_map.keys()):
    #                 print(f"Level {level}: {hierarchy_map[level]} nodes")
                
    #             # Sample nodes from each level for inspection
    #             print("\n=== Sample Nodes by Level ===")
    #             for level in sorted(hierarchy_map.keys()):
    #                 sample_nodes = [n for n in hierarchical_nodes if n.metadata.get("level", 0) == level]
    #                 if sample_nodes:
    #                     sample = sample_nodes[0]
    #                     print(f"\nLEVEL {level} SAMPLE (token length: {len(sample.get_text().split())})")
    #                     # Print truncated text
    #                     sample_text = sample.get_text()
    #                     print(f"{sample_text[:100]}..." if len(sample_text) > 100 else sample_text)
                        
    #                     # Show metadata
    #                     print("\nMetadata:")
    #                     for key, value in sample.metadata.items():
    #                         # Truncate lengthy metadata values
    #                         if isinstance(value, list) and len(value) > 3:
    #                             print(f"  {key}: {value[:3]}... ({len(value)} items)")
    #                         else:
    #                             print(f"  {key}: {value}")
            
    #         # Choose whether to apply semantic parsing
    #         if semantic_enabled:
    #             # Step 2: Refine with semantic splitting
    #             s_node_parser = SemanticSplitterNodeParser.from_defaults(
    #                 buffer_size=1,
    #                 breakpoint_percentile_threshold=95,
    #                 embed_model=Settings.embed_model,
    #                 include_metadata=True,
    #                 include_prev_next_rel=True
    #             )
    #             final_nodes = s_node_parser.get_nodes_from_documents(hierarchical_nodes)
    #             print(f"Further refined into {len(final_nodes)} semantic nodes")
                
    #             # Debug: Compare pre and post semantic splitting
    #             if debug_mode:
    #                 # Display semantic splitting statistics
    #                 print("\n=== Semantic Splitting Impact ===")
    #                 print(f"Before: {len(hierarchical_nodes)} hierarchical nodes")
    #                 print(f"After:  {len(final_nodes)} semantic nodes")
    #                 print(f"Change: {len(final_nodes) - len(hierarchical_nodes)} additional nodes created")
                    
    #                 # Analyze text length changes
    #                 h_lengths = [len(n.get_text()) for n in hierarchical_nodes]
    #                 s_lengths = [len(n.get_text()) for n in final_nodes]
                    
    #                 print(f"\nAvg length before: {sum(h_lengths)/len(h_lengths):.2f} chars")
    #                 print(f"Avg length after:  {sum(s_lengths)/len(s_lengths):.2f} chars")
    #                 print(f"Max length before: {max(h_lengths)}")
    #                 print(f"Max length after:  {max(s_lengths)}")
    #         else:
    #             # Use hierarchical nodes directly
    #             final_nodes = hierarchical_nodes
    #             print(f"Using {len(final_nodes)} hierarchical nodes without semantic refinement")
            
    #         # Prepare batches with enhanced metadata
    #         batch_size = 100
    #         total_added = 0
            
    #         for i in range(0, len(final_nodes), batch_size):
    #             batch = final_nodes[i:i + batch_size]
                
    #             texts = [node.get_text() for node in batch]
                
    #             # Extract hierarchical metadata
    #             metadatas = []
    #             for node in batch:
    #                 # Extract basic metadata
    #                 metadata = {
    #                     "document_id": node.id_,
    #                     "file_name": node.metadata.get("file_name", ""),
    #                     "hierarchy_level": node.metadata.get("level", 0),
    #                 }
                    
    #                 # Add relationship metadata if available
    #                 if "parent_id" in node.metadata:
    #                     metadata["parent_id"] = node.metadata["parent_id"]
    #                 if "child_ids" in node.metadata:
    #                     metadata["child_count"] = len(node.metadata["child_ids"])
                    
    #                 # Add semantic relationship info if available
    #                 if "next_node_id" in node.metadata:
    #                     metadata["next_node_id"] = node.metadata["next_node_id"]
    #                 if "prev_node_id" in node.metadata:
    #                     metadata["prev_node_id"] = node.metadata["prev_node_id"]
                    
    #                 metadatas.append(metadata)
                
    #             # Generate embeddings
    #             embeddings = [Settings.embed_model.get_text_embedding(text) for text in texts]
                
    #             # Create unique IDs
    #             ids = [f"node_{total_added + idx}" for idx in range(len(batch))]
                
    #             # Add to collection
    #             collection.add(
    #                 ids=ids,
    #                 documents=texts,
    #                 embeddings=embeddings,
    #                 metadatas=metadatas
    #             )
                
    #             total_added += len(batch)
            
    #         parsing_type = "hierarchical-semantic" if semantic_enabled else "hierarchical"
    #         print(f"Stored {total_added} {parsing_type} document chunks in collection '{self.collection_name}'")
            
    #     except Exception as e:
    #         print(f"Error adding documents: {str(e)}")
    #         raise


In [8]:
def testing(documents_path = DOCUMENT_PATH_PERSONAL, chunk_sizes = [2048, 512, 128], chunk_overlap = 50):
    documents = SimpleDirectoryReader(documents_path).load_data()
    print(f"Loaded {len(documents)} documents")

    # Step 1: Create hierarchical nodes
    h_node_parser = HierarchicalNodeParser.from_defaults(
        chunk_sizes=chunk_sizes,
        chunk_overlap=chunk_overlap
    )
    hierarchical_nodes = h_node_parser.get_nodes_from_documents(documents)

    return hierarchical_nodes

test_df = testing()

Loaded 99 documents


In [9]:
test_df[:10]

[TextNode(id_='578c9d3b-060e-41b5-a7b5-4baced5c31cf', embedding=None, metadata={'page_label': '1', 'file_name': '2024q4-alphabet-earnings-release.pdf', 'file_path': '/Users/danielmak/Documents/local_RAG/Document_Personal/2024q4-alphabet-earnings-release.pdf', 'file_type': 'application/pdf', 'file_size': 148726, 'creation_date': '2025-02-19', 'last_modified_date': '2025-02-12'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f2ed16ed-8b55-46de-8ddd-86b1d4c4bb1d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': '2024q4-alphabet-earnings-release.pdf', 'file_path': '/Users/danielmak/Documents/local_RAG/Document_Personal/2024q4-alphabet-earnings-release.pdf', 'file_type': 'applic

In [7]:
# Initialize manager
collection_manager = ChromaCollectionManager(COLLECTION_NAME)

# Get or create collection (with optional deletion of existing)
chroma_collection = collection_manager.get_or_create_collection(delete_existing=DELETE_FLAG)

# Add documents if needed
# if DELETE_FLAG or chroma_collection.count() == 0:
#     collection_manager.add_documents(
#         collection=chroma_collection,
#         documents_path=DOCUMENT_PATH_PERSONAL
#     )

if DELETE_FLAG or chroma_collection.count() == 0:
    collection_manager.add_documents_semantic(
        collection=chroma_collection,
        documents_path=DOCUMENT_PATH_PERSONAL,
        chunk_size=1024,
        chunk_overlap=50,
        debug_mode=True
    )

# Set up vector store
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

Deleted existing collection 'financial_reports_2'
Successfully connected to collection 'financial_reports_2'
Loaded 99 documents
Created 207 semantic nodes from 110 base nodes

=== Semantic Splitting Statistics ===
Before: 110 base nodes
After:  207 semantic nodes
Change: 97 additional nodes created

Avg length before: 2225.07 chars
Avg length after:  1182.41 chars
Max length before: 5237
Max length after:  5188

=== Sample Semantic Node ===
Text preview: Alphabet Announces Fourth Quarter and Fiscal Year 2024 Results
MOUNTAIN VIEW, Calif. – February 4, 2...

Metadata:
  page_label: 1
  file_name: 2024q4-alphabet-earnings-release.pdf
  file_path: /Users/danielmak/Documents/local_RAG/Document_Personal/2024q4-alphabet-earnings-release.pdf
  file_type: application/pdf
  file_size: 148726
  creation_date: 2025-02-19
  last_modified_date: 2025-02-12
Stored 207 semantic document chunks in collection 'financial_reports_2'


## Load From Chroma DB

In [8]:
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("financial_reports_2")
#chroma_collection = chroma_client.get_or_create_collection("datalab_demo")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context
)
# 1. Vector Retriever - uses vector store
vector_retriever = index.as_retriever(
    similarity_top_k=5
)

# 2. BM25 Retriever - uses docstore
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore,  # Uses the docstore from your index
    similarity_top_k=5
)

# Hybrid fusion using both retrievers
fusion_retriever = HybridFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=5
)


  avg_doc_len = np.array([len(doc_ids) for doc_ids in corpus_token_ids]).mean()
  ret = ret.dtype.type(ret / rcount)


ValueError: max() arg is an empty sequence

In [6]:

custom_prompt = PromptTemplate(
    """\
Rewrite the user's follow-up question as a standalone question.

1. Include all relevant past context.
2. Keep it natural and grammatically correct.
3. If already standalone, return it unchanged.

<Chat History>
{chat_history}

<User's Follow-Up Question>
{question}

<Rewritten Standalone Question>
"""
)


response_prompt = PromptTemplate(
    """\
You are an AI assistant providing structured responses.

### **Instructions:**
- Answer clearly and concisely.
- Summarize retrieved context to avoid duplication.
- Summarize the key facts efficiently.
- If the context lacks enough details, say: "I don’t have enough information."
- Format responses in natural sentences.

<Retrieved Context>
{context}

<User's Query>
{question}

### **AI Response:**
"""
)


In [7]:
# 1. First, make sure your index has documents
print(f"Number of documents in docstore: {len(index.docstore.docs)}")

Number of documents in docstore: 0


In [6]:
# 2. Create a query engine with the fusion retriever
fusion_query_engine = RetrieverQueryEngine.from_args(
    retriever=fusion_retriever,
    response_synthesizer=index.as_query_engine().response_synthesizer,
    response_mode="compact",
    response_prompt=response_prompt,
    max_tokens=MAX_TOKENS_GENERATE,
    streaming=False
)

# 3. Create a chat engine with the fusion query engine
fusion_chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine=fusion_query_engine,
    memory=memory,
    verbose=False,
)

  avg_doc_len = np.array([len(doc_ids) for doc_ids in corpus_token_ids]).mean()
  ret = ret.dtype.type(ret / rcount)


ValueError: max() arg is an empty sequence

In [9]:

query_engine = index.as_query_engine(
    response_mode="compact",
    response_prompt=response_prompt,
    similarity_top_k=TOP_K,
    max_tokens = MAX_TOKENS_GENERATE,
    streaming=False
)
memory = ChatMemoryBuffer.from_defaults(token_limit=MEMORY_LENGTH)

chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine=query_engine,
    memory=memory,
    #condense_question_prompt=custom_prompt,
    verbose=False,
)


# print("✅ Chat engine initialized successfully!")

In [10]:
question = "What insights can you share about the financial performance?"


import time
start_time = time.time()
response = chat_engine.chat(question)
end_time = time.time()

print(response)
print(f"\n⏳ Response Time: {end_time - start_time:.2f} seconds")


The context information suggests that the document is related to a company's earnings presentation for the fourth quarter of 2024. The presentation includes financial metrics such as revenue and financial facts sheet. The company uses non-GAAP financial measures like free cash flow, constant currency revenues, and percentage change in constant currency revenues to supplement their consolidated financial statements prepared in accordance with GAAP. The non-GAAP financial measures are used for financial and operational decision-making and to evaluate period-to-period comparisons. The company believes that these non-GAAP financial measures provide meaningful supplemental information and are useful to investors. However, it is important to note that these non-GAAP financial measures are not intended to be considered in isolation or as a substitute for GAAP financial information. The limitations of using non-GAAP financial measures are compensated by providing specific information regardin

In [9]:
question = "How is the cloud business doing?"


import time
start_time = time.time()
response = chat_engine.chat(question)
end_time = time.time()
print("\n🧠 AI Response:\n")
print(response)
print(f"\n⏳ Response Time: {end_time - start_time:.2f} seconds")


🧠 AI Response:


The financial performance of the company's cloud business is strong, as evidenced by a 30% increase in revenues to $12.0 billion in Q4 2024 compared to the previous year. This growth is attributed to growth in Google Cloud Platform (GCP) across core GCP products, AI Infrastructure, and Generative AI Solutions.

⏳ Response Time: 22.00 seconds


In [None]:
question = "Any developments for the upcoming year in these areas?"


import time
start_time = time.time()
response = chat_engine.chat(question)
end_time = time.time()
print("\n🧠 AI Response:\n")
print(response)
print(f"\n⏳ Response Time: {end_time - start_time:.2f} seconds")

In [None]:
import json
memory_out = memory.to_string()
formatted_json = json.loads(memory_out)

In [None]:
print(json.dumps(formatted_json, indent=4)) 

In [None]:
## 

## Retriever Evaluation

In [None]:
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex


In [None]:
# a basic RAG pipeline, uses service context defaults
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# generate prediction dataset
prediction_dataset = await rag_dataset.amake_predictions_with(
    query_engine=query_engine, show_progress=True
)