In [None]:
import os
from typing import Any, Dict, List, Optional, Tuple

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
class CCH:
    """
    A RAG pipeline with Contextual Chunk Headers (CCH) implementation using Ollama.

    Typical call order: ``setup_environment`` -> ``load_and_extract_pdf`` ->
    ``identify_section_headers`` -> ``chunk_with_contextual_headers`` ->
    ``create_vector_store`` -> ``run_query``.

    The header generated for each page is stored under the ``"header"`` key of
    every chunk's metadata and is shown to the LLM next to the chunk text when
    the answer is generated.
    """

    def __init__(
        self,
        model_name: str = "llama3.2:3b",
        embedding_model: Optional[str] = None,
    ):
        """
        Initialize the CCH RAG pipeline.

        Only stores configuration; the LLM and embeddings objects are created
        by ``setup_environment``.

        Args:
            model_name (str): Name of the Ollama model to use (default: "llama3.2:3b")
            embedding_model (Optional[str]): Name of the Ollama model used for
                embeddings. Defaults to ``model_name`` when omitted.
        """
        self.model_name = model_name
        self.embedding_model = embedding_model or model_name
        self.llm = None
        self.embeddings = None
        self.vectorstore = None
        self.documents = []

    @staticmethod
    def _clean_header(raw_header: Optional[str]) -> Optional[str]:
        """
        Normalize an LLM-produced header.

        Strips surrounding whitespace and quotes and maps empty replies and the
        'None' sentinel requested by the prompt (any case, optionally quoted or
        followed by a period) to ``None``.

        Args:
            raw_header (Optional[str]): Raw text returned by the header chain

        Returns:
            The cleaned header, or None when no usable header was returned
        """
        if raw_header is None:
            return None
        cleaned = raw_header.strip().strip("'\"").strip()
        if not cleaned or cleaned.rstrip(".").lower() == "none":
            return None
        return cleaned

    def setup_environment(self) -> None:
        """
        Set up the environment including LLM and embeddings.

        Raises:
            Exception: Re-raises whatever the Ollama wrappers raise.
        """
        try:
            # Initialize Ollama LLM
            self.llm = ChatOllama(model=self.model_name)

            # Initialize Ollama Embeddings
            self.embeddings = OllamaEmbeddings(model=self.embedding_model)

            print("Environment setup complete with model:", self.model_name)

        except Exception as e:
            print(f"Error setting up environment: {e}")
            raise

    def load_and_extract_pdf(self, file_path: str) -> List[Document]:
        """
        Load a PDF file with PyPDFLoader and keep the result on the instance.

        Args:
            file_path (str): Path to the PDF file

        Returns:
            List of Documents produced by the loader

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"PDF file not found at {file_path}")

            # Load PDF with PyPDFLoader
            loader = PyPDFLoader(file_path)
            self.documents = loader.load()

            print(f"Successfully loaded {len(self.documents)} pages from PDF")
            return self.documents

        except Exception as e:
            print(f"Error loading PDF: {e}")
            raise

    def identify_section_headers(self, documents: List[Document]) -> List[Document]:
        """
        Identify section headers in documents using LLM.
        Adds header information to document metadata.

        The input documents are not modified; new Document objects are returned.
        Documents whose text is blank get a ``None`` header without calling
        the LLM.

        Args:
            documents (List[Document]): List of documents to process

        Returns:
            List of documents with header metadata

        Raises:
            ValueError: If the LLM has not been initialized.
        """
        try:
            if self.llm is None:
                raise ValueError("LLM not initialized. Call setup_environment() first.")

            # Prompt to identify headers
            header_prompt = """Generate a concise and informative title for the given text.
            Return only the header text if found, otherwise return 'None':
            
            {page_content}"""

            prompt_template = ChatPromptTemplate.from_template(header_prompt)
            header_chain = prompt_template | self.llm | StrOutputParser()

            processed_docs = []
            for doc in documents:
                if doc.page_content.strip():
                    # Get header from LLM and normalize padded/quoted replies
                    raw_header = header_chain.invoke({"page_content": doc.page_content})
                    header = self._clean_header(raw_header)
                else:
                    # Nothing to title; avoid a pointless model call
                    header = None

                # Add header to a copy of the metadata
                metadata = {**doc.metadata, "header": header}
                processed_docs.append(
                    Document(page_content=doc.page_content, metadata=metadata)
                )

            print("Section headers identified for documents.")
            return processed_docs

        except Exception as e:
            print(f"Error identify section headers: {e}")
            raise

    def chunk_with_contextual_headers(
        self,
        documents: List[Document],
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> List[Document]:
        """
        Split documents into chunks while preserving contextual headers.

        Each document is first cut into coarse sections of up to
        ``chunk_size * 2`` characters, then every section is split again with
        SemanticChunker. Whitespace-only chunks are dropped.

        Args:
            documents (List[Document]): List of documents to chunk
            chunk_size (int): Size of each chunk in characters (default: 1000)
            chunk_overlap (int): Overlap between chunks (default: 200)

        Returns:
            List of chunked documents with header context

        Raises:
            ValueError: If the embeddings have not been initialized.
        """
        try:
            if self.embeddings is None:
                raise ValueError(
                    "Embeddings not initialized. Call setup_environment() first."
                )

            # First split by sections (headers)
            section_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size * 2,
                chunk_overlap=chunk_overlap,
                separators=["\n\n", "\n", "  ", " "],
            )

            # Then split into smaller chunks with semantic awareness
            semantic_splitter = SemanticChunker(self.embeddings)

            chunked_docs = []
            for doc in documents:
                # Get header from metadata
                header = doc.metadata.get("header", "")

                # Split by sections first, then into smaller semantic chunks
                for section in section_splitter.split_documents([doc]):
                    for chunk in semantic_splitter.split_documents([section]):
                        if not chunk.page_content.strip():
                            # Blank chunks carry no information worth indexing
                            continue

                        # Add header context to each chunk
                        chunked_docs.append(
                            Document(
                                page_content=chunk.page_content,
                                metadata={**chunk.metadata, "header": header},
                            )
                        )

            print(f"Created {len(chunked_docs)} chunks with contextual headers.")
            return chunked_docs

        except Exception as e:
            print(f"Error chunking documents: {e}")
            raise

    def create_vector_store(self, chunks: List[Document]) -> None:
        """
        Create a vector store from document chunks with embeddings.

        Args:
            chunks (List[Document]): List of document chunks to index

        Raises:
            ValueError: If the embeddings have not been initialized or
                ``chunks`` is empty.
        """
        try:
            if self.embeddings is None:
                raise ValueError(
                    "Embeddings not initialized. Call setup_environment() first."
                )
            if not chunks:
                raise ValueError("No chunks provided to build the vector store.")

            # Create FAISS vector store with Ollama embeddings
            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
            print("Vector store created with", len(chunks), "chunks.")

        except Exception as e:
            print(f"Error creating vector store: {e}")
            raise

    def semantic_search(self, query: str, k: int = 4) -> List[Dict[str, str]]:
        """
        Perform semantic search on the vector store and return results in a formatted way.

        Args:
            query (str): Search query
            k (int): Number of results to return

        Returns:
            List of dictionaries containing header and text for each chunk.
            Chunks without a usable header report "No header".

        Raises:
            ValueError: If the vector store has not been created yet.
        """
        try:
            if self.vectorstore is None:
                raise ValueError("Vector store not initialized.")

            # Perform similarity search
            results = self.vectorstore.similarity_search(query, k=k)

            # Format results; the "header" key may hold None or "", so fall
            # back on falsy values rather than only on a missing key.
            return [
                {
                    "header": doc.metadata.get("header") or "No header",
                    "text": doc.page_content,
                }
                for doc in results
            ]

        except Exception as e:
            print(f"Error performing semantic search: {e}")
            # Propagate like the other methods instead of returning None
            raise

    def generate_response(self, query: str, context: List[Dict[str, str]]) -> str:
        """
        Generate a response using the LLM with retrieved context.

        Args:
            query (str): User query
            context (List[Dict[str, str]]): Retrieved relevant documents in formatted dictionary

        Returns:
            Generated response

        Raises:
            ValueError: If the LLM has not been initialized.
        """
        try:
            if self.llm is None:
                raise ValueError("LLM not initialized. Call setup_environment() first.")

            # Prepare context string
            context_str = "\n\n---\n\n".join(
                f"Header: {chunk['header']}\nContent: {chunk['text']}"
                for chunk in context
            )

            # Create RAG prompt
            prompt = ChatPromptTemplate.from_template("""
            You are a helpful assistant that answers questions based on the provided context.
            The context includes section headers that provide important structure.
            
            Context:
            {context}
            
            Question: {question}
            
            Provide a detailed answer based on the context. If the answer isn't in the context, 
            say you don't know. Pay attention to section headers as they indicate important topics.
            """)

            # Create and run chain
            chain = prompt | self.llm | StrOutputParser()
            return chain.invoke({"question": query, "context": context_str})

        except Exception as e:
            print(f"Error generating response: {e}")
            raise

    def run_query(self, query: str, k: int = 4) -> str:
        """
        Complete RAG pipeline for a query: search + generation.
        Prints the query and retrieved chunks in the requested format.

        Args:
            query (str): User query
            k (int): Number of chunks to retrieve (default: 4)

        Returns:
            Generated response
        """
        try:
            # Step 1: Semantic search
            relevant_chunks = self.semantic_search(query, k=k)

            # Print query and chunks
            print("\nQuery:", query)
            for i, chunk in enumerate(relevant_chunks, start=1):
                print(f"\nHeader {i}: {chunk['header']}")
                print(f"Content:\n{chunk['text']}\n")

            # Step 2: Generate response
            return self.generate_response(query, relevant_chunks)

        except Exception as e:
            print(f"Error running query: {e}")
            raise

In [None]:
# Path of the PDF that the cells below load, chunk and index.
pdf_path = "./dataset/health supplements/1. dietary supplements - for whom.pdf"

In [None]:
# Build the pipeline object. The constructor only stores the model name;
# the LLM and embeddings are created by setup_environment().
rag = CCH(model_name="llama3.2:3b")

In [None]:
# Create the ChatOllama LLM and the OllamaEmbeddings instance.
rag.setup_environment()

In [None]:
# Load the PDF with PyPDFLoader; the result is also kept on rag.documents.
documents = rag.load_and_extract_pdf(pdf_path)

In [None]:
# Ask the LLM for a header per loaded document and store it in the metadata.
documents_with_headers = rag.identify_section_headers(documents)

In [None]:
# Split into chunks (defaults: chunk_size=1000, chunk_overlap=200); every
# chunk carries its source document's header in metadata["header"].
chunks = rag.chunk_with_contextual_headers(documents_with_headers)

In [None]:
# Embed the chunks and build the FAISS index stored on rag.vectorstore.
rag.create_vector_store(chunks)

In [None]:
# Example question used to exercise retrieval and generation.
query = "What is the main topic discussed in section 3?"

In [None]:
# Retrieve the most similar chunks, print them, then generate the answer.
response = rag.run_query(query)

In [None]:
# Show the final answer together with the question it responds to.
print(f"Response to '{query}':\n{response}")