In [None]:
import os
import re
from typing import List, Dict, Tuple, Optional, Union

import fitz
import numpy as np
from IPython.display import display, Markdown
from langchain_ollama import ChatOllama, OllamaEmbeddings
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from rich.text import Text
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class TextExtractor:
    """
    Extract plain text from a PDF file using PyMuPDF (``fitz``).

    Attributes:
        file_path (str): Path to the PDF file
    """

    def __init__(self, file_path: str):
        """
        Store the path of the PDF to read. The file is not opened here.

        Args:
            file_path: Path to the PDF file to extract text from
        """
        self.file_path = file_path

    def extract_text(self) -> str:
        """
        Extract the text of every page and join the pages with newlines.

        Returns:
            Extracted text as a single string, one page after another

        Raises:
            FileNotFoundError: If the PDF file doesn't exist
            Exception: For other PDF reading errors (the original error is
                chained as ``__cause__``)
        """
        # Check for the file explicitly instead of relying on the library:
        # PyMuPDF appears to report a missing file with its own exception
        # type rather than the builtin FileNotFoundError, in which case the
        # handler below would never run and callers would only see the
        # generic Exception.
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"The file {self.file_path} was not found.")

        try:
            with fitz.open(self.file_path) as doc:
                return "\n".join(page.get_text() for page in doc)

        except FileNotFoundError:
            # Still possible if the file vanishes between the check and open.
            raise FileNotFoundError(f"The file {self.file_path} was not found.")

        except Exception as e:
            raise Exception(f"An error occurred while reading the PDF: {str(e)}") from e

In [None]:
class TextChunker:
    """
    Split text into fixed-size, overlapping windows of approximate tokens.

    Chunk boundaries are purely positional (a sliding window over the token
    list); no semantic analysis of the text is performed.

    Attributes:
        text (str): The text to be chunked
        chunk_size (int): Size of each chunk in tokens
        overlap (int): Overlap between chunks in tokens
    """

    def __init__(self, text: str, chunk_size: int = 256, overlap: int = 64):
        """
        Initialize the text chunker with text and chunking parameters.

        Args:
            text: The text to be chunked
            chunk_size: Desired size of each chunk in tokens (must be > 0)
            overlap: Desired overlap between chunks in tokens
                (must satisfy 0 <= overlap < chunk_size)

        Raises:
            ValueError: If the chunking parameters cannot produce a window
                that moves forward
        """
        self.text = text
        self.chunk_size = chunk_size
        self.overlap = overlap
        self._validate()

    def _validate(self) -> None:
        """Reject parameter combinations whose window step is not positive."""
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={self.overlap}, chunk_size={self.chunk_size}"
            )

    def tokenize_approximate(self, text: str) -> List[str]:
        """
        Approximate tokenization: runs of word characters become one token,
        every other non-whitespace character becomes its own token.

        Args:
            text: Text to tokenize

        Returns:
            List of approximate tokens
        """
        return re.findall(r'\w+|\S', text)

    def chunk_text(self) -> List[str]:
        """
        Split the text into overlapping chunks based on the specified parameters.

        Tokens inside a chunk are re-joined with single spaces, so the
        original whitespace and punctuation spacing are not preserved.

        Returns:
            List of text chunks (empty when the text has no tokens)

        Raises:
            ValueError: If chunk_size/overlap were changed to invalid values
                after construction
        """
        # Attributes are public and may have been reassigned; a step <= 0
        # would make the window stall or move backwards forever.
        self._validate()
        step = self.chunk_size - self.overlap

        tokens = self.tokenize_approximate(self.text)
        total = len(tokens)
        chunks = []

        for start in range(0, total, step):
            end = min(start + self.chunk_size, total)
            chunks.append(' '.join(tokens[start:end]))

            # The window already reached the last token; any further window
            # would only repeat a suffix of this chunk.
            if end == total:
                break

        return chunks

In [None]:
class EmbeddingGenerator:
    """
    Generate text embeddings through an Ollama embedding client.

    Attributes:
        model_name (str): Name of the Ollama model to use
        embedding_model (OllamaEmbeddings): Client used to compute embeddings
    """

    def __init__(self, model_name: str = 'llama3.2:3b'):
        """
        Initialize the embedding generator with the model name.

        Args:
            model_name (str): Name of the Ollama model to use for embeddings
        """
        # NOTE(review): the default tag looks like a chat model; a dedicated
        # embedding model may give better retrieval quality - confirm.
        self.model_name = model_name
        self.embedding_model = OllamaEmbeddings(model=model_name)

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of text strings.

        Args:
            texts (List[str]): List of text strings to embed

        Returns:
            List of embedding vectors, one per input text. An empty input
            yields an empty list without contacting the model.

        Raises:
            Exception: If the embedding backend fails (the original error is
                chained as ``__cause__``)
        """
        # Nothing to embed: skip the round trip to the backend.
        if not texts:
            return []

        try:
            return self.embedding_model.embed_documents(texts)

        except Exception as e:
            raise Exception(f"Failed to generate embeddings: {str(e)}") from e

In [None]:
class SemanticSearch:
    """
    Cosine-similarity search over pre-computed chunk embeddings.

    Attributes:
        chunks (List[str]): List of text chunks
        embeddings (np.ndarray): Embeddings for the chunks, one row per chunk
    """

    def __init__(self, chunks: List[str], embeddings: List[List[float]]):
        """
        Initialize the semantic search with chunks and their embeddings.

        Args:
            chunks (List[str]): List of text chunks
            embeddings (List[List[float]]): Corresponding embeddings for each chunk

        Raises:
            ValueError: If the number of embeddings differs from the number
                of chunks
        """
        self.chunks = chunks
        self.embeddings = np.array(embeddings)

        # Result indices are used to look chunks up, so both sequences must
        # line up one-to-one.
        if len(self.chunks) != len(self.embeddings):
            raise ValueError(
                f"Got {len(self.chunks)} chunks but {len(self.embeddings)} embeddings"
            )

    def query(self, query_text: str, query_embedding: List[float], top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Perform a semantic search query and return top matching chunks.

        Args:
            query_text (str): The query text (not used for scoring; ranking
                relies on ``query_embedding`` only)
            query_embedding (List[float]): The embedding of the query
            top_k (int): Number of top results to return

        Returns:
            List of tuples (chunk_text, similarity_score) sorted by descending
            score. Empty when ``top_k`` is not positive or nothing is indexed.

        Raises:
            Exception: If the similarity computation fails (the original
                error is chained as ``__cause__``)
        """
        # A slice of [-0:] would select every row, so a request for zero
        # results must be answered explicitly.
        if top_k <= 0 or len(self.chunks) == 0:
            return []

        try:
            # Calculate cosine similarities against every stored embedding
            query_vector = np.array(query_embedding).reshape(1, -1)
            similarities = cosine_similarity(query_vector, self.embeddings)[0]

            # argsort is ascending: take the last k entries, then reverse
            k = min(top_k, len(self.chunks))
            top_indices = similarities.argsort()[-k:][::-1]

            return [(self.chunks[i], similarities[i]) for i in top_indices]

        except Exception as e:
            raise Exception(f"Semantic search failed: {str(e)}") from e

In [None]:
class ResponseGenerator:
    """
    Produce an answer to a question with an Ollama chat model, using
    retrieved text chunks as context.

    Attributes:
        model_name (str): Name of the Ollama model to use
        llm (ChatOllama): Chat client used for generation
    """

    def __init__(self, model_name: str = 'llama3.2:3b'):
        """
        Create the chat client for the given model.

        Args:
            model_name (str): Name of the Ollama model to use for generation
        """
        self.model_name = model_name
        self.llm = ChatOllama(model=model_name)

    def generate_response(self, query: str, context_chunks: List[str]) -> str:
        """
        Ask the model to answer ``query`` from the supplied context chunks.

        Args:
            query (str): The user's query
            context_chunks (List[str]): List of relevant context chunks

        Returns:
            Generated response text

        Raises:
            Exception: If prompt construction or the model call fails
        """
        try:
            # Number the chunks from 1 and separate them with a blank line
            numbered_sections = []
            for position, chunk in enumerate(context_chunks, start=1):
                numbered_sections.append(f"Context {position}:\n{chunk}")
            context = "\n\n".join(numbered_sections)

            # The prompt is spelled out piece by piece so the leading spaces
            # that are part of the text sent to the model stay visible.
            prompt = (
                "You are a helpful assistant. Answer the user's question based on the provided context.\n"
                "\n"
                "            Context:\n"
                f"            {context}\n"
                "\n"
                f"            Question: {query}\n"
                "\n"
                "            Answer:"
            )

            reply = self.llm.invoke(prompt)
            return reply.content

        except Exception as e:
            raise Exception(f"Failed to generate response: {str(e)}")

In [None]:
class ResponseEvaluator:
    """
    A class to evaluate generated responses based on faithfulness and relevancy.

    Attributes:
        model_name (str): Name of the Ollama model to use for evaluation
        llm (ChatOllama): Chat client used as the judge
        max_retries (int): Maximum number of attempts to get a parsable evaluation
    """

    # A decimal number such as "1", "0.85" or ".5".
    _NUMBER = r'(\d+(?:\.\d+)?|\.\d+)'

    # Scores that are explicitly labelled, e.g. "Faithfulness: 0.8",
    # "relevancy_score = 0.9" or "Faithfulness (0-1): 0.8".
    _LABELLED_PATTERNS = {
        'faithfulness': re.compile(
            r'faithfulness(?:[\s_]*score)?(?:\s*\([^)]*\))?\s*[:=]\s*' + _NUMBER,
            re.IGNORECASE,
        ),
        'relevancy': re.compile(
            r'relevanc[ey](?:[\s_]*score)?(?:\s*\([^)]*\))?\s*[:=]\s*' + _NUMBER,
            re.IGNORECASE,
        ),
    }

    # Stand-alone numbers: not glued to a word or to another number part, so
    # tokens such as "llama3" or "v1.5" are ignored.
    _POSITIONAL_PATTERN = re.compile(r'(?<![\w.])' + _NUMBER + r'(?!\w)')

    def __init__(self, model_name: str = 'llama3.2:3b', max_retries: int = 3):
        """
        Initialize the response evaluator with the model name.

        Args:
            model_name (str): Name of the Ollama model to use for evaluation
            max_retries (int): Maximum attempts to get a valid evaluation
        """
        self.model_name = model_name
        self.llm = ChatOllama(model=model_name)
        self.max_retries = max_retries

    def parse_evaluation(self, eval_text: str) -> Optional[Dict[str, float]]:
        """
        Parse the evaluation text to extract scores.

        Labelled scores ("Faithfulness: 0.8 ... Relevancy: 0.9") are preferred
        so that list numbering such as "1." / "2." is not mistaken for a
        score. Otherwise the first two stand-alone numbers are used, in the
        order faithfulness, relevancy. Values are clamped to the range 0-1.

        Args:
            eval_text (str): Raw evaluation text from LLM

        Returns:
            Dictionary with scores if parsing succeeded, None otherwise
        """
        # The model client may hand back non-text content; treat it as
        # unparsable instead of failing inside the regex engine.
        if not isinstance(eval_text, str):
            return None

        labelled = [
            self._LABELLED_PATTERNS[key].search(eval_text)
            for key in ('faithfulness', 'relevancy')
        ]
        if all(labelled):
            raw_scores = [match.group(1) for match in labelled]
        else:
            raw_scores = self._POSITIONAL_PATTERN.findall(eval_text)

        if len(raw_scores) < 2:
            return None

        try:
            faithfulness, relevancy = (
                min(max(float(value), 0.0), 1.0) for value in raw_scores[:2]
            )
        except (ValueError, TypeError):
            return None

        return {'faithfulness': faithfulness, 'relevancy': relevancy}

    def evaluate_response(self, query: str, context_chunks: List[str], response: str) -> Dict[str, float]:
        """
        Evaluate a generated response on faithfulness and relevancy.

        The judge model is asked up to ``max_retries`` times. Failures are
        reported with ``print`` and never raised: when no attempt yields
        parsable scores, neutral defaults are returned.

        Args:
            query: The original user query
            context_chunks: List of context chunks used
            response: The generated response to evaluate

        Returns:
            Dictionary with 'faithfulness' and 'relevancy' scores (0-1);
            both are 0.5 when every attempt failed
        """
        # Combine context chunks
        context = "\n\n".join([f"Context {i+1}:\n{chunk}" for i, chunk in enumerate(context_chunks)])

        # Create more explicit evaluation prompt
        prompt = f"""Please evaluate the response based on the following criteria:
        1. Faithfulness (0-1): How accurately the response reflects the provided context. 
           - 1.0: Perfectly matches the context
           - 0.5: Somewhat matches but has minor inaccuracies
           - 0.0: Completely contradicts or invents information not in context
        2. Relevancy (0-1): How well the response addresses the user's question.
           - 1.0: Directly and completely answers the question
           - 0.5: Partially answers or somewhat relevant
           - 0.0: Completely irrelevant to the question
        
        Provide ONLY two numbers between 0 and 1 separated by a single space, representing:
        faithfulness_score relevancy_score
        
        Example: "0.8 0.9"
        
        Context:
        {context}
        
        Question: {query}
        
        Response: {response}
        
        Evaluation scores:"""

        for attempt in range(self.max_retries):
            try:
                eval_response = self.llm.invoke(prompt)
                parsed = self.parse_evaluation(eval_response.content)

                if parsed:
                    return parsed
                else:
                    print(f"Retry {attempt + 1}: Could not parse evaluation, trying again...")
            except Exception as e:
                print(f"Retry {attempt + 1}: Evaluation failed: {str(e)}")

        # If we get here, all retries failed
        print("Warning: Returning default scores after evaluation failure")
        return {'faithfulness': 0.5, 'relevancy': 0.5}

In [None]:
class RAGPipeline:
    """
    A RAG pipeline that indexes one PDF with several chunk sizes and answers
    a query once per chunk size so the configurations can be compared.

    Attributes:
        pdf_path (str): Path to the PDF file
        chunk_sizes (List[int]): List of chunk sizes to compare
        overlap_ratio (float): Ratio of overlap between chunks (0 <= ratio < 1)
        embedding_model (str): Name of the embedding model
        llm_model (str): Name of the LLM model
        chunk_data (dict): Per chunk size, the chunks, their embeddings and
            the search index built by process_document()
    """

    def __init__(self, pdf_path: str,
                 chunk_sizes: Optional[List[int]] = None,
                 overlap_ratio: float = 0.25,
                 embedding_model: str = 'llama3.2:3b',
                 llm_model: str = 'llama3.2:3b'):
        """
        Initialize the RAG pipeline.

        Args:
            pdf_path: Path to the PDF file
            chunk_sizes: List of chunk sizes to compare; defaults to
                [128, 256, 512] when omitted
            overlap_ratio: Ratio of overlap between chunks
            embedding_model: Name of the embedding model
            llm_model: Name of the LLM model

        Raises:
            ValueError: If overlap_ratio is outside the range [0, 1)
        """
        # An overlap as large as the chunk itself would keep the chunking
        # window from ever advancing.
        if not 0 <= overlap_ratio < 1:
            raise ValueError(f"overlap_ratio must be in [0, 1), got {overlap_ratio}")

        self.pdf_path = pdf_path
        # Copy so that neither a shared default nor the caller's list is
        # aliased by this instance.
        self.chunk_sizes = [128, 256, 512] if chunk_sizes is None else list(chunk_sizes)
        self.overlap_ratio = overlap_ratio
        self.embedding_model = embedding_model
        self.llm_model = llm_model

        # Initialize components
        self.text_extractor = TextExtractor(pdf_path)
        self.embedding_generator = EmbeddingGenerator(embedding_model)
        self.response_generator = ResponseGenerator(llm_model)
        self.response_evaluator = ResponseEvaluator(llm_model)

        # Storage for different chunking strategies
        self.chunk_data = {}

    def process_document(self):
        """
        Process the document by extracting text and creating chunks with different sizes.

        The index for all chunk sizes is built first and only then stored in
        ``chunk_data``, so a failure does not leave a half-built index and a
        re-run does not keep entries from an earlier configuration.

        Raises:
            Exception: If extraction, chunking or embedding fails (the
                original error is chained as ``__cause__``)
        """
        try:
            # Extract text from PDF
            text = self.text_extractor.extract_text()

            # Create chunks for each chunk size
            chunk_data = {}
            for size in self.chunk_sizes:
                overlap = int(size * self.overlap_ratio)
                chunks = TextChunker(text, chunk_size=size, overlap=overlap).chunk_text()
                embeddings = self.embedding_generator.generate_embeddings(chunks)

                chunk_data[size] = {
                    'chunks': chunks,
                    'embeddings': embeddings,
                    'search': SemanticSearch(chunks, embeddings)
                }

            self.chunk_data = chunk_data

        except Exception as e:
            raise Exception(f"Document processing failed: {str(e)}") from e

    def query(self, query_text: str, top_k: int = 3) -> Dict[int, Dict[str, Union[str, Dict[str, float], List[Tuple[str, float]]]]]:
        """
        Execute a query against all chunking strategies and return results.

        Args:
            query_text: The query text
            top_k: Number of chunks to retrieve

        Returns:
            Dictionary with results for each chunk size, containing:
            - retrieved_chunks: List of (chunk, similarity) tuples
            - response: Generated response
            - evaluation: Faithfulness and relevancy scores

        Raises:
            RuntimeError: If no document has been processed yet
            Exception: If retrieval, generation or evaluation fails (the
                original error is chained as ``__cause__``)
        """
        # Without an index the loop below would silently produce no results.
        if not self.chunk_data:
            raise RuntimeError("No processed document: call process_document() before query().")

        try:
            # Generate query embedding once; it is shared by every chunk size
            query_embedding = self.embedding_generator.generate_embeddings([query_text])[0]

            results = {}

            for size, data in self.chunk_data.items():
                # Perform semantic search
                retrieved_chunks = data['search'].query(query_text, query_embedding, top_k)
                chunk_texts = [chunk for chunk, _ in retrieved_chunks]

                # Generate response
                response = self.response_generator.generate_response(query_text, chunk_texts)

                # Evaluate response
                evaluation = self.response_evaluator.evaluate_response(query_text, chunk_texts, response)

                results[size] = {
                    'retrieved_chunks': retrieved_chunks,
                    'response': response,
                    'evaluation': evaluation
                }

            return results

        except Exception as e:
            raise Exception(f"Query failed: {str(e)}") from e

In [None]:
# Location of the PDF to index, given relative to the current working directory.
pdf_path = "./dataset/health supplements/1. dietary supplements - for whom.pdf"

In [None]:
# Rich console used to print the formatted results.
console = Console()

In [None]:
# Build the pipeline; the document will be indexed once per listed chunk size.
rag = RAGPipeline(pdf_path, chunk_sizes=[128, 256, 512])

In [None]:
# Extract the PDF text, then chunk and embed it for every configured chunk size.
rag.process_document()

In [None]:
# Question that is put to every chunk-size configuration.
query = "What are the main findings of this document?"

In [None]:
# Retrieve, answer and evaluate once per chunk size; the result dict is keyed by chunk size.
results = rag.query(query)

In [None]:
def _score_style(value: float) -> str:
    """Map a 0-1 score to a traffic-light colour name used for display."""
    if value > 0.7:
        return "green"
    if value > 0.4:
        return "yellow"
    return "red"


# Render one report per chunk size: scores, the generated answer and a table
# with a short preview of every retrieved chunk.
for size, result in results.items():
    title = Text(f"Results for Chunk Size: {size}", style="bold blue")

    faithfulness = result['evaluation']['faithfulness']
    relevancy = result['evaluation']['relevancy']

    eval_text = Text()
    eval_text.append("Evaluation - ", style="bold")
    eval_text.append(f"Faithfulness: {faithfulness:.2f}", style=_score_style(faithfulness))
    eval_text.append(", ", style="bold")
    eval_text.append(f"Relevancy: {relevancy:.2f}", style=_score_style(relevancy))

    # The answer is natural-language text (often with Markdown), not Python
    # source, so it is highlighted as Markdown. Long paragraphs are wrapped
    # instead of being cut at the console width.
    response = Syntax(
        result['response'],
        "markdown",
        theme="monokai",
        line_numbers=False,
        word_wrap=True,
    )

    chunks_table = Table(title="Top Retrieved Chunks", show_header=True, header_style="bold magenta")
    chunks_table.add_column("Chunk #", style="cyan", no_wrap=True)
    chunks_table.add_column("Score", style="green")
    chunks_table.add_column("Content Preview")

    for i, (chunk, score) in enumerate(result['retrieved_chunks'], start=1):
        # Only mark the preview as truncated when something was cut off.
        preview = chunk if len(chunk) <= 100 else chunk[:100] + "..."
        chunks_table.add_row(
            str(i),
            Text(f"{score:.2f}", style=_score_style(score)),
            # Text objects bypass console markup parsing, so square brackets
            # in the document text are shown literally.
            Text(preview),
        )

    console.print(Panel.fit(title))
    console.print(eval_text)
    console.print("\n[bold]Response:[/]")
    console.print(response)
    console.print(chunks_table)
    console.print("\n" + "="*80 + "\n")