In [None]:
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import fitz
import matplotlib.pyplot as plt
import numpy as np
from langchain_ollama import ChatOllama, OllamaEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class DocumentProcessor:
    """
    Handles PDF text extraction and chunking operations.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize the DocumentProcessor with chunking parameters.

        Args:
            chunk_size (int): Size of each text chunk in characters (default: 1000)
            chunk_overlap (int): Overlap between chunks in characters (default: 200).
                Must be smaller than ``chunk_size`` for multi-chunk texts,
                otherwise ``chunk_text`` raises ``ValueError``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a PDF file using PyMuPDF (fitz).

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            Extracted text of all pages concatenated into a single string

        Raises:
            FileNotFoundError: If the PDF file doesn't exist
            Exception: For other extraction errors (the original error is
                attached as ``__cause__``)
        """
        # Check existence up front so the documented FileNotFoundError is
        # raised regardless of which exception type the PDF backend uses for
        # a missing file (NOTE(review): PyMuPDF appears to use its own type).
        if not Path(pdf_path).exists():
            raise FileNotFoundError(f"The file {pdf_path} was not found.")

        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc)
        except FileNotFoundError as e:
            # File vanished between the check above and the open call.
            raise FileNotFoundError(f"The file {pdf_path} was not found.") from e
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

    def chunk_text(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks.

        Each chunk holds at most ``chunk_size`` characters and starts
        ``chunk_overlap`` characters before the end of the previous chunk.

        Args:
            text (str): The input text to chunk

        Returns:
            List of text chunks (empty list for empty text)

        Raises:
            ValueError: If the configured ``chunk_size``/``chunk_overlap``
                would prevent the chunk window from moving forward (e.g.
                ``chunk_overlap >= chunk_size``), which would otherwise
                loop forever.
        """
        chunks: List[str] = []
        text_length = len(text)
        start = 0
        while start < text_length:
            end = min(start + self.chunk_size, text_length)
            chunks.append(text[start:end])
            if end == text_length:
                break
            next_start = end - self.chunk_overlap
            # Without forward progress the loop would never reach the end of
            # the text; fail loudly instead of hanging.
            if next_start <= start:
                raise ValueError(
                    "chunk_overlap must be smaller than chunk_size and "
                    "chunk_size must be positive "
                    f"(chunk_size={self.chunk_size}, "
                    f"chunk_overlap={self.chunk_overlap})."
                )
            start = next_start
        return chunks

In [None]:
class VectorStore:
    """
    A simple in-memory vector store for storing and searching embeddings.
    """

    def __init__(self):
        """Initialize the vector store with empty dictionaries."""
        self.embeddings: Dict[int, np.ndarray] = {}  # {chunk_id: embedding}
        self.metadata: Dict[int, Dict] = {}  # {chunk_id: metadata}
        self.next_id = 0  # ID handed out to the next added embedding

    def add_embedding(self, embedding: np.ndarray, metadata: Dict) -> int:
        """
        Add an embedding to the vector store.

        Args:
            embedding (np.ndarray): The embedding vector to add
            metadata (Dict): Associated metadata for the embedding

        Returns:
            The assigned ID for the embedding (IDs are sequential from 0)
        """
        chunk_id = self.next_id
        self.embeddings[chunk_id] = embedding
        self.metadata[chunk_id] = metadata
        self.next_id += 1
        return chunk_id

    def find_similar(
        self, query_embedding: np.ndarray, top_k: int = 5
    ) -> List[Tuple[int, float]]:
        """
        Find the most similar embeddings to the query.

        Args:
            query_embedding (np.ndarray): The embedding to compare against
            top_k (int): Number of similar items to return (default: 5)

        Returns:
            List of tuples (chunk_id, similarity_score), ordered from most to
            least similar. Empty when the store is empty or ``top_k <= 0``.
        """
        # Guard top_k explicitly: a slice of [-0:] would return everything.
        if not self.embeddings or top_k <= 0:
            return []

        # Dict iteration order keeps ids and matrix rows aligned.
        ids = list(self.embeddings)
        embeddings_matrix = np.array([self.embeddings[chunk_id] for chunk_id in ids])

        # Calculate cosine similarities between the query and every row
        similarities = cosine_similarity(
            np.asarray(query_embedding).reshape(1, -1), embeddings_matrix
        )[0]

        # argsort is ascending: take the last top_k and reverse them
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(ids[i], float(similarities[i])) for i in top_indices]

    def get_text(self, chunk_id: int) -> str:
        """
        Get the text associated with a chunk ID.

        Args:
            chunk_id (int): The ID of the chunk

        Returns:
            The value stored under the "text" key of the chunk's metadata

        Raises:
            KeyError: If the chunk ID doesn't exist (or its metadata has no
                "text" entry)
        """
        return self.metadata[chunk_id]["text"]

In [None]:
class RelevantSegmentExtract:
    """
    Implements the Relevant Segment Extraction (RSE) algorithm for RAG.
    """

    def __init__(self, ollama_model: str = "llama3.2:3b"):
        """
        Initialize the RSE core with Ollama models.

        The same model name is used for both the embedding model and the
        chat model.

        Args:
            ollama_model (str): The name of the Ollama model to use
        """
        self.embedding_model = OllamaEmbeddings(model=ollama_model)
        self.llm = ChatOllama(model=ollama_model)
        self.vector_store = VectorStore()

    def generate_embeddings(self, chunks: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of text chunks.

        A chunk whose embedding call fails is reported on stdout and replaced
        by a zero vector, so the result always has one entry per chunk.

        Args:
            chunks (List[str]): List of text chunks to embed

        Returns:
            List of embedding vectors, in the same order as ``chunks``
        """
        embedded: List[Optional[np.ndarray]] = []
        for chunk in chunks:
            try:
                embedded.append(np.array(self.embedding_model.embed_query(chunk)))
            except Exception as e:
                print(f"Error generating embedding for chunk: {str(e)}")
                embedded.append(None)  # placeholder, filled in below

        # Size the zero-vector fallback like the embeddings that succeeded so
        # the vectors can still be stacked into one matrix; 4096 is only used
        # when every single chunk failed.
        fallback_shape = next(
            (vec.shape for vec in embedded if vec is not None), (4096,)
        )
        return [np.zeros(fallback_shape) if vec is None else vec for vec in embedded]

    def compute_chunk_values(
        self,
        chunks: List[str],
        query: str,
        chunk_embeddings: Optional[List[np.ndarray]] = None,
    ) -> List[float]:
        """
        Compute relevance values for chunks based on a query.

        Args:
            chunks (List[str]): List of text chunks
            query (str): The search query
            chunk_embeddings (Optional[List[np.ndarray]]): Precomputed
                embeddings for ``chunks`` (same order). When omitted they are
                generated here.

        Returns:
            List of relevance scores (cosine similarity to the query) for
            each chunk; empty list when ``chunks`` is empty

        Raises:
            ValueError: If ``chunk_embeddings`` is given but its length does
                not match ``chunks``
        """
        # Nothing to score; also avoids handing an empty array to
        # cosine_similarity, which requires 2-D input.
        if not chunks:
            return []

        if chunk_embeddings is None:
            chunk_embeddings = self.generate_embeddings(chunks)
        elif len(chunk_embeddings) != len(chunks):
            raise ValueError(
                "chunk_embeddings must contain exactly one embedding per chunk."
            )

        query_embedding = np.array(self.embedding_model.embed_query(query))

        # Calculate cosine similarity between query and each chunk
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1), np.array(chunk_embeddings)
        )[0]

        return similarities.tolist()

    def find_best_segments(
        self,
        chunks: List[str],
        chunk_values: List[float],
        top_k: int = 3,
        min_value: float = 0.3,
    ) -> List[Tuple[int, str, float]]:
        """
        Identify the most relevant segments based on computed values.

        Args:
            chunks (List[str]): List of text chunks
            chunk_values (List[float]): Computed relevance values for chunks
            top_k (int): Number of segments to return (default: 3)
            min_value (float): Minimum relevance score to consider (default: 0.3)

        Returns:
            List of tuples (index, chunk, value) for best segments, highest
            value first; ties keep their original chunk order. Empty when
            ``top_k <= 0``.
        """
        # Pair chunks with their values and filter by minimum value
        scored_chunks = [
            (i, chunk, chunk_values[i])
            for i, chunk in enumerate(chunks)
            if chunk_values[i] >= min_value
        ]

        # Sort by value in descending order (stable sort)
        scored_chunks.sort(key=lambda item: item[2], reverse=True)

        # Clamp so a negative top_k yields nothing instead of slicing from
        # the tail of the list.
        return scored_chunks[: max(top_k, 0)]

    def visualize_chunk_relevance(
        self, chunk_values: List[float], title: str = "Chunk Relevance Distribution"
    ):
        """
        Visualize the distribution of chunk relevance scores.

        Draws a line plot of score against chunk index and shows it.

        Args:
            chunk_values (List[float]): List of relevance scores
            title (str): Title for the plot
        """
        plt.figure(figsize=(10, 5))
        plt.plot(chunk_values, marker="o", linestyle="-", color="b")
        plt.title(title)
        plt.xlabel("Chunk Index")
        plt.ylabel("Relevance Score")
        plt.grid(True)
        plt.show()

    def process_document(
        self, pdf_path: str, query: str
    ) -> List[Tuple[int, str, float]]:
        """
        Full document processing pipeline with RSE.

        Extracts and chunks the PDF text, scores every chunk against the
        query, plots the scores, selects the best segments and stores all
        chunks in the vector store. Any error is printed and results in an
        empty list rather than an exception.

        Args:
            pdf_path (str): Path to the PDF document
            query (str): The search query

        Returns:
            List of relevant segments (index, text, score)
        """
        try:
            # Step 1: Extract and chunk text
            processor = DocumentProcessor()
            text = processor.extract_text_from_pdf(pdf_path)
            chunks = processor.chunk_text(text)
            if not chunks:
                print("No text could be extracted from the document.")
                return []

            # Step 2: Embed the chunks once and reuse the vectors for both
            # scoring and storage (embedding calls are the expensive part).
            embeddings = self.generate_embeddings(chunks)
            chunk_values = self.compute_chunk_values(
                chunks, query, chunk_embeddings=embeddings
            )

            # Step 3: Visualize relevance distribution
            self.visualize_chunk_relevance(chunk_values)

            # Step 4: Find best segments
            best_segments = self.find_best_segments(chunks, chunk_values)

            # Step 5: Store chunks in vector store (for potential later use)
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                self.vector_store.add_embedding(
                    embedding, {"text": chunk, "index": i, "score": chunk_values[i]}
                )

            return best_segments

        except Exception as e:
            # Deliberate best-effort boundary: report and return no segments.
            print(f"Error processing document: {str(e)}")
            return []

    def generate_response_with_context(
        self, query: str, context_segments: List[str]
    ) -> str:
        """
        Generate a response using the LLM with RSE-provided context.

        Args:
            query (str): The user's query
            context_segments (List[str]): List of relevant context segments

        Returns:
            The generated response, or a fixed apology string if the LLM
            call fails
        """
        try:
            # Combine context segments, numbering them from 1
            context = "\n\n".join(
                f"Context {i + 1}:\n{seg}" for i, seg in enumerate(context_segments)
            )

            # Create prompt with context
            prompt = f"""Answer the following question based on the provided context.
            
            Context:
            {context}
            
            Question: {query}
            
            Answer:"""

            # Generate response
            response = self.llm.invoke(prompt)
            return response.content

        except Exception as e:
            print(f"Error generating response: {str(e)}")
            return "I couldn't generate a response due to an error."

In [None]:
# Build the RSE pipeline; the given Ollama model name is used for both the
# embedding model and the chat model.
rse = RelevantSegmentExtract(ollama_model="llama3.2:3b")

In [None]:
# Question used to score the document chunks and, later, to prompt the LLM.
query = "How to lose weight?"

In [None]:
# PDF to analyze, given as a path relative to the current working directory.
pdf_path = "./dataset/health supplements/1. dietary supplements - for whom.pdf"

In [None]:
# Run the full pipeline (extract, chunk, score, plot, select); the result is a
# list of (index, text, score) tuples, or an empty list if processing failed.
relevant_segments = rse.process_document(pdf_path, query)

In [None]:
# Report every selected segment: chunk index, two-decimal score, and the
# first 200 characters of its text followed by an ellipsis.
print("\nRelevant Segments Found:")
for idx, segment, score in relevant_segments:
    print(
        f"\nSegment {idx} (Score: {score:.2f}):",
        f"{segment[:200]}...",
        sep="\n",
    )

In [None]:
# Only query the LLM when at least one segment was selected.
if relevant_segments:
    # Keep just the text component of each (index, text, score) tuple.
    context_texts = [segment_text for _, segment_text, _ in relevant_segments]
    response = rse.generate_response_with_context(query, context_texts)
    print("\nGenerated Response:", response, sep="\n")