In [None]:
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import fitz
import numpy as np
from langchain_ollama import ChatOllama, OllamaEmbeddings
from sentence_transformers import CrossEncoder

In [None]:
class TextExtractor:
    """
    Extract the text of a PDF file and split it into overlapping chunks.

    Attributes:
        file_path (str): Path to the PDF file
        chunk_size (int): Size of text chunks in characters
        chunk_overlap (int): Overlap between consecutive chunks in characters
    """

    def __init__(
        self, file_path: str, chunk_size: int = 1000, chunk_overlap: int = 200
    ):
        """
        Initialize the PDF text extractor.

        Args:
            file_path (str): Path to the PDF file.
            chunk_size (int): Size of text chunks in characters
            chunk_overlap (int): Overlap between consecutive chunks in characters

        Raises:
            ValueError: If chunk_size is not positive, or chunk_overlap is
                negative or not smaller than chunk_size
        """
        # Fail fast: an invalid combination would otherwise only surface as
        # an endless loop (or silently skipped text) inside chunk_text().
        self._chunk_stride(chunk_size, chunk_overlap)

        self.file_path = file_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    @staticmethod
    def _chunk_stride(chunk_size: int, chunk_overlap: int) -> int:
        """
        Validate the chunking parameters and return the step between chunk starts.

        Args:
            chunk_size (int): Size of text chunks in characters
            chunk_overlap (int): Overlap between consecutive chunks in characters

        Returns:
            Number of characters the window advances per chunk (always >= 1)

        Raises:
            ValueError: If the parameters cannot produce a forward-moving window
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )
        return chunk_size - chunk_overlap

    def extract_text(self) -> str:
        """
        Extract raw text from the PDF file.

        The text of every page is joined with a newline between pages.

        Returns:
            Extracted text as a single string

        Raises:
            FileNotFoundError: If the PDF file doesn't exist
            Exception: For other PDF reading errors (the original error is
                attached as the cause)
        """
        if not Path(self.file_path).exists():
            raise FileNotFoundError(f"PDF file not found at {self.file_path}")

        try:
            # The context manager closes the document even if a page fails
            # to render, so the file handle is never leaked.
            with fitz.open(self.file_path) as doc:
                return "\n".join(page.get_text() for page in doc)

        except Exception as e:
            # Chain the cause so the underlying traceback is preserved.
            raise Exception(f"Error reading PDF: {str(e)}") from e

    def chunk_text(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks of specified size.

        Consecutive chunks start (chunk_size - chunk_overlap) characters
        apart; the last chunk may be shorter than chunk_size.

        Args:
            text (str): Text to be chunked

        Returns:
            List of text chunks (empty list for empty text)

        Raises:
            ValueError: If the current chunk_size / chunk_overlap attributes
                cannot produce a forward-moving window
        """
        # Re-validate here because the attributes are public and may have
        # been changed after construction.
        stride = self._chunk_stride(self.chunk_size, self.chunk_overlap)

        chunks = []
        text_length = len(text)
        for start in range(0, text_length, stride):
            end = start + self.chunk_size
            chunks.append(text[start:end])
            # Stop once a chunk reaches the end of the text; later windows
            # would only repeat a suffix of this chunk.
            if end >= text_length:
                break

        return chunks

    def process(self) -> List[str]:
        """
        Complete processing pipeline: extract and chunk text.

        Returns:
            List of processed text chunks
        """
        raw_text = self.extract_text()
        return self.chunk_text(raw_text)

In [None]:
class VectorStore:
    """
    A simple in-memory vector store for document chunks and their embeddings.

    Chunks and embeddings are kept in two parallel lists: the embedding at
    index i belongs to the chunk at index i.

    Attributes:
        embeddings_model (OllamaEmbeddings): Model for generating embeddings
        chunks (List[str]): List of document chunks
        embeddings (List[List[float]]): Corresponding embeddings for each chunk
    """

    def __init__(self, embeddings_model: OllamaEmbeddings):
        """
        Initialize the vector store.

        Args:
            embeddings_model (OllamaEmbeddings): Initialized Ollama embeddings model
        """
        self.embeddings_model = embeddings_model
        self.chunks = []
        self.embeddings = []

    def add_documents(self, chunks: List[str]) -> None:
        """
        Add documents to the store and generate their embeddings.

        The store is only modified after the embeddings were generated
        successfully, so a failing embedding call cannot leave `chunks` and
        `embeddings` with different lengths.

        Args:
            chunks (List[str]): List of text chunks to add

        Raises:
            ValueError: If the model returns a different number of embeddings
                than chunks were passed in
        """
        chunks = list(chunks)
        if not chunks:
            # Nothing to embed; avoid a pointless call to the model.
            return

        # Generate embeddings for new chunks
        new_embeddings = self.embeddings_model.embed_documents(chunks)
        if len(new_embeddings) != len(chunks):
            raise ValueError(
                f"Expected {len(chunks)} embeddings, got {len(new_embeddings)}"
            )

        self.chunks.extend(chunks)
        self.embeddings.extend(new_embeddings)

    def similarity_search(
        self, query_embedding: List[float], top_k: int = 5
    ) -> List[Tuple[str, float]]:
        """
        Find most similar documents to the query using cosine similarity.

        Args:
            query_embedding (List[float]): Embedding of the query
            top_k (int): Number of top results to return

        Returns:
            List of tuples (chunk, similarity_score), best match first. Empty
            if the store is empty or top_k is not positive. Vectors with zero
            norm get a similarity of 0.0.
        """
        # A slice of [-0:] would select every element, so handle
        # non-positive top_k explicitly.
        if not self.embeddings or top_k <= 0:
            return []

        # Convert to numpy arrays for efficient computation
        query_arr = np.asarray(query_embedding, dtype=float)
        embeddings_arr = np.asarray(self.embeddings, dtype=float)

        # Calculate cosine similarities
        norms = np.linalg.norm(embeddings_arr, axis=1) * np.linalg.norm(query_arr)
        dot_products = np.dot(embeddings_arr, query_arr)
        # Divide only where the denominator is non-zero; a zero-norm vector
        # would otherwise yield NaN, which argsort places at the top.
        similarities = np.divide(
            dot_products,
            norms,
            out=np.zeros_like(dot_products),
            where=norms > 0,
        )

        # Get top_k results
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(self.chunks[i], float(similarities[i])) for i in top_indices]

In [None]:
class Reranker:
    """
    Second-stage ranker that scores (query, document) pairs with a cross-encoder.
    """

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        """
        Load the cross-encoder used for scoring.

        Args:
            model_name (str): Name of the sentence-transformers cross-encoder model
        """
        self.model = CrossEncoder(model_name)

    def rerank(self, query: str, documents: List[str]) -> List[Tuple[str, float]]:
        """
        Order documents by the cross-encoder's relevance score for the query.

        Args:
            query (str): The search query
            documents (List[str]): List of document chunks to rerank

        Returns:
            List of tuples (document, relevance_score), highest score first;
            an empty list when no documents are given
        """
        ranked = []
        if documents:
            # One [query, document] pair per candidate, scored in a single call
            relevance = self.model.predict([[query, passage] for passage in documents])

            # Attach each score to its document, then order best-first
            ranked = list(zip(documents, relevance))
            ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked

In [None]:
class RAG:
    """
    End-to-end retrieval-augmented generation pipeline over a single PDF:
    vector retrieval, cross-encoder reranking, and LLM answer generation.
    """

    def __init__(
        self,
        pdf_path: str,
        embeddings_model_name: str = "llama3.2:3b",
        llm_model_name: str = "llama3.2:3b",
        cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
    ):
        """
        Build the models and index the PDF.

        Args:
            pdf_path (str): Path to the PDF document
            embeddings_model_name (str): Name of the Ollama embeddings model
            llm_model_name (str): Name of the Ollama LLM model
            cross_encoder_model (str): Name of the cross-encoder model for reranking
        """
        # Models: embeddings, chat LLM, and the reranking cross-encoder
        self.embeddings_model = OllamaEmbeddings(model=embeddings_model_name)
        self.llm = ChatOllama(model=llm_model_name)
        self.reranker = Reranker(cross_encoder_model)

        # The store embeds chunks with the same embeddings model used for queries
        self.vector_store = VectorStore(self.embeddings_model)

        # Indexing happens eagerly, at construction time
        self._process_document(pdf_path)

    def _process_document(self, pdf_path: str) -> None:
        """
        Extract and chunk the PDF, then add the chunks to the vector store.

        Args:
            pdf_path (str): Path to the PDF file
        """
        self.vector_store.add_documents(TextExtractor(pdf_path).process())

    def retrieve(self, query: str, top_k: int = 10, rerank_top_k: int = 5) -> List[str]:
        """
        Fetch the chunks most relevant to a query using two stages.

        Args:
            query (str): The search query
            top_k (int): Number of documents to retrieve initially
            rerank_top_k (int): Number of documents to return after reranking

        Returns:
            List of relevant document chunks
        """
        # Stage 1: nearest chunks by embedding similarity
        query_vector = self.embeddings_model.embed_query(query)
        candidates = []
        for chunk, _similarity in self.vector_store.similarity_search(
            query_vector, top_k=top_k
        ):
            candidates.append(chunk)

        # Stage 2: reorder the candidates with the cross-encoder
        ranked = self.reranker.rerank(query, candidates)

        # Keep only the text of the best rerank_top_k entries
        best = ranked[:rerank_top_k]
        return [chunk for chunk, _relevance in best]

    def generate_response(self, query: str, context: List[str]) -> str:
        """
        Ask the LLM to answer the query using the given context chunks.

        Args:
            query (str): The user's query
            context (List[str]): List of relevant document chunks

        Returns:
            Generated response
        """
        # Number the chunks from 1 and separate them with blank lines
        numbered = []
        for position, text in enumerate(context, start=1):
            numbered.append(f"Context {position}:\n{text}")
        context_str = "\n\n".join(numbered)

        # The prompt text (including its indentation whitespace) is spelled
        # out literally so the exact characters sent to the model are visible.
        prompt = (
            "Answer the following question based on the provided context.\n"
            "        \n"
            f"        Question: {query}\n"
            "        \n"
            "        Context:\n"
            f"        {context_str}\n"
            "        \n"
            "        Answer:"
        )

        # Send the prompt and return the text of the reply
        return self.llm.invoke(prompt).content

In [None]:
pdf_path = "./dataset/health supplements/1. dietary supplements - for whom.pdf"

In [None]:
rag = RAG(pdf_path)

In [None]:
query = "What is the main topic of this document?"

In [None]:
relevant_docs = rag.retrieve(query)

In [None]:
print(f"Retrieved {len(relevant_docs)} relevant chunks after reranking.")

In [None]:
response = rag.generate_response(query, relevant_docs)

In [None]:
print(f"=== Processing query: '{query}' ===")

response = rag.generate_response(query, relevant_docs)

print("\n=== Relevant Documents ===")
for i, doc in enumerate(relevant_docs[:3]):
    print(f"\nDocument {i + 1} (first 200 chars):")
    print(doc[:200] + "...")

print("\n=== Generated Response ===")
print(response)