In [None]:
import os
from typing import Dict, List, Optional, Tuple

import fitz
import numpy as np
from langchain_ollama import ChatOllama, OllamaEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class PDFTextExtractor:
    """
    A class to handle PDF text extraction using PyMuPDF.

    The extractor only stores the path at construction time; the file is
    opened (and closed again) on every call to ``extract_text``.
    """

    def __init__(self, file_path: str):
        """
        Initialize the PDFTextExtractor with a file path.

        Args:
            file_path (str): Path to the PDF file
        """
        self.file_path = file_path

    def extract_text(self) -> str:
        """
        Extract all text from the PDF file.

        The text of every page is collected in page order and the pages are
        joined with a single newline.

        Returns:
            str: Extracted text from the PDF

        Raises:
            FileNotFoundError: If ``file_path`` does not point to an existing
                file
            RuntimeError: For any other failure while opening or reading the
                PDF; the underlying exception is attached as ``__cause__``
        """
        # Check for the file explicitly so that a missing path is always
        # reported as the built-in FileNotFoundError documented above,
        # independent of which exception type the PDF library itself uses.
        if not os.path.isfile(self.file_path):
            raise FileNotFoundError(f"The file {self.file_path} was not found.")

        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(self.file_path) as doc:
                return "\n".join(page.get_text() for page in doc)
        except Exception as e:
            # Chain the original error so the real traceback is preserved.
            raise RuntimeError(
                f"An error occurred while extracting text: {str(e)}"
            ) from e

In [None]:
class TextChunker:
    """
    A class to handle text chunking with overlapping windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """
        Initialize the TextChunker with chunking parameters.

        Args:
            chunk_size (int): Size of each text chunk in characters
            overlap (int): Number of overlapping characters between chunks

        Raises:
            ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
                negative or not strictly smaller than ``chunk_size``
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # overlap >= chunk_size would make the window never advance;
        # a negative overlap would silently skip text between chunks.
        if overlap < 0 or overlap >= chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks.

        A chunk is started at every multiple of ``chunk_size - overlap`` that
        is smaller than ``len(text)``. The final chunk may therefore be
        shorter than ``chunk_size`` and may consist only of characters that
        already appear at the end of the previous chunk.

        Args:
            text (str): The text to be chunked

        Returns:
            List[str]: List of text chunks (empty for empty input)

        Raises:
            ValueError: If the attributes were changed after construction so
                that the window can no longer advance
        """
        step = self.chunk_size - self.overlap
        # Re-check here because the attributes are public and may have been
        # reassigned; a non-positive step would otherwise loop or drop text.
        if self.chunk_size <= 0 or step <= 0:
            raise ValueError(
                f"Invalid chunking parameters: chunk_size={self.chunk_size}, "
                f"overlap={self.overlap}"
            )

        return [
            text[start:start + self.chunk_size]
            for start in range(0, len(text), step)
        ]

In [None]:
class VectorStore:
    """
    A simple in-memory vector store for text embeddings.

    Embeddings and texts are kept in two parallel lists: the vector at
    index ``i`` of ``embeddings`` belongs to the text at index ``i`` of
    ``texts``.
    """

    def __init__(self):
        """
        Initialize the VectorStore with empty storage.
        """
        self.embeddings: List[List[float]] = []
        self.texts: List[str] = []

    def add_embeddings(self, embeddings: List[List[float]], texts: List[str]) -> None:
        """
        Add embeddings and their corresponding texts to the store.

        Args:
            embeddings (List[List[float]]): List of embedding vectors
            texts (List[str]): List of corresponding text chunks

        Raises:
            ValueError: If the two sequences differ in length; nothing is
                stored in that case
        """
        embeddings = list(embeddings)
        texts = list(texts)
        # The two lists are indexed in lockstep during search, so a length
        # mismatch would pair scores with the wrong texts.
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Number of embeddings ({len(embeddings)}) does not match "
                f"number of texts ({len(texts)})"
            )
        self.embeddings.extend(embeddings)
        self.texts.extend(texts)

    def semantic_search(
        self, query_embedding: List[float], top_k: int = 3
    ) -> List[Tuple[str, float]]:
        """
        Perform semantic search using cosine similarity.

        Args:
            query_embedding (List[float]): The embedding of the query
            top_k (int): Number of top results to return

        Returns:
            List[Tuple[str, float]]: List of tuples containing text and
                similarity score, ordered from most to least similar. Empty
                when the store is empty or ``top_k`` is not positive; shorter
                than ``top_k`` when fewer entries are stored.
        """
        # A non-positive top_k must yield no results; without this guard the
        # slice ``[-0:]`` below would select every stored entry.
        if top_k <= 0 or not self.embeddings:
            return []

        # Convert to numpy arrays for efficient computation
        query_array = np.array(query_embedding).reshape(1, -1)
        embeddings_array = np.array(self.embeddings)

        # One row of similarities: query against every stored embedding
        similarities = cosine_similarity(query_array, embeddings_array)[0]

        # argsort is ascending, so take the last top_k and reverse them
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        # Convert scores to plain floats to match the annotated return type
        return [(self.texts[i], float(similarities[i])) for i in top_indices]

In [None]:
class RAGSystem:
    """
    A simple RAG (Retrieval-Augmented Generation) system implementation.

    Documents are turned into overlapping text chunks, embedded, and kept in
    an in-memory ``VectorStore``. A question is answered by embedding it,
    looking up the most similar chunks and passing them to the chat model as
    context.
    """

    def __init__(
        self,
        model_name: str = "llama3.2:3b",
        *,
        embedding_model_name: Optional[str] = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """
        Initialize the RAG system with Ollama components.

        Args:
            model_name (str): Name of the Ollama model used for chat
            embedding_model_name (Optional[str]): Name of the Ollama model
                used for embeddings; when ``None`` the chat model name is
                used for embeddings as well
            chunk_size (int): Chunk size in characters passed to
                ``TextChunker``
            chunk_overlap (int): Overlap in characters passed to
                ``TextChunker``
        """
        if embedding_model_name is None:
            embedding_model_name = model_name

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = OllamaEmbeddings(model=embedding_model_name)
        self.llm = ChatOllama(model=model_name)
        self.vector_store = VectorStore()

    def load_and_process_document(self, file_path: str) -> None:
        """
        Load a PDF document, extract text, chunk it, and generate embeddings.

        The resulting chunks are appended to the vector store, so calling
        this method several times accumulates documents. If the PDF yields
        no text other than whitespace, a notice is printed and nothing is
        embedded or stored.

        Args:
            file_path (str): Path to the PDF file

        Raises:
            Exception: Any error from extraction, chunking, embedding or
                storing is printed and then re-raised unchanged
        """
        try:
            # Extract text from PDF
            text = PDFTextExtractor(file_path).extract_text()

            # Skip the embedding call when there is nothing meaningful to
            # index; a whitespace-only chunk would only pollute retrieval.
            if not text.strip():
                print(f"No extractable text found in {file_path}; nothing was indexed.")
                return

            # Chunk the text
            chunker = TextChunker(chunk_size=self.chunk_size, overlap=self.chunk_overlap)
            chunks = chunker.chunk_text(text)

            # Generate embeddings for each chunk
            embeddings = self.embedding_model.embed_documents(chunks)

            # Store embeddings and texts
            self.vector_store.add_embeddings(embeddings, chunks)

        except Exception as e:
            print(f"Error processing document: {str(e)}")
            raise

    def retrieve_relevant_chunks(
        self, query: str, top_k: int = 3
    ) -> List[Tuple[str, float]]:
        """
        Retrieve relevant text chunks for a given query.

        This is best-effort: any error while embedding the query or
        searching is printed and an empty list is returned instead.

        Args:
            query (str): The user query
            top_k (int): Number of top results to return

        Returns:
            List[Tuple[str, float]]: List of relevant chunks with similarity scores
        """
        try:
            # Generate embedding for the query
            query_embedding = self.embedding_model.embed_query(query)

            # Perform semantic search
            return self.vector_store.semantic_search(query_embedding, top_k)
        except Exception as e:
            print(f"Error retrieving chunks: {str(e)}")
            return []

    def generate_response(self, query: str, context: List[str]) -> str:
        """
        Generate a response using the LLM with the provided context.

        This is best-effort: any error while building the prompt or calling
        the model is printed and a fixed fallback sentence is returned.

        Args:
            query (str): The user query
            context (List[str]): List of relevant context chunks

        Returns:
            str: The generated response
        """
        try:
            # Combine context chunks, separated by a blank line
            context_str = "\n\n".join(context)

            # Create the prompt
            prompt = f"""Use the following context to answer the question at the end.
            If you don't know the answer, just say you don't know, don't try to make up an answer.
            
            Context:
            {context_str}
            
            Question: {query}
            
            Answer:"""

            # Get response from LLM
            response = self.llm.invoke(prompt)
            return response.content
        except Exception as e:
            print(f"Error generating response: {str(e)}")
            return "I encountered an error while generating a response."

    def query(self, question: str, top_k: int = 3) -> str:
        """
        Complete RAG pipeline: retrieve relevant chunks and generate response.

        Args:
            question (str): The user question
            top_k (int): Number of chunks to retrieve

        Returns:
            str: The generated answer, or a fixed message when no chunks
                could be retrieved
        """
        # Retrieve relevant chunks
        relevant_chunks = self.retrieve_relevant_chunks(question, top_k)

        if not relevant_chunks:
            return "I couldn't find any relevant information to answer your question."

        # Extract just the text (without scores) for generation
        context_texts = [text for text, _score in relevant_chunks]

        # Generate response
        return self.generate_response(question, context_texts)

In [None]:
# Build the RAG pipeline; the given Ollama model name is used by RAGSystem
# for both the chat model and the embedding model.
rag = RAGSystem(model_name="llama3.2:3b")

In [None]:
# Sample PDF to index, given relative to the current working directory.
pdf_path = "./dataset/health supplements/1. dietary supplements - for whom.pdf"

In [None]:
# Extract, chunk, embed and store the PDF; errors are printed and re-raised.
rag.load_and_process_document(pdf_path)

In [None]:
# Example question to ask about the loaded document.
query = "What is the main topic of this document?"

In [None]:
# Run retrieval followed by generation (top_k defaults to 3) and show the answer.
response = rag.query(query)
print(f"Response to '{query}':\n{response}")