In [None]:
import os
from typing import List, Tuple

import fitz
import numpy as np
from langchain_ollama import ChatOllama, OllamaEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all text from a given PDF file.

    The text of each page is obtained with ``page.get_text()`` and the pages
    are joined with a single newline.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text.

    Raises:
        RuntimeError: If the document cannot be opened or read. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        # Open the document as a context manager so the file handle is
        # released even when reading a page fails part-way through.
        with fitz.open(pdf_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        raise RuntimeError(f"Failed to extract text from PDF: {e}") from e

In [None]:
# Relative path of the PDF that the rest of the notebook reads and indexes.
pdf_path = "./dataset/health supplements/1. dietary supplements - for whom.pdf"

In [None]:
# Load the complete text of the PDF at `pdf_path` as a single string.
text = extract_text_from_pdf(pdf_path)

In [None]:
def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
    """
    Splits the given text into overlapping chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text (str): The complete text to split.
        chunk_size (int): Number of characters per chunk. Must be positive.
        overlap (int): Number of overlapping characters between chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List[str]: A list of text chunks (empty when ``text`` is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero or less would never advance through the text (an
    # endless loop in a manual while-loop), and a negative overlap would
    # silently skip characters between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [None]:
# Chunking parameters, in characters: chunk length and the overlap shared
# by consecutive chunks.
chunk_size, chunk_overlap = 300, 50

In [None]:
# Split the extracted text into overlapping character chunks.
chunks = chunk_text(text, chunk_size, chunk_overlap)

In [None]:
def generate_embeddings(chunks: List[str]) -> np.ndarray:
    """
    Generates embeddings for a list of text chunks.

    The chunks are passed to the module-level ``embedding_client`` via its
    ``embed_documents`` method, and the result is converted to a NumPy array.

    Args:
        chunks (List[str]): List of text chunks.

    Returns:
        np.ndarray: Matrix of embeddings.

    Raises:
        RuntimeError: If the embedding call fails. The underlying exception
            is attached as ``__cause__``.
    """
    try:
        vectors = embedding_client.embed_documents(chunks)
        return np.array(vectors)
    except Exception as e:
        # Chain explicitly so the original failure stays visible in the
        # traceback as the direct cause.
        raise RuntimeError(f"Embedding generation failed: {e}") from e

In [None]:
# Embedding client read as a global by generate_embeddings and
# retrieve_relevant_chunks; it must be defined before either is called.
embedding_client = OllamaEmbeddings(model="llama3.2:3b")

In [None]:
# Embed all chunks; the resulting array is passed to retrieve_relevant_chunks.
embeddings = generate_embeddings(chunks)

In [None]:
def retrieve_relevant_chunks(
    query: str,
    chunks: List[str],
    embeddings: np.ndarray,
    top_k: int = 3,
    context_window: int = 1,
) -> List[str]:
    """
    Performs semantic search and returns top-k relevant chunks with contextual neighbors.

    The query is embedded with the module-level ``embedding_client``, compared
    against ``embeddings`` by cosine similarity, and the ``top_k`` best-scoring
    chunk indices are expanded by ``context_window`` neighbors on each side.
    Overlapping windows are merged, and the chunks are returned in document
    order (not in order of similarity).

    Args:
        query (str): User query.
        chunks (List[str]): Original text chunks.
        embeddings (np.ndarray): Precomputed embeddings.
        top_k (int): Number of top matches to retrieve. Values of zero or
            less yield an empty result.
        context_window (int): Number of neighbor chunks to include before and after.

    Returns:
        List[str]: Retrieved chunks with added context. Empty when ``top_k``
        is not positive or ``chunks`` is empty.

    Raises:
        RuntimeError: If embedding the query or the similarity search fails.
            The underlying exception is attached as ``__cause__``.
    """
    # Guard before doing any work: with top_k == 0 the slice ``[-0:]`` would
    # select *every* index rather than none, and an empty corpus has nothing
    # to compare against.
    if top_k <= 0 or not chunks:
        return []

    try:
        query_embedding = np.array(embedding_client.embed_query(query)).reshape(1, -1)
        similarities = cosine_similarity(query_embedding, embeddings).flatten()
        # argsort is ascending, so the last top_k entries are the best matches.
        # Their relative order is irrelevant because they are merged into a set.
        top_indices = similarities.argsort()[-top_k:]

        selected = set()
        for idx in top_indices:
            # Clamp the neighbor window to the valid index range of `chunks`.
            first = max(0, idx - context_window)
            last = min(len(chunks), idx + context_window + 1)
            selected.update(range(first, last))

        return [chunks[i] for i in sorted(selected)]

    except Exception as e:
        raise RuntimeError(f"Context-aware search failed: {e}") from e

In [None]:
# Question used both for retrieval and as the question in the LLM prompt.
query = "What is the main idea of the document?"

In [None]:
# Retrieve with the function defaults (top_k=3, context_window=1).
relevant_chunks = retrieve_relevant_chunks(query, chunks, embeddings)

In [None]:
def format_context(chunks: List[str]) -> str:
    """
    Builds one context string out of the given chunks.

    Args:
        chunks (List[str]): Text chunks, in the order they should appear.

    Returns:
        str: The chunks joined together, separated by a blank line.
    """
    separator = "\n\n"
    context_text = separator.join(chunks)
    return context_text

In [None]:
# Join the retrieved chunks into the single context string used in the prompt.
context = format_context(relevant_chunks)

In [None]:
def generate_response(query: str, context: str) -> str:
    """
    Generates a response to a query using the retrieved context and the LLM.

    The prompt is sent to the module-level ``chat_client`` via ``invoke`` and
    the ``content`` attribute of the result is returned.

    Args:
        query (str): User's input question.
        context (str): Relevant contextual information.

    Returns:
        str: Generated answer.

    Raises:
        RuntimeError: If the model call fails. The underlying exception is
            attached as ``__cause__``.
    """
    # Assemble the prompt line by line rather than with an indented
    # triple-quoted string, so the source code's indentation and
    # whitespace-only lines do not end up in the text sent to the model.
    prompt = (
        "You are a helpful assistant. Use the following context to answer the question:\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "Answer:"
    )
    try:
        return chat_client.invoke(prompt).content
    except Exception as e:
        raise RuntimeError(f"Failed to generate response: {e}") from e

In [None]:
# Chat model client read as a global by generate_response; it must be
# defined before that function is called.
chat_client = ChatOllama(model="llama3.2:3b")

In [None]:
# Ask the model to answer `query` using the retrieved `context`.
response = generate_response(query, context)

In [None]:
# Show the query followed by the generated answer on the next line.
print(f"Response to '{query}':\n{response}")