In [None]:
import os
from typing import Dict, List, Optional, Tuple

import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
class SemanticChunking:
    """
    A small retrieval-augmented-generation pipeline built on semantic chunking.

    Typical call order: ``setup_environment`` -> ``extract_text_from_pdf`` ->
    ``create_semantic_chunks`` -> ``create_embeddings_for_chunks`` ->
    ``perform_semantic_search`` -> ``generate_response``.

    Attributes are populated lazily: ``embeddings`` and ``llm`` are set by
    ``setup_environment`` and ``vectorstore`` by
    ``create_embeddings_for_chunks``; all three are ``None`` until then.
    """

    def __init__(
        self,
        model_name: str = "llama3.2:3b",
        *,
        embedding_model_name: Optional[str] = None,
        temperature: float = 0.7,
    ):
        """
        Initialize the SemanticChunking.

        Args:
            model_name: Name of the Ollama model to use for chat
            embedding_model_name: Name of the Ollama model to use for
                embeddings; falls back to ``model_name`` when omitted
            temperature: Sampling temperature passed to the chat model
        """
        self.model_name = model_name
        # Falling back to the chat model keeps the original single-model setup.
        self.embedding_model_name = embedding_model_name or model_name
        self.temperature = temperature
        self.embeddings = None
        self.vectorstore = None
        self.llm = None

    def setup_environment(self) -> None:
        """
        Set up the environment and initialize necessary components.

        Creates the embedding client and the chat client and stores them on
        the instance.
        """
        # Initialize the embedding model
        self.embeddings = OllamaEmbeddings(model=self.embedding_model_name)

        # Initialize the LLM for chat
        self.llm = ChatOllama(model=self.model_name, temperature=self.temperature)

        print("Environment setup complete with model:", self.model_name)

    def extract_text_from_pdf(self, file_path: str) -> List[Document]:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file

        Returns:
            List of Document objects containing the extracted text

        Raises:
            FileNotFoundError: If the PDF file doesn't exist
            Exception: For other extraction errors
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"PDF file not found at {file_path}")

            loader = PyPDFLoader(file_path)
            documents = loader.load()
            print(f"Successfully extracted text from {file_path}")
            return documents
        except Exception as e:
            # Log for the notebook user, then let the caller see the real error.
            print(f"Error extracting text from PDF: {e}")
            raise

    def calculate_similarity_differences(
        self, embeddings: List[List[float]]
    ) -> List[float]:
        """
        Calculate similarity differences between consecutive sentence embeddings.

        Each score is ``1 - cosine_similarity`` of an embedding and the one
        that follows it, so the result has ``len(embeddings) - 1`` entries.

        Args:
            embeddings: List of sentence embeddings

        Returns:
            List of similarity difference scores; empty when fewer than two
            embeddings are supplied
        """
        if len(embeddings) < 2:
            return []

        vectors = np.asarray(embeddings, dtype=float)
        norms = np.linalg.norm(vectors, axis=1)

        # Row-wise dot product of every vector with its successor.
        dots = np.sum(vectors[:-1] * vectors[1:], axis=1)
        denominators = norms[:-1] * norms[1:]

        # A zero-length vector has no direction; treat its similarity as 0
        # (difference 1.0) instead of dividing by zero and producing NaN.
        cos_sims = np.divide(
            dots,
            denominators,
            out=np.zeros_like(dots),
            where=denominators > 0,
        )
        return (1.0 - cos_sims).tolist()

    def create_semantic_chunks(
        self, documents: List[Document], breakpoint_threshold: float = 0.2
    ) -> List[Document]:
        """
        Create semantic chunks from documents using SemanticChunker.

        The page contents of all documents are joined with newlines and the
        combined text is handed to SemanticChunker in one piece.

        Args:
            documents: List of Document objects to chunk
            breakpoint_threshold: Threshold for determining chunk boundaries;
                forwarded as SemanticChunker's ``breakpoint_threshold_amount``

        Returns:
            List of semantically chunked Document objects

        Raises:
            ValueError: If the embedding model is not initialized
        """
        if self.embeddings is None:
            raise ValueError(
                "Embeddings not initialized. Call setup_environment first."
            )

        try:
            # NOTE(review): no breakpoint_threshold_type is passed, so the
            # chunker's default type applies. If that default is
            # percentile-based, 0.2 is read as a percentile rather than a
            # cosine-distance cut-off -- confirm against the SemanticChunker
            # documentation before relying on this default.
            semantic_chunker = SemanticChunker(
                embeddings=self.embeddings,
                breakpoint_threshold_amount=breakpoint_threshold,
            )

            # The chunker does its own sentence splitting and embedding, so
            # the raw text is passed straight through without a pre-pass.
            full_text = "\n".join(d.page_content for d in documents)
            semantic_chunks = semantic_chunker.create_documents([full_text])

            print(f"Created {len(semantic_chunks)} semantic chunks")
            return semantic_chunks
        except Exception as e:
            print(f"Error in semantic chunking: {e}")
            raise

    def create_embeddings_for_chunks(self, chunks: List[Document]) -> FAISS:
        """
        Create embeddings for semantic chunks and store in a vector database.

        The resulting store is also kept on ``self.vectorstore`` for later
        searches.

        Args:
            chunks: List of Document objects to embed

        Returns:
            FAISS vector store containing the chunk embeddings

        Raises:
            ValueError: If the embedding model is not initialized or
                ``chunks`` is empty
        """
        if self.embeddings is None:
            raise ValueError(
                "Embeddings not initialized. Call setup_environment first."
            )
        if not chunks:
            # Fail early with a clear message rather than deep inside the
            # vector store construction.
            raise ValueError("Cannot create a vector store from an empty chunk list.")

        try:
            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
            print("Created embeddings for semantic chunks")
            return self.vectorstore
        except Exception as e:
            print(f"Error creating embeddings: {e}")
            raise

    def perform_semantic_search(self, query: str, k: int = 3) -> List[Document]:
        """
        Perform semantic search on the stored chunks.

        Args:
            query: Search query
            k: Number of chunks to retrieve

        Returns:
            List of relevant Document objects

        Raises:
            ValueError: If vectorstore is not initialized
        """
        if self.vectorstore is None:
            raise ValueError(
                "Vector store not initialized. Call create_embeddings_for_chunks first."
            )

        try:
            relevant_chunks = self.vectorstore.similarity_search(query, k=k)
            print(f"Retrieved {len(relevant_chunks)} relevant chunks")
            return relevant_chunks
        except Exception as e:
            print(f"Error in semantic search: {e}")
            raise

    def generate_response(self, query: str, relevant_chunks: List[Document]) -> str:
        """
        Generate a response based on retrieved chunks.

        Args:
            query: User query
            relevant_chunks: List of relevant Document objects

        Returns:
            Generated response as a string

        Raises:
            ValueError: If the chat model is not initialized
        """
        if self.llm is None:
            raise ValueError("LLM not initialized. Call setup_environment first.")

        try:
            # Combine the chunks into context
            context = "\n\n".join(chunk.page_content for chunk in relevant_chunks)

            # Create a prompt template
            prompt = ChatPromptTemplate.from_template(
                """Answer the following question based only on the provided context:
                
                Context:
                {context}
                
                Question: {question}
                
                Answer in a clear and concise manner. If you don't know the answer, 
                simply say you don't know."""
            )

            # Create the chain: prompt -> chat model -> plain string
            chain = prompt | self.llm | StrOutputParser()

            # Generate the response
            response = chain.invoke({"question": query, "context": context})
            return response
        except Exception as e:
            print(f"Error generating response: {e}")
            raise

In [None]:
# Path to the PDF that will be loaded and chunked below.
pdf_path = "./dataset/health supplements/1. dietary supplements - for whom.pdf"

In [None]:
# Create the pipeline object; the constructor only stores the model name.
rag = SemanticChunking(model_name="llama3.2:3b")

In [None]:
# Instantiate the embedding and chat clients on the pipeline object.
rag.setup_environment()

In [None]:
# Load the PDF into a list of Document objects.
documents = rag.extract_text_from_pdf(pdf_path)

In [None]:
# Chunk the loaded text with the default breakpoint_threshold.
semantic_chunks = rag.create_semantic_chunks(documents)

In [None]:
# Embed the chunks into a FAISS store, which is also kept on rag.vectorstore.
rag.create_embeddings_for_chunks(semantic_chunks)

In [None]:
# Question used for both retrieval and answer generation.
query = "What is the main topic of this document?"

In [None]:
# Retrieve the most similar chunks (default k=3).
relevant_chunks = rag.perform_semantic_search(query)

In [None]:
# Ask the chat model to answer using only the retrieved chunks as context.
response = rag.generate_response(query, relevant_chunks)

In [None]:
# Show the question together with the generated answer.
print(f"Response to '{query}':\n{response}")