In [None]:
import hashlib
import json
import re
from typing import List, TypedDict, Union, Optional

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.embeddings import Embeddings
from langchain_chroma import Chroma
from langchain_core.messages import AIMessage, HumanMessage
from langchain_ollama import OllamaLLM

# 1. Document preparation

In [2]:
def prepare_documents(
    docs_folder: str = "./pdf_docs",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> List[Document]:
    """
    Load every PDF found under a directory and split the text into overlapping chunks.

    Args:
        docs_folder (str, optional): Directory that contains the PDF files; it is searched
            with the glob pattern "**/*.pdf". Defaults to "./pdf_docs".
        chunk_size (int, optional): Maximum chunk length, measured with `len` (characters).
            Defaults to 1000.
        chunk_overlap (int, optional): Number of characters shared between consecutive
            chunks. Defaults to 200.

    Returns:
        List[Document]: The chunks produced by the splitter, one Document per chunk.
    """

    loader = PyPDFDirectoryLoader(
        path=docs_folder,
        glob="**/*.pdf",
        # Best-effort loading: the loader is asked not to abort on files it cannot read.
        silent_errors=True,
    )

    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ask the splitter to record where each chunk starts in its source text.
        add_start_index=True,
    )

    return text_splitter.split_documents(documents)

In [3]:
# Load the PDFs from the default folder ("./pdf_docs") and split them into chunks.
chunks = prepare_documents()

# 2. Vector store creation

In [4]:
# Embedding model handed to the vector store below, selected by its Hugging Face model name.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [5]:
def _stable_chunk_ids(documents: List[Document]) -> List[str]:
    """
    Build one deterministic ID per document so repeated indexing targets the same records.

    A document that already carries an explicit `id` keeps it. Otherwise the ID is the
    SHA-256 digest of the document's text and metadata, followed by an occurrence counter
    so that byte-identical chunks within the same batch still get distinct IDs.
    """
    occurrences: dict = {}
    ids: List[str] = []
    for doc in documents:
        explicit_id = getattr(doc, "id", None)
        if explicit_id:
            ids.append(str(explicit_id))
            continue

        fingerprint = json.dumps(
            {"content": doc.page_content, "metadata": doc.metadata},
            sort_keys=True,
            default=str,  # metadata values that are not JSON-native are stringified
        )
        digest = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
        seen_before = occurrences.get(digest, 0)
        occurrences[digest] = seen_before + 1
        ids.append(f"{digest}-{seen_before}")
    return ids


def create_chroma_vector_store(documents: List[Document], embedding_model: Embeddings, persist_dir: str = "./chroma_db") -> Chroma:
    """
    Create a Chroma vector store from the provided documents and embedding model.

    Documents are stored under deterministic, content-derived IDs instead of the random
    UUIDs Chroma assigns by default. Re-running this function against the same
    `persist_dir` (e.g. after "Restart & Run All") is therefore intended to overwrite the
    existing records rather than append a second copy of every chunk.
    NOTE(review): this relies on langchain_chroma writing records with upsert semantics;
    records that were inserted earlier under random IDs are not removed.

    Args:
        documents (List[Document]): List of Document objects to be stored in the vector store.
        embedding_model (Embeddings): The embedding model to be used for vectorization.
        persist_dir (str, optional): Directory to persist the collection. Defaults to "./chroma_db".

    Returns:
        Chroma: The vector store built from `documents` and persisted in `persist_dir`.
    """
    # Materialize once: the documents are traversed both for ID generation and for indexing.
    documents = list(documents)

    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        ids=_stable_chunk_ids(documents),
        persist_directory=persist_dir,
    )
    return vector_store

In [6]:
# Embed the chunks and store them in Chroma using the default persist directory ("./chroma_db").
vector_store = create_chroma_vector_store(chunks, embedding_model)

# 3. Agent state definition

In [None]:
class AgentState(TypedDict):
    """State dictionary that the agent's node functions receive and return."""

    # Human and AI messages; presumably the conversation history — confirm against the graph wiring.
    messages: List[Union[HumanMessage, AIMessage]]
    # Free-text context; the orchestrator stores "Type of the question: ..." here.
    context: Optional[str]
    # The user's question; read by the orchestrator to build its analysis prompt.
    query: str
    # Strategy label set by the orchestrator: "multi_step", "deep_analysis" or "direct".
    retrieval_strategy: Optional[str]
    # Names of the nodes the orchestrator selected to run next.
    next_nodes: Optional[List[str]]

# 4. LLM model definition

In [None]:
# Module-level LLM instance used by the node functions (e.g. `orchestrator`).
# NOTE(review): presumably needs a reachable Ollama server with the "llama3" model available — confirm.
llm = OllamaLLM(model="llama3")

# 5. Node implementations

### Orchestrator node

In [None]:
def _parse_flow_choice(analysis_result: str) -> Optional[str]:
    """Return the first standalone option number ("1", "2" or "3") in the reply, or None."""
    # \b keeps digits embedded in larger tokens ("10", "2024", "1st") from matching.
    match = re.search(r"\b([123])\b", analysis_result)
    return match.group(1) if match else None


def orchestrator(state: AgentState) -> AgentState:
    """
    Orchestrator function to determine the next steps based on the user's query.

    The LLM is asked to classify the question into one of three numbered options. The
    reply is routed on the first standalone option number it contains, rather than on a
    plain substring test, so a reply such as "3 (not 1)" or one that merely contains
    "10" is no longer mistaken for option 1. Replies without a recognizable option fall
    back to the direct-response flow.

    Args:
        state (AgentState): The current state of the agent, including the user's query and context.

    Returns:
        AgentState: The same state object, updated in place with `retrieval_strategy`,
        `next_nodes` and `context`.

    Raises:
        KeyError: If `state` has no "query" entry.
    """

    query = state["query"]

    analysis_prompt = f"""
    Analyze the following user question and determine the appropriate processing flow:

    QUESTION: {query}

    Choose from the following options:
    1. Factual information search
    2. Complex analysis
    3. Simple question
    """

    analysis_result = llm.invoke(analysis_prompt)
    choice = _parse_flow_choice(analysis_result)

    if choice == "1":
        state["retrieval_strategy"] = "multi_step"
        state["next_nodes"] = ["keyword_analyzer", "document_retriever"]
    elif choice == "2":
        state["retrieval_strategy"] = "deep_analysis"
        state["next_nodes"] = ["keyword_analyzer", "document_retriever", "summarizer"]
    else:
        # Option 3, or no recognizable option in the reply: answer directly.
        state["retrieval_strategy"] = "direct"
        state["next_nodes"] = ["response_generator"]

    # The raw (trimmed) classifier reply is kept as context for later nodes.
    state["context"] = f"Type of the question: {analysis_result.strip()}"
    return state