In [1]:
pdf_path = 'data\\DVSTUDY_PAPER.pdf'

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
from langchain_core.documents import Document

In [3]:

loader = PyPDFLoader(pdf_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [4]:
async def load_all_pdfs(folder_path: str) -> list:
    pages = []
    pdf_files = Path(folder_path).rglob("*.pdf")
    
    for pdf_file in pdf_files:
        loader = PyPDFLoader(str(pdf_file))
        async for page in loader.alazy_load():
            pages.append(page)
    return pages

In [5]:
pages = await load_all_pdfs('data')

In [6]:
print(type(pages[0]))


<class 'langchain_core.documents.base.Document'>


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
separators = [
    "\n\n",              # Paragraphs
    "\n",                # New lines
    r"\.\s",             # Sentences
    r"(?:Fig\.|Table)\s\d+",  # Split around figures/tables
    r"\s{2,}",           # Double spaces (used in some PDFs)
    " ",                 # Words
    ""                   # Fallback
]

def text_splitter(pages:list[str], c_size: int, c_overlap: int) -> list:
    chunks = []
    if pages:
        try:
            splitter = RecursiveCharacterTextSplitter(chunk_size = c_size, separators=separators,chunk_overlap=c_overlap)
            chunks = splitter.split_documents(pages)
        except Exception as e:
            raise e
    return chunks


In [8]:
chunks = text_splitter(pages, 1000, 50)

In [9]:
def clean_text(text):
    return ' '.join(text.split())

cleaned_chunks = [Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) for doc in chunks]

In [10]:
print(type(cleaned_chunks))

<class 'list'>


In [11]:
import os
import hashlib
import pickle
from pathlib import Path
from typing import List
from langchain_core.documents import Document
from langchain_chroma import Chroma
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings

# --- Embedding wrapper using thenlper/gte-base ---
class GTEEmbeddings(Embeddings):
    def __init__(self, model_name: str = "thenlper/gte-small"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text, convert_to_numpy=True).tolist()


# --- Utility: Hash documents for caching ---
def compute_documents_hash(documents: List[Document]) -> str:
    hasher = hashlib.sha256()
    for doc in documents:
        hasher.update(doc.page_content.encode("utf-8"))
    return hasher.hexdigest()


# --- Main function: Embed and cache ---
def embed_and_store_once(
    documents: List[Document],
    persist_dir: str = "embeddings",
    model_name: str = "thenlper/gte-small"
) -> Chroma:

    os.makedirs(persist_dir, exist_ok=True)
    hash_path = Path(persist_dir) / "hash.pkl"
    current_hash = compute_documents_hash(documents)

    # Check for previously stored hash
    if hash_path.exists():
        with open(hash_path, "rb") as f:
            saved_hash = pickle.load(f)
        if saved_hash == current_hash:
            print("🟢 Reusing existing ChromaDB vector store from 'embeddings/'")
            return Chroma(
                persist_directory=persist_dir,
                embedding_function=GTEEmbeddings(model_name),
                client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
            )

    # Embed and store if hash differs
    print("🔵 Generating new embeddings and storing in 'embeddings/'...")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=GTEEmbeddings(model_name),
        persist_directory=persist_dir,
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )

    # Save hash for reuse
    with open(hash_path, "wb") as f:
        pickle.dump(current_hash, f)

    return vectorstore



In [12]:
vectorstore = embed_and_store_once(cleaned_chunks)

🟢 Reusing existing ChromaDB vector store from 'embeddings/'


In [13]:
query = "Association index"
result = vectorstore.similarity_search(query, k=5)
result[0].page_content

'Association index = O – E σ where O is the observed co-occurrence of a species pair, E is the ex- pected co-occurrence of the pair and σ is the standard deviation of the expected co-occurrence of the species. The expected co-occurrence was calculated from randomizations on the species by flock presence absence matrix. Randomizations were set up in the following manner: Since we were interested in examining differences in flocks of different rich - ness values, we kept the number of flocks in each richness class in our expected data equal to the number of flocks in the observed data- set. The observed data matrix was randomized by holding the column totals (flock richness) constant and using the species occurrences as proportions. For each randomized matrix, we calculated a co-occur - rence value for every species pair. We performed 1000 iterations and'

# scripts/retriever.py

import os
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub  # Or any LLM model of your choice
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# === Configuration ===
PERSIST_DIR = os.path.join("embeddings")  # Path where Chroma DB is persisted
EMBEDDING_MODEL = "thenlper/gte-small"    # Pre-trained embedding model
LLM_MODEL = "google/flan-t5-base"         # Or any other LLM model
TOP_K = 5                                 # Number of top results to retrieve from vector store

def load_vectorstore(persist_dir: str = PERSIST_DIR, model_name: str = EMBEDDING_MODEL):
    """
    Loads the vector store from the specified directory and embedding model.
    """
    embedding_function = GTEEmbeddings(model_name=model_name)

    logger.info(f"Loading vector store from {persist_dir}...")
    vectorstore = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_function,
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )
    return vectorstore


def retrieve_relevant_documents(query: str, vectorstore: Chroma, top_k: int = TOP_K):
    """
    Retrieves the top K most relevant documents for the provided query from the vector store.
    """
    logger.info(f"Retrieving top {top_k} most relevant documents for query: {query}")
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    return retriever.get_relevant_documents(query)


def setup_llm_model(llm_model: str = LLM_MODEL):
    """
    Sets up the LLM model for question answering.
    """
    logger.info(f"Setting up LLM model: {llm_model}...")
    llm = HuggingFaceHub(repo_id=llm_model, model_kwargs={"temperature": 0.2, "max_length": 512})
    return llm


def qa_chain_setup(llm, retriever):
    """
    Set up the RetrievalQA chain which will use the retriever and LLM model.
    """
    logger.info("Setting up the RetrievalQA chain...")
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)


def retrieve_and_answer(query: str):
    """
    Full pipeline to retrieve relevant documents and answer the question using the LLM.
    """
    # Load the vector store
    vectorstore = load_vectorstore()

    # Retrieve relevant documents
    documents = retrieve_relevant_documents(query, vectorstore)

    if not documents:
        logger.warning("No relevant documents found for the query.")
        return "Sorry, I couldn't find any relevant information."

    # Set up LLM model and QA chain
    llm = setup_llm_model()
    qa_chain = qa_chain_setup(llm, vectorstore.as_retriever())

    # Use the chain to get the answer
    result = qa_chain(query)

    # Return the result along with source documents
    answer = result['result']
    sources = result['source_documents']
    
    # Format source document output (optional)
    source_texts = [f"Source {i+1}: {doc.page_content[:500]}..." for i, doc in enumerate(sources)]
    return answer, source_texts


if __name__ == "__main__":
    query = "What does the association index mean in network analysis?"
    answer= retrieve_and_answer(query)
    
    # Output answer and source context
    print("\n🧠 Answer:", answer)
    


In [14]:
from langchain_chroma import Chroma
from scripts.embedding import Embedder
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA

def load_vectorstore(persist_dir: str = "embeddings", model_name:str = "thenlper/gte-small"):
    embedding_fn = Embedder(model_name=model_name)
    
    vector_store = Chroma(persist_directory=persist_dir, embedding_function=embedding_fn,
                            client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False))
    
    return vector_store


def retrieve_relevant_documents(query:str, vector_store:Chroma, top_k:int = 5):
    retriever = vector_store.as_retriever(search_kwargs = {"k": top_k})
    
    return retriever.get_relevant_documents(query=query)


def setup_llm_model(llm_model:str = "gemma3"):
    llm = OllamaLLM(model=llm_model, model_kwargs={"temperature": 0.2})

    return llm


def retriever_chain_setup(llm, retriever):
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type="stuff"  # Or another strategy like "map_reduce", if needed
    )


def retrieve_and_answer(query: str):
    
    vector_store = load_vectorstore()
    
    relevant_docs = retrieve_relevant_documents(query=query, vector_store=vector_store)
    
    llm = setup_llm_model("deepseek-r1")
    
    qa_chain = retriever_chain_setup(llm, vector_store.as_retriever())
    
    result = qa_chain.invoke(query)
    
    answer = result['result']
    sources = result['source_documents']
    
    return answer, sources

In [15]:
answer, sources = retrieve_and_answer("what is modularity and how to calculate it")

  return retriever.get_relevant_documents(query=query)


In [16]:
sources

[Document(id='6e0a2fe0-178c-4549-a1e2-29355960b179', metadata={'appligent': 'AppendPDF Pro 6.3 Linux 64 bit Aug 30 2019 Library 15.0.4', 'author': 'Priti Bangal, Hari Sridhar, Daizaburo Shizuka, Laura N. Vander Meiden, and Kartik Shankar', 'creationdate': '2021-12-13T12:20:46-08:00', 'creator': 'Appligent AppendPDF Pro 6.3', 'moddate': '2021-12-13T12:20:46-08:00', 'page': 9, 'page_label': '10', 'producer': 'Prince 12.5 (www.princexml.com)', 'source': 'data\\DVSTUDY_PAPER.pdf', 'title': 'Flock-species richness influences node importance and modularity in mixed-species flock networks', 'total_pages': 24}, page_content='We use unweighted networks for filtered associations. Hence, we use degree centrality as a measure of structural importance in this analysis. Therefore, there are multiple species with the same central- ity values in this category. Calculating modularity We ran a ‘community detection algorithm’ based on the Louvian method on the networks built using the meth - ods describe

In [17]:
answer

"<think>\nOkay, I need to understand what modularity is and how it's calculated based on the provided context. Let me start by reading through the given information carefully.\n\nThe context talks about network analysis in ecology, specifically looking at mixed-species flocks. They're using a method called the Louvian algorithm for community detection, which aims to optimize modularity. Modularity measures the separation between clusters or communities in a network, with higher values indicating better-defined groups where there are more connections within groups than between them.\n\nThe context also mentions that modularity is calculated based on edges within versus across clusters. They start by assigning each vertex (node) to a unique cluster and then iteratively reassign nodes to maximize the modularity score until it can't be increased further.\n\nNow, I need to break this down step by step to explain how modularity works and how it's computed.\n\nFirst, what is a network in this

In [18]:
def local_vector_search(query:str) -> str:
    docs = retrieve_relevant_documents(query, vectorstore)
    
    return"\n\n".join([doc.page_content for doc in docs])

In [21]:
from langchain.tools import Tool
vector_store_tool = Tool(
    name="LocalDocumentResearcher",
    func=local_vector_search,
    description="Access local research PDF Files to find relevant information to the query"
)

In [None]:
from langchain.agents import initialize_agent
from langchain.agents.agent_types import AgentType

llm = setup_llm_model()

agent = initialize_agent(
    tools = [vector_store_tool],
    llm = llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose = True
)

  agent = initialize_agent(


In [38]:
from langgraph.graph import StateGraph
from langgraph.prebuilt import create_react_agent
from langchain_ollama import ChatOllama
from typing import TypedDict

# Define the state schema
class AgentState(TypedDict):
    input: str
    output: str

# Initialize the LLM
llm2 = ChatOllama(model='llama3.2', temperature=0)

# Define the tools used by the agent (ensure your vector_store_tool is correctly set up)
tools = [vector_store_tool]

# Create the agent node
agent_node = create_react_agent(llm2, tools)

# Initialize the state graph with the defined state schema
graph = StateGraph(state_schema=AgentState)

# Add the agent node to the graph
graph.add_node("agent", agent_node)

# Set the entry and finish points
graph.set_entry_point("agent")
graph.set_finish_point("agent")

# Compile the graph into an executable agent
executable_agent = graph.compile()

# Invoke the agent with an input
response = executable_agent.invoke({"input": "Explain the association index in network analysis."})

# Print the response to check output
print(response)


{'input': 'Explain the association index in network analysis.'}


In [39]:
answer = llm2.invoke("what is an association index")

In [40]:
print(answer)

content="An association index, also known as a co-occurrence matrix or a correlation matrix, is a mathematical table that shows the frequency of co-occurrences between two variables. It's a way to visualize and analyze the relationships between different variables in a dataset.\n\nIn general, an association index has the following characteristics:\n\n1. **Rows**: Represent one variable (e.g., a feature or attribute).\n2. **Columns**: Represent another variable (e.g., another feature or attribute).\n3. **Cells**: Contain the frequency of co-occurrences between the corresponding row and column variables.\n\nFor example, if we have two variables, X and Y, an association index might look like this:\n\n|  | Y=0 | Y=1 |\n| --- | --- | --- |\n| X=0 | 10 | 2 |\n| X=1 | 5 | 8 |\n\nIn this example, the cell at row X=0 and column Y=0 contains a value of 10, indicating that there are 10 observations where both X and Y have values of 0. Similarly, the cell at row X=1 and column Y=1 contains a value

In [41]:
tooled_llm = ChatOllama(model='llama3.2', temperature=0.2).bind_tools([vector_store_tool])

In [44]:
response = tooled_llm.invoke("find information about association index")

In [46]:
print(response.tool_calls)

[{'name': 'LocalDocumentResearcher', 'args': {'__arg1': 'association index'}, 'id': '0e4c6d5a-f6f4-4e40-bea2-56fbedf00b01', 'type': 'tool_call'}]


In [47]:
from typing import TypedDict, Annotated

tools = [vector_store_tool]

agent_node = create_react_agent(tooled_llm, tools)

# --- Define the state schema for the graph ---
class AgentState(TypedDict):
    input: str
    output: Annotated[str, None]  # Output will be filled later

# --- Build the LangGraph ---
graph = StateGraph(AgentState)

graph.add_node("agent", agent_node)
graph.set_entry_point("agent")
graph.set_finish_point("agent")

# --- Compile the graph ---
executable_agent = graph.compile()

# --- Run the agent ---
response = executable_agent.invoke({"input": "Explain the association index in network analysis."})


In [48]:
response

{'input': 'Explain the association index in network analysis.'}

In [54]:
llm = ChatOllama(model="llama3", temperature=0)

tools = [vector_store_tool]

# Base agent
base_agent = create_react_agent(llm, tools)

# Define your state
class AgentState(TypedDict):
    input: str
    output: Annotated[str, None]

# Wrap agent so it returns 'output'
async def agent_node(state: AgentState) -> AgentState:
    result = await base_agent.ainvoke(state["input"])
    
    # Debugging step
    print(f"[agent_node] Raw result from base_agent: {result}")

    # Handle result
    if hasattr(result, "content"):
        output_content = result.content
    else:
        output_content = result  # Assume plain text if no .content

    return {
        "input": state["input"],
        "output": output_content
    }
# Build LangGraph
graph = StateGraph(AgentState)
graph.add_node("agent", agent_node)
graph.set_entry_point("agent")
graph.set_finish_point("agent")
executable_agent = graph.compile()

# Run asynchronously
response = await executable_agent.ainvoke({"input": "Explain the association index in network analysis."})

# Show output
print("\nFinal response:")
print(response["output"])

InvalidUpdateError: Expected dict, got Explain the association index in network analysis.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_GRAPH_NODE_RETURN_VALUE