In [1]:
pdf_path = 'data\\DVSTUDY_PAPER.pdf'

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
from langchain_core.documents import Document

In [3]:

loader = PyPDFLoader(pdf_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [4]:
async def load_all_pdfs(folder_path: str) -> list:
    pages = []
    pdf_files = Path(folder_path).rglob("*.pdf")
    
    for pdf_file in pdf_files:
        loader = PyPDFLoader(str(pdf_file))
        async for page in loader.alazy_load():
            pages.append(page)
    return pages

In [5]:
pages = await load_all_pdfs('data')

In [6]:
print(type(pages[0]))


<class 'langchain_core.documents.base.Document'>


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
separators = [
    "\n\n",              # Paragraphs
    "\n",                # New lines
    r"\.\s",             # Sentences
    r"(?:Fig\.|Table)\s\d+",  # Split around figures/tables
    r"\s{2,}",           # Double spaces (used in some PDFs)
    " ",                 # Words
    ""                   # Fallback
]

def text_splitter(pages:list[str], c_size: int, c_overlap: int) -> list:
    chunks = []
    if pages:
        try:
            splitter = RecursiveCharacterTextSplitter(chunk_size = c_size, separators=separators,chunk_overlap=c_overlap)
            chunks = splitter.split_documents(pages)
        except Exception as e:
            raise e
    return chunks


In [8]:
chunks = text_splitter(pages, 1000, 50)

In [9]:
def clean_text(text):
    return ' '.join(text.split())

cleaned_chunks = [Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) for doc in chunks]

In [10]:
print(type(cleaned_chunks))

<class 'list'>


In [11]:
import os
import hashlib
import pickle
from pathlib import Path
from typing import List
from langchain_core.documents import Document
from langchain_chroma import Chroma
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings

# --- Embedding wrapper using thenlper/gte-base ---
class GTEEmbeddings(Embeddings):
    def __init__(self, model_name: str = "thenlper/gte-small"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text, convert_to_numpy=True).tolist()


# --- Utility: Hash documents for caching ---
def compute_documents_hash(documents: List[Document]) -> str:
    hasher = hashlib.sha256()
    for doc in documents:
        hasher.update(doc.page_content.encode("utf-8"))
    return hasher.hexdigest()


# --- Main function: Embed and cache ---
def embed_and_store_once(
    documents: List[Document],
    persist_dir: str = "embeddings",
    model_name: str = "thenlper/gte-small"
) -> Chroma:

    os.makedirs(persist_dir, exist_ok=True)
    hash_path = Path(persist_dir) / "hash.pkl"
    current_hash = compute_documents_hash(documents)

    # Check for previously stored hash
    if hash_path.exists():
        with open(hash_path, "rb") as f:
            saved_hash = pickle.load(f)
        if saved_hash == current_hash:
            print("🟢 Reusing existing ChromaDB vector store from 'embeddings/'")
            return Chroma(
                persist_directory=persist_dir,
                embedding_function=GTEEmbeddings(model_name),
                client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
            )

    # Embed and store if hash differs
    print("🔵 Generating new embeddings and storing in 'embeddings/'...")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=GTEEmbeddings(model_name),
        persist_directory=persist_dir,
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )

    # Save hash for reuse
    with open(hash_path, "wb") as f:
        pickle.dump(current_hash, f)

    return vectorstore



In [12]:
vectorstore = embed_and_store_once(cleaned_chunks)

🟢 Reusing existing ChromaDB vector store from 'embeddings/'


In [13]:
query = "Association index"
result = vectorstore.similarity_search(query, k=5)
result[0].page_content

'Association index = O – E σ where O is the observed co-occurrence of a species pair, E is the ex- pected co-occurrence of the pair and σ is the standard deviation of the expected co-occurrence of the species. The expected co-occurrence was calculated from randomizations on the species by flock presence absence matrix. Randomizations were set up in the following manner: Since we were interested in examining differences in flocks of different rich - ness values, we kept the number of flocks in each richness class in our expected data equal to the number of flocks in the observed data- set. The observed data matrix was randomized by holding the column totals (flock richness) constant and using the species occurrences as proportions. For each randomized matrix, we calculated a co-occur - rence value for every species pair. We performed 1000 iterations and'

# scripts/retriever.py

import os
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub  # Or any LLM model of your choice
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# === Configuration ===
PERSIST_DIR = os.path.join("embeddings")  # Path where Chroma DB is persisted
EMBEDDING_MODEL = "thenlper/gte-small"    # Pre-trained embedding model
LLM_MODEL = "google/flan-t5-base"         # Or any other LLM model
TOP_K = 5                                 # Number of top results to retrieve from vector store

def load_vectorstore(persist_dir: str = PERSIST_DIR, model_name: str = EMBEDDING_MODEL):
    """
    Loads the vector store from the specified directory and embedding model.
    """
    embedding_function = GTEEmbeddings(model_name=model_name)

    logger.info(f"Loading vector store from {persist_dir}...")
    vectorstore = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_function,
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )
    return vectorstore


def retrieve_relevant_documents(query: str, vectorstore: Chroma, top_k: int = TOP_K):
    """
    Retrieves the top K most relevant documents for the provided query from the vector store.
    """
    logger.info(f"Retrieving top {top_k} most relevant documents for query: {query}")
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    return retriever.get_relevant_documents(query)


def setup_llm_model(llm_model: str = LLM_MODEL):
    """
    Sets up the LLM model for question answering.
    """
    logger.info(f"Setting up LLM model: {llm_model}...")
    llm = HuggingFaceHub(repo_id=llm_model, model_kwargs={"temperature": 0.2, "max_length": 512})
    return llm


def qa_chain_setup(llm, retriever):
    """
    Set up the RetrievalQA chain which will use the retriever and LLM model.
    """
    logger.info("Setting up the RetrievalQA chain...")
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)


def retrieve_and_answer(query: str):
    """
    Full pipeline to retrieve relevant documents and answer the question using the LLM.
    """
    # Load the vector store
    vectorstore = load_vectorstore()

    # Retrieve relevant documents
    documents = retrieve_relevant_documents(query, vectorstore)

    if not documents:
        logger.warning("No relevant documents found for the query.")
        return "Sorry, I couldn't find any relevant information."

    # Set up LLM model and QA chain
    llm = setup_llm_model()
    qa_chain = qa_chain_setup(llm, vectorstore.as_retriever())

    # Use the chain to get the answer
    result = qa_chain(query)

    # Return the result along with source documents
    answer = result['result']
    sources = result['source_documents']
    
    # Format source document output (optional)
    source_texts = [f"Source {i+1}: {doc.page_content[:500]}..." for i, doc in enumerate(sources)]
    return answer, source_texts


if __name__ == "__main__":
    query = "What does the association index mean in network analysis?"
    answer= retrieve_and_answer(query)
    
    # Output answer and source context
    print("\n🧠 Answer:", answer)
    


In [37]:
from langchain_chroma import Chroma
from scripts.embedding import Embedder
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA

def load_vectorstore(persist_dir: str = "embeddings", model_name:str = "thenlper/gte-small"):
    embedding_fn = Embedder(model_name=model_name)
    
    vector_store = Chroma(persist_directory=persist_dir, embedding_function=embedding_fn,
                            client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False))
    
    return vector_store


def retrieve_relevant_documents(query:str, vector_store:Chroma, top_k:int = 5):
    retriever = vector_store.as_retriever(search_kwargs = {"k": top_k})
    
    return retriever.get_relevant_documents(query=query)


def setup_llm_model(llm_model:str = "gemma3"):
    llm = OllamaLLM(model=llm_model, model_kwargs={"temperature": 0.2})

    return llm


def retriever_chain_setup(llm, retriever):
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type="stuff"  # Or another strategy like "map_reduce", if needed
    )


def retrieve_and_answer(query: str):
    
    vector_store = load_vectorstore()
    
    relevant_docs = retrieve_relevant_documents(query=query, vector_store=vector_store)
    
    llm = setup_llm_model("deepseek-r1")
    
    qa_chain = retriever_chain_setup(llm, vector_store.as_retriever())
    
    result = qa_chain.invoke(query)
    
    answer = result['result']
    sources = result['source_documents']
    
    return answer, sources

In [39]:
answer, sources = retrieve_and_answer("what is modularity and how to calculate it")

In [40]:
sources

[Document(id='6e0a2fe0-178c-4549-a1e2-29355960b179', metadata={'appligent': 'AppendPDF Pro 6.3 Linux 64 bit Aug 30 2019 Library 15.0.4', 'author': 'Priti Bangal, Hari Sridhar, Daizaburo Shizuka, Laura N. Vander Meiden, and Kartik Shankar', 'creationdate': '2021-12-13T12:20:46-08:00', 'creator': 'Appligent AppendPDF Pro 6.3', 'moddate': '2021-12-13T12:20:46-08:00', 'page': 9, 'page_label': '10', 'producer': 'Prince 12.5 (www.princexml.com)', 'source': 'data\\DVSTUDY_PAPER.pdf', 'title': 'Flock-species richness influences node importance and modularity in mixed-species flock networks', 'total_pages': 24}, page_content='We use unweighted networks for filtered associations. Hence, we use degree centrality as a measure of structural importance in this analysis. Therefore, there are multiple species with the same central- ity values in this category. Calculating modularity We ran a ‘community detection algorithm’ based on the Louvian method on the networks built using the meth - ods describe

In [41]:
answer

'<think>\nOkay, so I\'m trying to figure out what modularity means and how to calculate it. From the context provided, it seems like modularity has something to do with network analysis, particularly in ecology where they study species associations.\n\nFirst, I remember that networks are made up of nodes (which could be species here) connected by edges (which represent interactions between species). The term "modularity" comes up when discussing community detection algorithms. The Louvian method is mentioned, which optimizes modularity to find communities or clusters in the network.\n\nSo, modularity measures how well a network can be divided into these communities. It\'s like checking if within each group (community), there are more connections between species from that group than would be expected by chance. The higher the modularity score, the better separated the groups are from each other.\n\nNow, to calculate modularity: I think it involves comparing the actual number of edges in