In [20]:
pdf_path = 'data\\DVSTUDY.pdf'

In [21]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
from langchain_core.documents import Document

In [22]:

loader = PyPDFLoader(pdf_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [23]:
async def load_all_pdfs(folder_path: str) -> list:
    pages = []
    pdf_files = Path(folder_path).rglob("*.pdf")
    
    for pdf_file in pdf_files:
        loader = PyPDFLoader(str(pdf_file))
        async for page in loader.alazy_load():
            pages.append(page)
    return pages

In [24]:
pages = await load_all_pdfs('data')

In [25]:
print(type(pages[0]))


<class 'langchain_core.documents.base.Document'>


In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
separators = [
    "\n\n",              # Paragraphs
    "\n",                # New lines
    r"\.\s",             # Sentences
    r"(?:Fig\.|Table)\s\d+",  # Split around figures/tables
    r"\s{2,}",           # Double spaces (used in some PDFs)
    " ",                 # Words
    ""                   # Fallback
]

def text_splitter(pages:list[str], c_size: int, c_overlap: int) -> list:
    chunks = []
    if pages:
        try:
            splitter = RecursiveCharacterTextSplitter(chunk_size = c_size, separators=separators,chunk_overlap=c_overlap)
            chunks = splitter.split_documents(pages)
        except Exception as e:
            raise e
    return chunks


In [27]:
chunks = text_splitter(pages, 1000, 50)

In [28]:
def clean_text(text):
    return ' '.join(text.split())

cleaned_chunks = [Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) for doc in chunks]

In [29]:
print(type(cleaned_chunks))

<class 'list'>


In [30]:
import os
import hashlib
import pickle
from pathlib import Path
from typing import List
from langchain_core.documents import Document
from langchain_chroma import Chroma
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings

# --- Embedding wrapper using thenlper/gte-base ---
class GTEEmbeddings(Embeddings):
    def __init__(self, model_name: str = "thenlper/gte-small"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text, convert_to_numpy=True).tolist()


# --- Utility: Hash documents for caching ---
def compute_documents_hash(documents: List[Document]) -> str:
    hasher = hashlib.sha256()
    for doc in documents:
        hasher.update(doc.page_content.encode("utf-8"))
    return hasher.hexdigest()


# --- Main function: Embed and cache ---
def embed_and_store_once(
    documents: List[Document],
    persist_dir: str = "embeddings",
    model_name: str = "thenlper/gte-small"
) -> Chroma:

    os.makedirs(persist_dir, exist_ok=True)
    hash_path = Path(persist_dir) / "hash.pkl"
    current_hash = compute_documents_hash(documents)

    # Check for previously stored hash
    if hash_path.exists():
        with open(hash_path, "rb") as f:
            saved_hash = pickle.load(f)
        if saved_hash == current_hash:
            print("🟢 Reusing existing ChromaDB vector store from 'embeddings/'")
            return Chroma(
                persist_directory=persist_dir,
                embedding_function=GTEEmbeddings(model_name),
                client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
            )

    # Embed and store if hash differs
    print("🔵 Generating new embeddings and storing in 'embeddings/'...")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=GTEEmbeddings(model_name),
        persist_directory=persist_dir,
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )

    # Save hash for reuse
    with open(hash_path, "wb") as f:
        pickle.dump(current_hash, f)

    return vectorstore



In [31]:
vectorstore = embed_and_store_once(cleaned_chunks)

🟢 Reusing existing ChromaDB vector store from 'embeddings/'


In [61]:
query = "Association index"
result = vectorstore.similarity_search(query, k=5)
result[0].page_content

'Association index = O – E σ where O is the observed co-occurrence of a species pair, E is the ex- pected co-occurrence of the pair and σ is the standard deviation of the expected co-occurrence of the species. The expected co-occurrence was calculated from randomizations on the species by flock presence absence matrix. Randomizations were set up in the following manner: Since we were interested in examining differences in flocks of different rich - ness values, we kept the number of flocks in each richness class in our expected data equal to the number of flocks in the observed data- set. The observed data matrix was randomized by holding the column totals (flock richness) constant and using the species occurrences as proportions. For each randomized matrix, we calculated a co-occur - rence value for every species pair. We performed 1000 iterations and'


import os
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub  # Or any LLM model of your choice
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


PERSIST_DIR = os.path.join("embeddings")  # Path where Chroma DB is persisted
EMBEDDING_MODEL = "thenlper/gte-small"    # Pre-trained embedding model
LLM_MODEL = "google/flan-t5-base"         # Or any other LLM model
TOP_K = 5                                 # Number of top results to retrieve from vector store

def load_vectorstore(persist_dir: str = PERSIST_DIR, model_name: str = EMBEDDING_MODEL):
    """
    Loads the vector store from the specified directory and embedding model.
    """
    embedding_function = GTEEmbeddings(model_name=model_name)

    logger.info(f"Loading vector store from {persist_dir}...")
    vectorstore = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_function,
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )
    return vectorstore


def retrieve_relevant_documents(query: str, vectorstore: Chroma, top_k: int = TOP_K):
    """
    Retrieves the top K most relevant documents for the provided query from the vector store.
    """
    logger.info(f"Retrieving top {top_k} most relevant documents for query: {query}")
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    return retriever.get_relevant_documents(query)


def setup_llm_model(llm_model: str = LLM_MODEL):
    """
    Sets up the LLM model for question answering.
    """
    logger.info(f"Setting up LLM model: {llm_model}...")
    llm = HuggingFaceHub(repo_id=llm_model, model_kwargs={"temperature": 0.2, "max_length": 512})
    return llm


def qa_chain_setup(llm, retriever):
    """
    Set up the RetrievalQA chain which will use the retriever and LLM model.
    """
    logger.info("Setting up the RetrievalQA chain...")
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)


def retrieve_and_answer(query: str):
    """
    Full pipeline to retrieve relevant documents and answer the question using the LLM.
    """
    vectorstore = load_vectorstore()

    documents = retrieve_relevant_documents(query, vectorstore)

    if not documents:
        logger.warning("No relevant documents found for the query.")
        return "Sorry, I couldn't find any relevant information."

    llm = setup_llm_model()
    qa_chain = qa_chain_setup(llm, vectorstore.as_retriever())

    result = qa_chain(query)

    answer = result['result']
    sources = result['source_documents']

    source_texts = [f"Source {i+1}: {doc.page_content[:500]}..." for i, doc in enumerate(sources)]
    return answer, source_texts


if __name__ == "__main__":
    query = "What does the association index mean in network analysis?"
    answer= retrieve_and_answer(query)
    
    print("\n🧠 Answer:", answer)
    


In [62]:
from langchain.schema import Document
from typing import List

class ChromaRetriever:
    def __init__(self, vectorstore: Chroma, k: int = 5):
        self.vectorstore = vectorstore
        self.k = k

    def retrieve(self, query: str) -> List[Document]:
        return self.vectorstore.similarity_search(query, k=self.k)


In [63]:
def build_prompt(query: str, documents: List[Document]) -> str:
    context = "\n\n".join(doc.page_content for doc in documents)
    prompt = f"""You are an expert research assistant.

Use the following context to answer the question as accurately as possible.

Context:
{context}

Question:
{query}

Answer:"""
    return prompt


In [70]:
import ollama

def query_llama(prompt: str, model: str = "llama3.2") -> str:
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response["message"]["content"]


In [71]:
def retrieve_and_answer(query: str) -> str:
    retriever = ChromaRetriever(vectorstore=vectorstore)
    documents = retriever.retrieve(query)
    prompt = build_prompt(query, documents)
    answer = query_llama(prompt)
    return answer


In [73]:
answer = retrieve_and_answer("what is association index")

In [74]:
answer

'The Association Index is a statistical measure used to quantify the level of association between two species in a flock. It is calculated as follows:\n\nAssociation Index = O – E σ\n\nWhere:\n- O is the observed co-occurrence of a species pair\n- E is the expected co-occurrence of the pair, which was calculated from randomizations on the species by flock presence absence matrix\n- σ is the standard deviation of the expected co-occurrence of the species.\n\nThis measure is used to filter out non-random associations in the network and retain only the significant or "important" interactions between species.'

In [None]:
# add to vector_store script to retrieve the vector store
from langchain_chroma import Chroma
from chromadb.config import Settings

from scripts.embedding import Embedder

def load_vectorstore(
    persist_dir: str = "embeddings",
    model_name: str = "thenlper/gte-small"
) -> Chroma:
    return Chroma(
        persist_directory=persist_dir,
        embedding_function=Embedder(model_name),
        client_settings=Settings(persist_directory=persist_dir, anonymized_telemetry=False)
    )

In [77]:
#retriever

class VectorstoreRetriever:
    def __init__(self, vectorstore, k: int = 5):
        self.vectorstore = vectorstore
        self.k = k

    def retrieve(self, query: str):
        return self.vectorstore.similarity_search(query, k=self.k)


In [78]:
# in script utils
def build_prompt(query: str, documents) -> str:
    context = "\n\n".join(doc.page_content for doc in documents)
    return f"""You are a helpful assistant.

Use the following context to answer the user's question.

Context:
{context}

Question:
{query}

Answer:"""

In [80]:
# in rag chain.py file

import ollama

class RAGChain:
    def __init__(self, vectorstore, model_name="llama3", k=5):
        self.retriever = VectorstoreRetriever(vectorstore, k)
        self.model_name = model_name

    def run(self, query: str) -> str:
        documents = self.retriever.retrieve(query)
        prompt = build_prompt(query, documents)
        response = ollama.chat(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}]
        )
        return response["message"]["content"]
