In [81]:
from langchain_ollama.embeddings import OllamaEmbeddings

# Embedding model served by Ollama on the local machine (llama3.2).
# This object is handed to the Chroma vector store as its embedding function.
# NOTE(review): temperature=0 presumably has no effect on embeddings — confirm.
embeddings = OllamaEmbeddings(
    model="llama3.2",
    temperature=0
)


In [89]:
from langchain_chroma import Chroma

# Chroma vector store configured with the persistence directory ../../data/chroma4/.
# Everything is stored in the "doc_search_demo" collection and embedded with
# the `embeddings` object defined in the earlier cell.
vector_store_from_client = Chroma(
    persist_directory="../../data/chroma4/",
    collection_name="doc_search_demo",
    embedding_function=embeddings,
)


In [83]:
import os

def fetch_files(directory: str) -> list[str]:
    """Return the absolute path of every file found under ``directory``.

    The directory tree is walked recursively with ``os.walk``. Only files are
    reported; the sub-directories themselves are not part of the result.
    """
    return [
        os.path.abspath(os.path.join(root, name))
        for root, _dirs, names in os.walk(directory)
        for name in names
    ]

In [84]:
from langchain_ollama.llms import OllamaLLM
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate

from functools import lru_cache


@lru_cache(maxsize=None)
def _get_llm(model: str, temperature: float) -> "OllamaLLM":
    """Build the Ollama LLM client once per (model, temperature) pair."""
    return OllamaLLM(model=model, temperature=temperature)


def call_llm_model(prompt: str, model: str = "llama3.2", temperature: float = 0) -> str:
    """Send ``prompt`` to an Ollama-served LLM and return its completion.

    Args:
        prompt: Full prompt text passed to the model.
        model: Name of the Ollama model to use. Defaults to ``"llama3.2"``.
        temperature: Sampling temperature. Defaults to ``0``.

    Returns:
        Whatever ``OllamaLLM.invoke`` returns for the prompt (the generated text).
    """
    # This function is called once per chunk, so the client is cached and
    # reused instead of being re-created on every call.
    return _get_llm(model, temperature).invoke(prompt)


In [85]:

def create_contextual_chunks(chunks : list[str], document: str) -> list[str]:
    """Append an LLM-written situating context to each chunk.

    For every chunk, the prompt template is filled with the chunk and the
    whole ``document`` and sent to ``call_llm_model``. The returned text is
    appended to the chunk on a new line.

    Args:
        chunks: Text chunks taken from ``document``.
        document: Full text the chunks were taken from.

    Returns:
        One string per input chunk, in the same order, of the form
        ``"<chunk>\\n<generated context>"``.
    """
    CONTEXTUAL_EMBEDDING_PROMPT = """
        Here is the chunk we want to situate within the whole document:
        <chunk>
        {chunk}
        </chunk>
 
        Here is the content of the whole document:
        <document>
        {document}
        </document>
 
        Please provide a short, succinct context to situate this chunk within the overall document to improve search retrieval. Respond only with the context.
    """

    def _with_context(chunk: str) -> str:
        # One LLM round-trip per chunk.
        filled_prompt = CONTEXTUAL_EMBEDDING_PROMPT.format(chunk=chunk, document=document)
        situating_context = call_llm_model(filled_prompt)
        return f"{chunk}\n{situating_context}"

    return [_with_context(chunk) for chunk in chunks]
        


In [86]:
from uuid import uuid4


def persist_chunks_as_embeddings(documents):
    """Store ``documents`` in the module-level vector store under fresh UUIDs.

    Args:
        documents: Iterable of documents to add to ``vector_store_from_client``.
            ``None`` or an empty iterable is accepted and results in a no-op.

    Returns:
        None.
    """
    # Materialise once so the length is known and one-shot iterables work.
    documents = list(documents or [])

    # A page that produced no chunks would otherwise send an empty batch to
    # the vector store; skip the call entirely in that case.
    if not documents:
        return

    uuids = [str(uuid4()) for _ in documents]

    vector_store_from_client.add_documents(documents=documents, ids=uuids)

In [87]:
from langchain_core.documents import Document

def construct_documents(chunks : list[str], metadata):
    """Wrap each chunk in a ``Document`` carrying its own copy of ``metadata``.

    Args:
        chunks: Chunk texts, in document order.
        metadata: Mapping of metadata shared by all chunks (``None`` is treated
            as empty). It is never modified.

    Returns:
        A list with one ``Document`` per chunk. Each document's metadata is a
        separate dict made of ``metadata`` plus a ``"chunk"`` key holding the
        chunk's 0-based position.
    """
    base_metadata = dict(metadata or {})

    # Build a fresh dict per chunk. Writing the index into one shared dict
    # would mutate the caller's metadata and let later indices overwrite
    # earlier ones.
    return [
        Document(page_content=chunk, metadata={**base_metadata, "chunk": index})
        for index, chunk in enumerate(chunks)
    ]

In [99]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Every file found under the PDF data directory (not yet used below).
files = fetch_files("../../data/pdf")

# Only Petra.pdf is ingested for now.
loader = PyPDFLoader("../../data/pdf/Petra.pdf")
pages = loader.load()

# The splitter configuration is the same for every page, so build it once
# instead of once per loop iteration.
token_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=260, chunk_overlap=20)

# NOTE: chunks are stored under freshly generated UUIDs, so re-running this
# cell adds the same content to the collection again.
for page in pages:
    chunks = token_splitter.split_text(page.page_content)

    # Single quotes inside the f-string keep it valid on Python < 3.12 as well.
    print(f"Chunk size for page {page.metadata.get('page')} on document {page.metadata.get('title')} is {len(chunks)}")

    if not chunks:
        # Nothing was extracted from this page: skip the LLM calls and the store write.
        continue

    # The page text (not the whole PDF) is the "document" each chunk is situated in.
    contextual_chunks = create_contextual_chunks(chunks, page.page_content)
    persist_chunks_as_embeddings(construct_documents(contextual_chunks, page.metadata))
    




Chunk size for page 0 on document Petra - Wikipedia is 4
Chunk size for page 1 on document Petra - Wikipedia is 4
Chunk size for page 2 on document Petra - Wikipedia is 4
Chunk size for page 3 on document Petra - Wikipedia is 4
Chunk size for page 4 on document Petra - Wikipedia is 4
Chunk size for page 5 on document Petra - Wikipedia is 4
Chunk size for page 6 on document Petra - Wikipedia is 3
Chunk size for page 7 on document Petra - Wikipedia is 5
Chunk size for page 8 on document Petra - Wikipedia is 4
Chunk size for page 9 on document Petra - Wikipedia is 4
Chunk size for page 10 on document Petra - Wikipedia is 5
Chunk size for page 11 on document Petra - Wikipedia is 2
Chunk size for page 12 on document Petra - Wikipedia is 9
Chunk size for page 13 on document Petra - Wikipedia is 9
Chunk size for page 14 on document Petra - Wikipedia is 7
Chunk size for page 15 on document Petra - Wikipedia is 9
Chunk size for page 16 on document Petra - Wikipedia is 10
Chunk size for page 17 

In [105]:
# 
from langchain_ollama.llms import OllamaLLM
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# LLM used to answer questions: llama3.2 via Ollama, temperature 0.
llm = OllamaLLM(
    model="llama3.2",
    temperature=0
)

# Retriever over the Chroma collection populated by the ingestion cell
# (default retriever settings are used).
retriever = vector_store_from_client.as_retriever()

# System instructions; {context} is the placeholder for the retrieved documents.
system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use four sentence maximum and keep the answer concise. "
    "Context: {context}"
)

# Chat prompt: system instructions followed by the user's question ({input}).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Chain that combines the LLM and the prompt over a set of documents.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full pipeline: retrieve documents for the input, then run the QA chain on them.
# The result shown below includes the 'input' and the retrieved 'context' documents.
chain = create_retrieval_chain(retriever, question_answer_chain)
chain.invoke({"input": "Who lived in the north of petra?"})

{'input': 'Who lived in the north of petra?',
 'context': [Document(id='b45a0a2f-4553-4f05-8508-2441e65fb799', metadata={'chunk': 3, 'creationdate': '2025-03-27T12:32:03+00:00', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/128.0.0.0 Safari/537.36', 'moddate': '2025-03-27T12:32:03+00:00', 'page': 4, 'page_label': '5', 'producer': 'Skia/PDF m128', 'source': '../../data/pdf/Petra.pdf', 'title': 'Petra - Wikipedia', 'total_pages': 21}, page_content='be the Isis-Tyche, Isis and Tyche being the Egyptian and Greek goddesses, respectively, of good fortune.[49]\n21st century\nLayout\nHellenistic architecture\nThis chunk discusses the Hellenistic architecture and features of Petra, including its tombs, facades, and notable structures such as the Treasury, which reflect the cultural influences of Greek culture on the Nabataeans.'),
  Document(id='4e06b6c4-ce69-4325-b6ac-f89cdc9502f1', metadata={'chunk': 3, 'creationdate': '2025-03-27T12:32:03+0