In [None]:
### Library to install
# ! pip install --quiet langchain_experimental langchain_openai

#### Import the libraries and environment variable

In [None]:
# --- Environment Setup ---
import os
from dotenv import load_dotenv
load_dotenv()

# --- LangChain / AstraDB / Utils ---
import time
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_astradb import AstraDBVectorStore
# from langchain.retrievers.mmr import MMRRetriever

from langchain.llms import OpenAI
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_core.documents import Document

#### 📕 2. Load PDF & Extract Text

In [4]:
from langchain_community.document_loaders import PyPDFLoader
loader= PyPDFLoader(r"C:\Suryavi\GenAI\KRISH_NAIK\AGENTIC_AI_2_0\Assignments\RAG\data\input\The Bhagavad Gita.pdf")


# --- Read Pages Asynchronously ---
# document loaders implement lazy_load and its async variant,
#  alazy_load, which return iterators of Document objects. We will use these below.
pages = []
async for page in loader.alazy_load():
    pages.append(page)

print("Total pages in the pdf - ", len(pages))
print("Sample metadata:", pages[0].metadata)
print("Sample content:\n", pages[0].page_content)

Total pages in the pdf -  447
Sample metadata: {'producer': 'doPDF Ver 7.2 Build 376 (Windows 7 Business Edition - Version: 6.1.7600 (x86))', 'creator': 'Adobe Acrobat 8.0 Combine Files', 'creationdate': '2023-09-28T13:07:43+05:30', 'moddate': '2023-09-28T13:07:43+05:30', 'source': 'C:\\Suryavi\\GenAI\\KRISH_NAIK\\AGENTIC_AI_2_0\\Assignments\\RAG\\data\\input\\The Bhagavad Gita.pdf', 'total_pages': 447, 'page': 0, 'page_label': '1'}
Sample content:
 i 
 
 
 
 
The Bhagavad Gita 
Based on HH Sri Raghavendra Teertha’s Gita Vivruti 
& 
Lectures by HH Sri Vidyasagara Madhava Teertha 
 
 
 
Compiled By 
Dr. Giridhar Boray 
 
 
 
 
 
 
 
TIRUMALA TIRUPATI DEVASTHANAMS  
TIRUPATI 
2023


##### Sematic Chunking: it splits into sentences, then groups into groups of 3 sentences, and then merges one that are similar in the embedding space. Semantic Chunking considers the relationships within the text. It divides the text into meaningful, semantically complete chunks. This approach ensures the information’s integrity during retrieval, leading to a more accurate and contextually appropriate outcome. It is slower compared to the previous chunking strategy there’s three different strategies you could use on Semantic Chunking):
###### 1- `percentile` (default) — In this method, all differences between sentences are calculated, and then any difference greater than the X percentile is split.
###### 2- `standard_deviation` — In this method, any difference greater than X standard deviations is split.
###### 3- `interquartile` — In this method, the interquartile distance is used to split chunks.

In [9]:
# --- Extract text & metadata ---
texts = [p.page_content for p in pages]
metadatas = [p.metadata for p in pages]

# --- Load Embeddings ---
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# --- Semantic Chunking ---
splitter = SemanticChunker(embeddings, breakpoint_threshold_type="interquartile")
docs = splitter.create_documents(texts, metadatas)
print("Total chunks:", len(docs))


Total chunks: 740


##### 🔗 4. Connect to Astra DB

In [None]:
ASTR_DB_ID = os.getenv("ASTR_DB_ID")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_KEYSPACE = os.getenv("ASTRA_DB_KEYSPACE")


vector_store = AstraDBVectorStore(
    collection_name="astra_vector_langchain",
    embedding=embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)
# Upload chunks
vector_store.add_documents(docs)


['735ea9284cc043a99e0eecefaf28c054',
 'e525cd7c40ed4e599d5930406de2b040',
 '4e7f878b11f143b18e3d6f55cf281201',
 '508ccf275e464b3c821926386c38fb7b',
 '1d43dddcd99c458a813c1568a5ca33bd',
 'b867e9388b844c028e0b2a90f3056eea',
 '0c2a22eeda4c4bdea24b309808c3cdde',
 '446924141d504cf69f1a880376ebda6d',
 'c227d1f051c34d4abb6e5a24ae0e6004',
 'a868792f884144e8bc07970178854933',
 '6f14612615744e12b5642b34f997fef8',
 '1a4f0c85b24e4b9eb4b796ba71b57dd8',
 '66b2af4a8ea446d99f6dfd4a76314fd6',
 '69b9994a5b154165b41bc4e8d4758d0d',
 'd30a3231dace400e809a5f733c771612',
 'ecb9bdfb73244167b0935b572ab8a791',
 'c83f2d13c7d84f98ad45ba951fc68eba',
 '412446ecea104e019588961c1308254e',
 '5c3263f3cc994e52abd8624f68b4bb3c',
 '38b91d74550d4661a155277cc5d4cd28',
 '979992aa7c1746bf81b463c8ca791ba1',
 'bf653522aa5c4077a9d464fc1569d0d7',
 '897987a5fbcc44a89f79aaacff5d2510',
 '0133f34f23964a49a44fe0427d801126',
 '7531af50a4cc4cb2965432a57fbd7623',
 '48d34c49d7c44f768587442892283ccc',
 'a49e9b39780449cb91a5baf5b6fbff2c',
 

#### 🔍 5. Basic Semantic Retrieval

In [24]:
retriever = vector_store.as_retriever()
results = retriever.get_relevant_documents("Krishna is the Source of All Incarnations")
print("Sample result:\n", results[0].page_content)


  results = retriever.get_relevant_documents("Krishna is the Source of All Incarnations")


Sample result:
 Canto 1     9
strictest austerity, the best quality, the supreme 
dharma and the ultimate goal of life. ( 3 0 )  F r o m  
the beginning of the manifestation He, by His 
internal potency, has been the cause and effect of 
all forms and the transcendental Absolute of the 
modes of nature. (31) Although He, manifesting 
by the modes, having entered them appears to 
be affected by the modes, He is the full 
manifestation of all wisdom. (32) He, as the 
Supersoul, pervades all living beings as the source 
of creation like ﬁre does in wood and shines forth 
as different living entities, at the same time being 
the Absolute Person. (33) That Supersoul created 
the subtle senses in ﬂuenced by the modes of 
nature by entering the living beings in His own 
creation, causing them to enjoy those modes. (34) 
Thus He maintains all of them in the mode of 
goodness, being incarnated Himself in the 
performance of His  pastimes as the master of all 
the worlds of the divine, human and

#### 📌 6. Maximal Marginal Relevance (MMR) Retrieval

###### ✅ When to use: 
###### BM25: If you're doing basic keyword search (e.g., FAQs, simple search engines). 
###### MMR: If you use LLMs or semantic embeddings, and want more informative, diverse answers.

In [33]:
# # --- MMR Re-ranking ---
query = " When you feel angry which chapter to read? "
mmr_retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "lambda_mult": 0.5}
)

start = time.time()
retrieved_docs = mmr_retriever.get_relevant_documents(query)
print(f"\nMMR Retrieval Time: {time.time() - start:.2f}s")

print("\nMMR Top documents:")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"Doc {i}:", doc.page_content[:300], "...")  # print first 300 chars



MMR Retrieval Time: 5.01s

MMR Top documents:
Doc 1: References                                                                                                                                              15 
 
 
 
Contents 
Introduction - Background and Battleground                  ............. 1 
Chapter 1   - Arjuna’s Distress                     ...
Doc 2: Chapter 3                                                                                                                                         105 
 
Why Do People Sin? AOw©Z CdmM Ÿ& 
AW Ho$Z à`wº$mo@`§ nmn§ Ma{V nyéf… Ÿ& 
A{ZÀN>Þ{n dmîU}` ~bm{Xd {Z`mo{OV… Ÿ&& 36&& 
 
Arjuna said: O Krishna (from ...
Doc 3: Chapter 2                                                                                                                                      75 
  
intellect -> destruction. Forgetfulness could be of two types. One would be 
forgetting knowledge about avoiding activities prohibited in the scriptur ...
Doc 4: This chapt

#### 🤖 7. RetrievalQA Chain

#####  Langchain’s RetrievalQA class to quickly create a question-answering chain that uses:  A retriever to fetch relevant documents from a knowledge source (like your vector store). 'A language model chain to generate answers based on the retrieved documents.

In [26]:
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    retriever=vector_store.as_retriever()
)

response = qa.run("What is Vedic knowledge?")
print("\n[QA Answer]:", response)


  llm=OpenAI(),
  response = qa.run("What is Vedic knowledge?")



[QA Answer]:  Vedic knowledge refers to the knowledge and teachings found in the Vedas, which are ancient Hindu scriptures. These teachings contain information about various deities and spiritual practices, and are considered to be the means to gain knowledge about the Lord Almighty and attain eternal bliss. Vedic knowledge is divided into two types: ordinary knowledge (jnana), which is gained from studying scriptures, and wisdom or direct knowledge (vijnana), which is specialized knowledge about the Lord that can be attained after self-realization.


In [37]:
# ! pip install openai==0.28

#### 🧾 7. Manual Prompting with LLM

In [43]:
# --- Define prompt template ---
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

prompt_template = """
You are the Bhagavad Geeta assistant who answers questions based on the following documents.

Context:
{context}

Question:
{question}

Answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
formatted_prompt = prompt.format(context=context_text, question=query)

llm = OpenAI()

# Use generate method (note it takes a list of prompts)
result = llm.generate([formatted_prompt])

# Access the generated text:
response = result.generations[0][0].text

print("\n--- LLM Generated Answer ---")
print(response)



--- LLM Generated Answer ---
Chapter 2 would be most beneficial to read when feeling angry. This chapter discusses the causes of anger and how to control it through self-control and peace of mind. It also talks about the importance of controlling emotions like desire and aversion, which can lead to sinful activities. By understanding and practicing the teachings in this chapter, one can learn to manage their anger and overcome it.


#### 🤖 8. RetrievalQA Chain

#####  Langchain’s RetrievalQA class to quickly create a question-answering chain that uses:  A retriever to fetch relevant documents from a knowledge source (like your vector store). 'A language model chain to generate answers based on the retrieved documents.

In [None]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever()
)

response = qa.run("What is Vedic knowledge?")
print("\n[QA Answer]:", response)


  llm=OpenAI(),
  response = qa.run("What is Vedic knowledge?")



[QA Answer]:  Vedic knowledge refers to the knowledge and teachings found in the Vedas, which are ancient Hindu scriptures. These teachings contain information about various deities and spiritual practices, and are considered to be the means to gain knowledge about the Lord Almighty and attain eternal bliss. Vedic knowledge is divided into two types: ordinary knowledge (jnana), which is gained from studying scriptures, and wisdom or direct knowledge (vijnana), which is specialized knowledge about the Lord that can be attained after self-realization.


#### 🔍 9. Similarity Search Example

In [45]:
results = vector_store.similarity_search(
    "Arjuna is in a state of distress and confusion about the upcoming battle, particularly due to the prospect of fighting his own relatives and friends", 
    k=2
)

for res in results:
    print(f"* {res.page_content}\n→ {res.metadata}")


* Arjuna requests Krishna 
to take the chariot to the middle of the battleground so that he can have a 
good look at both the armies. He recognizes his teachers, cousins, nephews, 
uncles, grand uncles, children, friends etc., and gets very depressed. He is 
distressed at the very thought of waging a war where he himself might kill 
many of his near and dear ones. Arjuna tells Krishna that his body is
→ {'producer': 'doPDF Ver 7.2 Build 376 (Windows 7 Business Edition - Version: 6.1.7600 (x86))', 'creator': 'Adobe Acrobat 8.0 Combine Files', 'creationdate': '2023-09-28T13:07:43+05:30', 'moddate': '2023-09-28T13:07:43+05:30', 'source': 'C:\\Suryavi\\GenAI\\KRISH_NAIK\\AGENTIC_AI_2_0\\Assignments\\RAG\\data\\input\\The Bhagavad Gita.pdf', 'total_pages': 447, 'page': 50, 'page_label': '51'}
* He was further seeing bad omens and felt 
there was nothing good coming out of this war. He says he does not crave 
for victory, or a kingdom won after losing so many near and dear ones 
(1.20-1.39).

#### 🔍 9. Similarity Search Score

In [46]:
results_with_scores = vector_store.similarity_search_with_score(
        "Arjuna is in a state of distress and confusion about the upcoming battle, particularly due to the prospect of fighting his own relatives and friends", 
         k=2
)

for i, (doc, score) in enumerate(results_with_scores, start=1):
    print(f"Result {i}:")
    print(f"Score (Similarity): {score:.4f}")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("-" * 50)


Result 1:
Score (Similarity): 0.8674
Content: Arjuna requests Krishna 
to take the chariot to the middle of the battleground so that he can have a 
good look at both the armies. He recognizes his teachers, cousins, nephews, 
uncles, grand uncles, children, friends etc., and gets very depressed. He is 
distressed at the very thought of waging a war where he himself might kill 
many of his near and dear ones. Arjuna tells Krishna that his body is
Metadata: {'producer': 'doPDF Ver 7.2 Build 376 (Windows 7 Business Edition - Version: 6.1.7600 (x86))', 'creator': 'Adobe Acrobat 8.0 Combine Files', 'creationdate': '2023-09-28T13:07:43+05:30', 'moddate': '2023-09-28T13:07:43+05:30', 'source': 'C:\\Suryavi\\GenAI\\KRISH_NAIK\\AGENTIC_AI_2_0\\Assignments\\RAG\\data\\input\\The Bhagavad Gita.pdf', 'total_pages': 447, 'page': 50, 'page_label': '51'}
--------------------------------------------------
Result 2:
Score (Similarity): 0.8365
Content: He was further seeing bad omens and felt 
there was 

In [None]:
from langchain_core.documents import Document
doc = docx.Document()
doc.add_heading("RAG Output", 0)
doc.add_paragraph(result)
doc.save("rag_output.docx")