In [1]:
import os

In [None]:
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

In [None]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_PROJECT"] = "RAG"

In [5]:
os.environ["GOOGLE_API_KEY"] = "AIzaSyCDT4PKxR1HbclHvSHzbNBUp_QoMvXh00s"

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001"
)


In [12]:
from langchain_google_genai import ChatGoogleGenerativeAI
model = ChatGoogleGenerativeAI(model = "gemini-1.5-flash", convert_system_message_to_human=True)

In [13]:
print(model.invoke("hi").content)

Hi there! How can I help you today?


In [14]:
from langchain.chains import create_retrieval_chain

In [15]:
from langchain.chains.combine_documents import create_stuff_documents_chain

In [16]:
from langchain_chroma import Chroma

In [17]:
from langchain_core.prompts import ChatPromptTemplate

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [19]:
from langchain_core.prompts import MessagesPlaceholder

In [20]:
!pip install PyPDF

Collecting PyPDF
  Downloading pypdf-5.6.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.6.0-py3-none-any.whl (304 kB)
Installing collected packages: PyPDF
Successfully installed PyPDF-5.6.0


In [23]:
from PyPDF2 import PdfReader

In [33]:
import os
from pypdf import PdfReader

def load_pdfs_from_folder(folder_path):
    all_texts = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            reader = PdfReader(pdf_path)

            full_text = ""
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    full_text += text
            all_texts.append(full_text)

    return all_texts


In [41]:
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import os

# Load multiple PDFs
def load_pdfs(folder):
    texts = []
    for file in os.listdir(folder):
        if file.endswith(".pdf"):
            reader = PdfReader(os.path.join(folder, file))
            full_text = ""
            for page in reader.pages:
                if page_text := page.extract_text():
                    full_text += page_text
            texts.append(full_text)
    return texts

# Convert to Document objects
def create_documents_from_texts(texts):
    return [
        Document(page_content=text, metadata={"source": f"doc_{i}"})
        for i, text in enumerate(texts)
    ]

# Chunking
folder_path = "pdf_docs"
texts = load_pdfs(folder_path)
docs = create_documents_from_texts(texts)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)


In [42]:
splits

[Document(metadata={'source': 'doc_0'}, page_content='2025 Indian Premier League\nDates 22 March – 3 June 2025\nAdministrator(s) Board of Control for\nCricket in India\nCricket format Twenty20\nTournament\nformat(s)\nGroup stage and playoffs\nChampions Royal Challengers\nBengaluru (1st title)\nRunners-up Punjab Kings\nParticipants 10\nMatches 74\nMost valuable\nplayer\nSuryakumar Yadav\n(Mumbai Indians)\nMost runs Sai Sudharsan (Gujarat\nTitans) (759)\nMost wickets Prasidh Krishna (Gujarat\nTitans) (25)\nOfﬁcial website iplt20.com (http://iplt20.co\nm)\n2025 Indian Premier League\nThe 2025 Indian Premier League, also known as IPL 18 and\nbranded as TATA IPL 2025, was the 18th edition of the Indian Premier\nLeague. The tournament featured 10 teams competing in 74 matches. It\nbegan on 22 March and was held across 13 venues before being\nsuspended on 9 May due to the 2025 India and Pakistan crisis. The\nmatches resumed from 17 May across six venues, and the final was\nrescheduled from 25

In [43]:
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=gemini_embeddings,
)

In [44]:
retriever = vectorstore.as_retriever()

In [45]:
retriever

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x12a922710>, search_kwargs={})

In [46]:
system_prompt = (
    "You are an assistant for question answering tasks. "
    "Use the following pieces of retrieved context to answer the question "
    "If you don't know the answer, say that you don't know."
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

In [47]:
chat_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [48]:
question_answering_chain = create_stuff_documents_chain(model, chat_prompt)

In [49]:
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [50]:
rag_chain.invoke({"input": "how much runs did virat scored in ipl 2025?"})

{'input': 'how much runs did virat scored in ipl 2025?',
 'context': [Document(id='415532cc-003c-4fd2-8055-40640b03ee94', metadata={'source': 'doc_0'}, page_content='Umpires: Virender Sharma (Ind) and\nKannur Swaroopanand (Ind)\nLucknow Super Giants won the toss and elected to ﬁeld.\nCorbin Bosch (Mumbai Indians) made his IPL debut.[70]\nRoyal Challengers Bengaluru won the toss and elected to ﬁeld.\nJacob Bethell (Royal Challengers Bengaluru) made his IPL debut.[71]\nRajasthan Royals won the toss and elected to ﬁeld.\nKarim Janat (Gujarat Titans) made his IPL debut.[72]\nVaibhav Suryavanshi (Rajasthan Royals) became the youngest player to score a century in IPL history.[73]\nDelhi Capitals won the toss and elected to ﬁeld.\nPunjab Kings won the toss and elected to ﬁeld.\nYuzvendra Chahal (Punjab Kings) took his second hat-trick in IPL.[74]\nChennai Super Kings were eliminated as a result of this match.[75]\nRajasthan Royals won the toss and elected to ﬁeld.\nRajasthan Royals were elimi

In [51]:
rag_chain.invoke({"input": "what is big bang theory?"})

{'input': 'what is big bang theory?',
 'context': [Document(id='de7345f9-009e-4c94-a63b-4199b088cbb0', metadata={'source': 'doc_1'}, page_content="observations necessary for infrared,\nultraviolet, gamma-ray, and X-ray astronomy.\nPhysical cosmology is the study of the formation and evolution of the universe on its largest scales.\nAlbert Einstein's theory of relativity plays a central role in all modern cosmological theories. In the\nearly 20th century, Hubble's discovery that the universe is expanding, as shown by the Hubble\ndiagram, prompted rival explanations known as the steady state universe and the Big Bang.\nAstrophysics\n06/06/2025, 20:10 Physics - Wikipedia\nhttps://en.wikipedia.org/wiki/Physics 13/26The Big Bang was confirmed by the success of Big Bang nucleosynthesis and the discovery of the\ncosmic microwave background in 1964. The Big Bang model rests on two theoretical pillars: Albert\nEinstein's general relativity and the cosmological principle. Cosmologists have recen

In [52]:
rag_chain.invoke({"input": "provide 4 questions of general knowledge with their answers"})

{'input': 'provide 4 questions of general knowledge with their answers',
 'context': [Document(id='1c84439a-c0f5-459b-8b72-be162ce7686c', metadata={'source': 'doc_2'}, page_content='and Inheritance”?         \n   Former US President Barrack Obama \n120) Who is the present Central Minister of External Affairs?  \n   Smt. Sushma Swaraj \n121) When is the World Literacy Day Celebrated?    \n   8th September \n122) ‘A Forgotten Empire’ written by the Historian Sewell refers to \nWhich Empire.          \n   Vijayanagara Empire \n123) Which is the Highest Peak in India?      \n   Godwin Austin \n124) What is the expansion of BARC?      \n   Bhabha Atomic Research Centre \n125) When is the National Sports Day Celebrated?    \n   29th August \n126) Which is the National Sport of China?     \n   Table Tennis \n127) What is the name of playing area of Baseball?    \n   Diamond (or Baseball Park) \n128) Which City is known as the “Garden City”?    \n   Bangalore 129) Where is the Vijaya Vittala T

In [53]:
mcq_prompt = """
You are an expert question generator.

Given the following context, generate {num_questions} multiple-choice questions (MCQs). Follow these rules:
- Each question should be directly based on the context.
- Provide 4 options labeled A to D.
- Clearly state the correct answer with "Answer: <option letter>".
- Keep questions clear, factual, and non-repetitive.

### Context:
{context}

### Output format:
1. <Question text>
A. <Option A>
B. <Option B>
C. <Option C>
D. <Option D>
Answer: <Correct Option>

Generate the questions now.
"""


In [67]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

prompt = PromptTemplate(
    input_variables=["context", "num_questions"],
    template=mcq_prompt
)

llm_chain = LLMChain(prompt=prompt, llm=model)

In [68]:
context_text = "about ipl 2025"

response = llm_chain.invoke({
    "context": context_text,
    "num_questions": 5,
})

In [74]:
print(response["text"])

Since the provided context "about ipl 2025" is extremely vague, I will have to make assumptions to create meaningful multiple-choice questions.  These questions will focus on plausible aspects of a future IPL season, acknowledging that the actual details are unknown.

1.  Which of the following is MOST likely to be a significant change in the IPL 2025 season compared to previous seasons?
    A. A reduction in the number of teams participating.
    B. The introduction of a new innovative rule change impacting gameplay.
    C. A significant decrease in the overall prize money.
    D. The elimination of the playoffs entirely.
    Answer: B

2.  Considering recent trends, which aspect of IPL 2025 is likely to experience the most substantial growth?
    A.  Attendance at stadium matches.
    B.  Viewership on traditional television.
    C.  Digital viewership and engagement through streaming platforms.
    D.  Sponsorship revenue from local businesses.
    Answer: C

3.  What is a plausible

In [75]:
context_text = "about black hole"

response = llm_chain.invoke({
    "context": context_text,
    "num_questions": 3,
})

In [76]:
print(response["text"])

1. What is a black hole?
A. A type of star that is extremely hot and bright.
B. A region of spacetime where gravity is so strong that nothing, not even light, can escape.
C. A massive collection of gas and dust in space.
D. A collapsed star that emits powerful bursts of radiation.
Answer: B

2.  Which of the following properties is primarily responsible for the immense gravitational pull of a black hole?
A. Its extremely high temperature.
B. Its rapid rotation.
C. Its immense density.
D. Its strong magnetic field.
Answer: C

3. What is the point of no return for a black hole called?
A. The event horizon.
B. The singularity.
C. The accretion disk.
D. The Schwarzschild radius.
Answer: A
