In [1]:
import os

In [12]:
from dotenv import load_dotenv

load_dotenv()

True

In [13]:
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

In [14]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_PROJECT"] = "RAG"

In [15]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [16]:
# os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [17]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY
)


In [19]:
from langchain_google_genai import ChatGoogleGenerativeAI
model = ChatGoogleGenerativeAI(model = "gemini-1.5-flash", convert_system_message_to_human=True)

In [20]:
print(model.invoke("hi").content)

Hi there! How can I help you today?


In [21]:
from langchain.chains import create_retrieval_chain

In [22]:
from langchain.chains.combine_documents import create_stuff_documents_chain

In [24]:
from langchain_chroma import Chroma

In [25]:
from langchain_core.prompts import ChatPromptTemplate

In [26]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [27]:
from langchain_core.prompts import MessagesPlaceholder

In [28]:
from PyPDF2 import PdfReader

In [32]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import os

# Load multiple PDFs
def load_pdfs(folder):
    texts = []
    for file in os.listdir(folder):
        if file.endswith(".pdf"):
            reader = PdfReader(os.path.join(folder, file))
            full_text = ""
            for page in reader.pages:
                if page_text := page.extract_text():
                    full_text += page_text
            texts.append(full_text)
    return texts

# Convert to Document objects
def create_documents_from_texts(texts):
    return [
        Document(page_content=text, metadata={"source": f"doc_{i}"})
        for i, text in enumerate(texts)
    ]

# Chunking
folder_path = "../pdf_docs"
texts = load_pdfs(folder_path)
docs = create_documents_from_texts(texts)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)


In [56]:
splits[:1]

[Document(metadata={'source': 'doc_0'}, page_content='2025 Indian Premier League\nDates 22 March – 3 June 2025\nAdministrator(s)Board of Control for\nCricket in India\nCricket format Twenty20\nTournament\nformat(s)Group stage and playoffs\nChampions Royal Challengers\nBengaluru (1st title)\nRunners-up Punjab Kings\nParticipants 10\nMatches 74\nMost valuable\nplayerSuryakumar Yadav\n(Mumbai Indians)\nMost runs Sai Sudharsan (Gujarat\nTitans) (759)\nMost wickets Prasidh Krishna (Gujarat\nTitans) (25)\nOfﬁcial website iplt20.com (http://iplt20.co\nm)\n2025 Indian Premier League\nThe 2025 Indian Premier League , also known as IPL 18 and\nbranded as TATA IPL 2025 , was the 18th edition of the Indian Premier\nLeague . The tournament featured 10 teams  competing in 74 matches. It\nbegan on 22 March and was held across 13 venues before being\nsuspended on 9 May due to the 2025 India and Pakistan crisis . The\nmatches resumed from 17 May across six venues, and the final  was\nrescheduled from 2

In [34]:
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=gemini_embeddings,
)

In [35]:
retriever = vectorstore.as_retriever()

In [36]:
retriever

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1241783d0>, search_kwargs={})

In [37]:
system_prompt = (
    "You are an assistant for question answering tasks. "
    "Use the following pieces of retrieved context to answer the question "
    "If you don't know the answer, say that you don't know."
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

In [38]:
chat_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [39]:
question_answering_chain = create_stuff_documents_chain(model, chat_prompt)

In [40]:
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [45]:
rag_chain.invoke({"input": "how much runs did virat scored in final match of ipl 2025?"})["answer"]

'Virat Kohli scored 43 runs in the final match of IPL 2025.  Royal Challengers Bangalore won the match by 6 runs.  The match was played on June 3, 2025.'

In [46]:
rag_chain.invoke({"input": "what is big bang theory?"})["answer"]

"The Big Bang theory is a cosmological model for the universe's evolution.  It's supported by Big Bang nucleosynthesis and the discovery of the cosmic microwave background radiation.  The theory rests on Einstein's general relativity and the cosmological principle."

In [47]:
print(rag_chain.invoke({"input": "provide 4 questions of general knowledge with their answers"})["answer"])

Here are four general knowledge questions with their answers, based on the provided text:

1. **Question:** What is the name of the instrument used to measure the distance traveled in vehicles?  **Answer:** Odometer.
2. **Question:** Who wrote the book "Dreams from My Father: A Story of Race and Inheritance"? **Answer:** Former US President Barack Obama.
3. **Question:** When was penicillin discovered? **Answer:** In the year 1928.
4. **Question:** Which city is known as the "Windy City"? **Answer:** Chicago.


In [48]:
mcq_prompt = """
You are an expert question generator.

Given the following context, generate {num_questions} multiple-choice questions (MCQs). Follow these rules:
- Each question should be directly based on the context.
- Provide 4 options labeled A to D.
- Clearly state the correct answer with "Answer: <option letter>".
- Keep questions clear, factual, and non-repetitive.

### Context:
{context}

### Output format:
1. <Question text>
A. <Option A>
B. <Option B>
C. <Option C>
D. <Option D>
Answer: <Correct Option>

Generate the questions now.
"""


In [49]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

prompt = PromptTemplate(
    input_variables=["context", "num_questions"],
    template=mcq_prompt
)

llm_chain = LLMChain(prompt=prompt, llm=model)

  llm_chain = LLMChain(prompt=prompt, llm=model)


In [50]:
context_text = "about ipl 2025"

response = llm_chain.invoke({
    "context": context_text,
    "num_questions": 5,
})

In [51]:
print(response["text"])

Since the provided context "about ipl 2025" is extremely vague,  I will have to make assumptions to create meaningful multiple-choice questions.  These questions will focus on plausible aspects of an IPL 2025 season, acknowledging the lack of concrete information.

1. Which of the following is a MOST likely change to the IPL 2025 format compared to previous seasons?
A. A reduction in the number of teams.
B. The introduction of a new playoff system.
C. A significant shortening of the regular season.
D. The elimination of the auction process for player selection.
Answer: B

2.  Speculation suggests a potential increase in the IPL 2025 media rights revenue. What is the MOST likely driver for this increase?
A. A decrease in the overall viewership.
B. Reduced broadcasting costs.
C. Increased global interest and viewership.
D. A decline in sponsorship deals.
Answer: C

3.  Considering the usual player transfer dynamics, which of the following scenarios is MOST likely for IPL 2025?
A. All tea

In [52]:
context_text = "about black hole"

response = llm_chain.invoke({
    "context": context_text,
    "num_questions": 3,
})

In [53]:
print(response["text"])

1. What is a black hole?
A. A type of star that is extremely hot and bright.
B. A region of spacetime with gravity so strong that nothing, not even light, can escape.
C. A massive planet with a very dense atmosphere.
D. A large cloud of gas and dust in space.
Answer: B

2.  Which characteristic of a black hole prevents even light from escaping?
A. Its extremely low temperature.
B. Its intense magnetic field.
C. Its incredibly strong gravitational pull.
D. Its rapid rotation.
Answer: C

3. What is the point of no return for a black hole called?
A. Event horizon
B. Singularity
C. Schwarzschild radius
D. Accretion disk
Answer: A
