In [2]:
import os

In [3]:
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment so the
# os.getenv() lookups in the following cells can find the API keys.
load_dotenv()

True

In [4]:
# Read the LangChain/LangSmith API key from the environment (None when unset).
# NOTE(review): no later visible cell uses this variable; the only line that
# references it is commented out.
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

In [5]:
# Tracing configuration, applied in a single step.
# The API key is deliberately not written here: it is expected to be present
# in the environment already.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "RAG",
    }
)

In [6]:
# Read the Google API key from the environment (None when unset); it is
# passed explicitly to the embedding client in a later cell.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [7]:
# os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [8]:
import warnings
# Silence every warning from this point on.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [9]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Gemini embedding client; a later cell hands it to the Chroma vector store.
gemini_embeddings = GoogleGenerativeAIEmbeddings(google_api_key=GOOGLE_API_KEY, model="models/embedding-001")


In [10]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model used for answer generation, created with the
# convert_system_message_to_human flag switched on.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    convert_system_message_to_human=True,
)

In [11]:
# Smoke test: one round trip to the chat model, printing only the reply text.
print(model.invoke("hi").content)

Hi there! How can I help you today?


In [12]:
from langchain.chains import create_retrieval_chain

In [13]:
from langchain.chains.combine_documents import create_stuff_documents_chain

In [14]:
from langchain_chroma import Chroma

In [15]:
from langchain_core.prompts import ChatPromptTemplate

In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [17]:
from langchain_core.prompts import MessagesPlaceholder

In [18]:
from PyPDF2 import PdfReader

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import os

# Load multiple PDFs
def load_pdfs(folder):
    """Extract the text of every PDF located directly inside ``folder``.

    Files are visited in sorted name order so that the returned list (and any
    positional ids derived from it) is stable across runs; ``os.listdir`` on
    its own gives no ordering guarantee. The ``.pdf`` suffix is matched
    case-insensitively, so ``REPORT.PDF`` is picked up as well.

    Args:
        folder: Path of the directory to scan. Sub-directories are not
            searched.

    Returns:
        A list with one string per PDF: the text of its pages joined by
        newlines. Pages that yield no text are skipped, and a PDF with no
        extractable text contributes an empty string.
    """
    texts = []
    for file in sorted(os.listdir(folder)):
        if not file.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(folder, file))
        pages = []
        for page in reader.pages:
            if page_text := page.extract_text():
                pages.append(page_text)
        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next page.
        texts.append("\n".join(pages))
    return texts

# Convert to Document objects
def create_documents_from_texts(texts):
    """Wrap each raw text in a ``Document`` tagged with a positional source id.

    Args:
        texts: Iterable of strings, one per source document.

    Returns:
        A list of ``Document`` objects in input order; the n-th one carries
        ``metadata={"source": "doc_<n>"}`` with n counted from zero.
    """
    documents = []
    for index, content in enumerate(texts):
        tagged = Document(page_content=content, metadata={"source": f"doc_{index}"})
        documents.append(tagged)
    return documents

# Chunking pipeline: read the PDFs, wrap each one as a Document, then split
# the Documents into overlapping pieces for indexing.
folder_path = "../pdf_docs"
texts = load_pdfs(folder_path)
docs = create_documents_from_texts(texts)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [54]:
# Show only the first chunk as a sanity check instead of dumping the whole list.
splits[:1]

[Document(metadata={'source': 'doc_0'}, page_content='2025 Indian Premier League\nDates 22 March – 3 June 2025\nAdministrator(s)Board of Control for\nCricket in India\nCricket format Twenty20\nTournament\nformat(s)Group stage and playoffs\nChampions Royal Challengers\nBengaluru (1st title)\nRunners-up Punjab Kings\nParticipants 10\nMatches 74\nMost valuable\nplayerSuryakumar Yadav\n(Mumbai Indians)\nMost runs Sai Sudharsan (Gujarat\nTitans) (759)\nMost wickets Prasidh Krishna (Gujarat\nTitans) (25)\nOfﬁcial website iplt20.com (http://iplt20.co\nm)\n2025 Indian Premier League\nThe 2025 Indian Premier League , also known as IPL 18 and\nbranded as TATA IPL 2025 , was the 18th edition of the Indian Premier\nLeague . The tournament featured 10 teams  competing in 74 matches. It\nbegan on 22 March and was held across 13 venues before being\nsuspended on 9 May due to the 2025 India and Pakistan crisis . The\nmatches resumed from 17 May across six venues, and the final  was\nrescheduled from 2

In [55]:
# Index the chunks in Chroma under deterministic ids. Without explicit ids,
# each run of this cell stores the chunks under new ids, so re-executing it in
# the same kernel can add a second copy of every chunk to the collection and
# crowd the retriever's results with duplicates.
# NOTE(review): relies on langchain_chroma upserting by id — confirm for the
# installed version. Chunks left over from a previously larger corpus are not
# removed by this.
chunk_ids = [
    f"{chunk.metadata.get('source', 'doc')}_chunk_{position}"
    for position, chunk in enumerate(splits)
]
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=gemini_embeddings,
    ids=chunk_ids,
)

In [56]:
# Expose the vector store as a retriever, with no search options overridden.
retriever = vectorstore.as_retriever()

In [57]:
# Display the retriever's repr to check how it is configured.
retriever

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x10f0e3700>, search_kwargs={})

In [58]:
# System instructions for the answering step. Adjacent string literals are
# concatenated verbatim, so each sentence carries its own closing period and
# trailing space. "{context}" is a template placeholder that is filled in
# when the prompt is rendered.
system_prompt = (
    "You are an assistant for question answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

In [59]:
# Two-message template: the system instructions first, then the user's
# question substituted for "{input}".
chat_prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])

In [60]:
# Combine the chat model and the prompt into a chain that answers from a set
# of documents supplied to it.
question_answering_chain = create_stuff_documents_chain(model, chat_prompt)

In [61]:
# Full RAG pipeline: the retriever feeds documents into the answering chain.
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [64]:
# Ask for the season's leading run-scorer; only the "answer" field of the
# result is displayed.
# NOTE(review): the recorded answer reports the information as missing even
# though the first chunk shown earlier contains a "Most runs" line — retrieval
# quality is worth checking.
rag_chain.invoke({"input": "who scored leading runs in whole ipl 2025?"})["answer"]

'The provided text mentions Priyansh Arya scoring 70 runs in one match and Ashutosh Sharma scoring 66* runs in another.  There is no information on who scored the most runs overall in IPL 2025.'

In [46]:
# Second sample query on a different topic; displays only the "answer" field.
rag_chain.invoke({"input": "what is big bang theory?"})["answer"]

"The Big Bang theory is a cosmological model for the universe's evolution.  It's supported by Big Bang nucleosynthesis and the discovery of the cosmic microwave background radiation.  The theory rests on Einstein's general relativity and the cosmological principle."

In [47]:
# Third sample query; print() is used so the multi-line answer is shown with
# real line breaks rather than as a quoted repr.
print(rag_chain.invoke({"input": "provide 4 questions of general knowledge with their answers"})["answer"])

Here are four general knowledge questions with their answers, based on the provided text:

1. **Question:** What is the name of the instrument used to measure the distance traveled in vehicles?  **Answer:** Odometer.
2. **Question:** Who wrote the book "Dreams from My Father: A Story of Race and Inheritance"? **Answer:** Former US President Barack Obama.
3. **Question:** When was penicillin discovered? **Answer:** In the year 1928.
4. **Question:** Which city is known as the "Windy City"? **Answer:** Chicago.
