In [8]:
from langchain_community.document_loaders import PyPDFLoader

pdf_loader = PyPDFLoader("annualReport.pdf")
pdf_doc = pdf_loader.load()

In [9]:
len(pdf_doc)

330

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100
)
splitted_pdf_doc = text_splitter.split_documents(pdf_doc)

In [11]:
len(splitted_pdf_doc)

1267

In [24]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings

load_dotenv()
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

embedding = HuggingFaceEmbeddings(model="all-MiniLM-L6-v2")

In [30]:
len(embedding.embed_query("Hello Mister"))

384

In [51]:
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain import hub
import pprint
from uuid import uuid4
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import os
load_dotenv()

os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')

os.environ['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')

pc = Pinecone()

In [55]:
index_name = "nazaraannualreport-cosine"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,
        metric = "cosine",
        spec = ServerlessSpec(cloud = "aws", region = "us-east-1")
    )

## Loading the Index
index = pc.Index("nazaraannualreport-cosine")

## Creating Vector Store
vector_store = PineconeVectorStore(index=index, embedding=embedding)

## Universal Identification number
uuids = [str(uuid4()) for _ in range(len(splitted_pdf_doc))]

vector_store.add_documents(documents=splitted_pdf_doc, ids=uuids)

['9005c9ac-7ab6-4509-bb75-485c910e5661',
 '4f8655dc-7743-41d0-8098-54c9fc9d6480',
 '9dab2c5a-2d34-44a0-b77d-cc2f7ab82c5f',
 '7482c519-d212-455b-8e50-a38951637d06',
 'a177721d-dda9-45ca-8538-f84022fa647d',
 'bc6995f9-4d6a-4943-ad35-4dba9f86e672',
 '4ad2324e-7ffc-4f72-a0e8-8fb6640941f1',
 'c1b8be11-e084-4266-8c8a-c6ec25664fdd',
 'e729dbf0-e30c-42d3-a4d0-143c2eebbbb3',
 '61e32352-644d-4aac-b41b-63db6fb985fa',
 '990f29ba-1d34-47bd-b4b0-a045e1820714',
 'b80ab35e-a55a-41e8-a47c-84136bc750d7',
 'f0b5bc47-867b-404e-8b65-670af014ba25',
 '6a9084d7-1646-4d86-8b18-a8f0e2a9028f',
 '4d0063df-ab91-4234-8e1c-38e83843cfed',
 '4bacb41f-f5ee-4814-93fc-66ebe30ccbe2',
 '8359102c-e541-4536-8a07-8177e710d8e4',
 '500ab4bb-527d-4e20-965c-e95b50e4bf75',
 '51cd4896-1551-46c3-986e-fd2d17be0316',
 'dd40d216-1e8f-441c-bebe-23db0d17e24b',
 'db3a6639-7c3a-4f90-b51e-00241a67eb7f',
 '112b3c97-14c9-44c1-937f-2e91cad81444',
 '77829679-57ba-433c-b48a-4221cbe9e418',
 'f797b6fb-13eb-48a2-93b0-e634b8f7a897',
 '3ec7ecd5-6d33-

In [56]:
retriever=vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5}
)

model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
    
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

rag_chain.invoke("What was the revenue for Kiddopia in year 2023 and 2024?")



"Kiddopia's revenue was ₹219.4 crores in FY 2023-24 and ₹220.6 crores in FY 2022-23.  This represents relatively flat revenue growth year-over-year."

In [57]:
index_name = "nazaraannualreport-euclidean"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,
        metric = "euclidean",
        spec = ServerlessSpec(cloud = "aws", region = "us-east-1")
    )

## Loading the Index
index = pc.Index("nazaraannualreport-euclidean")

## Creating Vector Store
vector_store = PineconeVectorStore(index=index, embedding=embedding)

## Universal Identification number
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(splitted_pdf_doc))]

vector_store.add_documents(documents=splitted_pdf_doc, ids=uuids)

['2ebecc03-873c-46d8-bdae-fdaaddd5c600',
 'abc751b6-dd7f-4eda-9d54-b7a6597b27ad',
 'e5c7c115-03b2-4ffe-af9d-52f71208b222',
 'b27aa027-c555-4635-a96d-5f16edffb67d',
 '61b0d710-4f01-466f-88b7-4c0842aebe25',
 '4ad19750-f6d8-48b8-9442-7b29cc92e7d5',
 '11210fa0-9482-410e-9d8d-480c9ae05c87',
 '2b5ef45b-fa1a-41e2-ac5d-650ec7079f84',
 '3d64ad3f-b5b9-4aca-ae4a-a35a297bbbfe',
 'e0989744-45ea-430b-b177-de2eef38907c',
 '4f64e23d-652a-44f2-bb5e-81d71dd95844',
 '86d0a628-a475-4950-a501-029682c92e7b',
 'd6d20145-768c-4303-90f5-1f9cac5179e0',
 'a6d862d3-41b5-410f-9450-c4236f06856a',
 '7c9c2786-f3e8-4f52-afb5-7cc18073b2e7',
 'a32489d4-20b4-4db6-99b4-12d6a4259e98',
 '7ddfdb2b-62df-4164-be2c-61a06b6edcd3',
 'a5dd8cb2-ee19-4db6-8a1d-acb3d2f9e8d1',
 '84a913e0-1feb-4720-b146-10b3b0a5c014',
 'b690f454-1f8f-44a1-a0c0-9a8aaaa4d32a',
 '4fb5ab22-eb05-4f0d-86cf-46d23297ff52',
 '42736479-64ce-4947-b2ac-46bb91b8c6dd',
 '8aac30e3-776b-41b7-b6cd-14d3f223cf15',
 'a7d4b646-6408-4295-ab3c-1a1d8ee0212c',
 '5ad7e46a-1529-

In [58]:
retriever=vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5}
)

model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
    
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

rag_chain.invoke("What was the revenue for Kiddopia in year 2023 and 2024?")



"Kiddopia's revenue was ₹219.4 crores in FY 2023-24 and ₹220.6 crores in FY 2022-23.  This represents relatively flat revenue growth year-over-year."

In [60]:
index_name = "nazaraannualreport-dotproduct"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,
        metric = "dotproduct",
        spec = ServerlessSpec(cloud = "aws", region = "us-east-1")
    )

## Loading the Index
index = pc.Index("nazaraannualreport-dotproduct")

## Creating Vector Store
vector_store = PineconeVectorStore(index=index, embedding=embedding)

## Universal Identification number
uuids = [str(uuid4()) for _ in range(len(splitted_pdf_doc))]

vector_store.add_documents(documents=splitted_pdf_doc, ids=uuids)

['33a4e509-10aa-4c56-a44a-06e1de9e159e',
 '05349623-f3bd-4b8d-93bf-918f4824b0f5',
 'da17e92e-8c1d-4a61-8c63-bb7728b6b41d',
 '51f0f425-3d90-4d8d-8d05-2764ebdb2e3d',
 '4e304c72-2072-46bd-a34f-292cda17e266',
 '04c5dbb5-fc65-443e-a1a7-698828a1ba4b',
 '196de0c6-e328-4e28-b81e-6fa5a768259a',
 '5c861b72-57ee-437c-966f-4764c9ac2fbe',
 '16c25a68-ac03-4890-b950-bc4ac91f0a2b',
 '70f1ba2f-1b88-473e-8905-3d4f18eb119f',
 'ab4edcdc-9224-4264-9b27-be4f6a50ffb9',
 '0782dc05-2439-4340-babe-f61b357022ca',
 '974fac9d-1c23-4eb1-954c-9425018ad1d5',
 '79f4b9ed-b179-4eec-8953-f6ccfaf623ea',
 '93d7e0a2-b50c-4b7a-bbfc-d65e5e859cf5',
 '2dc3c3fd-4276-4f23-9343-67bccd6c23c7',
 'ccbe6948-8e0d-4ba7-b25a-780360c355b4',
 '0d3540f6-6bfd-4978-9dd8-de827f26b787',
 'ebf15a11-1526-4d8a-b1b8-bdd086644af0',
 '843f03c9-6fed-4a28-a81c-c13da5fd71c0',
 '6278d2b3-5939-49fa-b5f5-b0667edd81f1',
 '4f8df76f-f8a9-4958-84ce-390e4d338700',
 'a48ef6fe-5579-4f15-8b70-8555a8f9bdf5',
 '0d561942-26db-46d2-8ba2-7e4d0b835d4b',
 'c38fe352-e575-

In [61]:
retriever=vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5}
)

model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

prompt = hub.pull("rlm/rag-prompt")
pprint.pprint(prompt.messages)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
    
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

rag_chain.invoke("What was the revenue for Kiddopia in year 2023 and 2024?")



[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


"Kiddopia's revenue was ₹219.4 Crores in FY 2023-24 and ₹220.6 Crores in FY 2022-23.  This represents relatively flat revenue year-over-year."