# Structure of this notebook
#### Load a PDF and convert it into chunks (text chunks)
#### Use OpenAIEmbeddings -> Will convert chunks into vectors
#### Store vectors in VectorSearchDB -> Apply Similarity Search so that we can search inside the document

In [27]:
import openai
import langchain
import os
from pinecone import Pinecone, ServerlessSpec

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.vectorstores import Pinecone as PC

from langchain.llms import OpenAI

In [19]:
# Load variables from a local .env file into the process environment so that
# later cells can read OPENAI_API_KEY and PINECONE_API_KEY via os.environ.
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
## Read the document
def read_doc(filename):
    """Load a PDF file through PyPDFLoader.

    Args:
        filename: Path of the PDF, handed unchanged to ``PyPDFLoader``.

    Returns:
        The value returned by the loader's ``load()`` call (the cell output
        in this notebook shows a list of ``Document`` objects carrying
        ``source`` and ``page`` metadata).
    """
    return PyPDFLoader(filename).load()

In [5]:
# Load the budget speech PDF; the path is relative to the current working directory.
docs = read_doc("budget_speech.pdf")
# NOTE(review): echoing `docs` prints every loaded page; consider `docs[:3]` to keep the output short.
docs

[Document(page_content='GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024', metadata={'source': 'budget_speech.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'budget_speech.pdf', 'page': 1}),
 Document(page_content=' \nCONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28 \n  \n  ', metadata={'source': 'budget_speech.pdf', 'page': 2}),
 Document(page_content='', metadata={'source': 'budget_speech.pdf', 'page': 3}),
 Document(page_content='1 \n I

In [6]:
# Number of Document objects returned by the loader (blank pages are included,
# as the empty page_content entries in the previous output show).
len(docs)

32

In [7]:
## Divide the docs into chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into smaller pieces with RecursiveCharacterTextSplitter.

    Args:
        docs: Documents to split, passed to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [8]:
# Split the loaded pages using chunk_data's defaults (chunk_size=800, chunk_overlap=50).
chunks=chunk_data(docs)
# NOTE(review): echoing `chunks` prints every chunk; consider `chunks[:3]` to keep the output short.
chunks

[Document(page_content='GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024', metadata={'source': 'budget_speech.pdf', 'page': 0}),
 Document(page_content='CONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28', metadata={'source': 'budget_speech.pdf', 'page': 2}),
 Document(page_content='1 \n Interim Budget 2024 -2025  \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1, 2024  \nHon’ble Speaker,  \n I present the Interim Budget for 2024 -25.  \nI

In [9]:
## OpenAI embedding client used to turn text into vectors.
## The key is read from the environment; a missing OPENAI_API_KEY raises KeyError here.
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])


  warn_deprecated(


In [10]:
# Sanity check: embed a sample query and show the vector together with its length.
vectors=embeddings.embed_query("How are you?")
# NOTE(review): this echoes the whole vector; `vectors[:5], len(vectors)` would be easier to read.
vectors, len(vectors)

([-0.016785908412158042,
  -0.012151270116836888,
  0.006627965687606043,
  -0.026018159342696565,
  -0.01616878054948239,
  0.01762520513661754,
  -0.011114493891931487,
  -0.0099234347700346,
  -0.018131250431046412,
  -0.010417137246439636,
  0.0278695466560138,
  0.0016508201293049156,
  -0.00733766375413787,
  -0.011651395765758651,
  0.007238923072592348,
  -0.015391197915142053,
  0.028363250063741408,
  -0.011830363367475229,
  0.013959458032731997,
  -0.0205997656070393,
  0.00252868606461311,
  0.006344086460993312,
  0.0009997490514024384,
  -0.008263357876457773,
  -0.01588490039154709,
  -0.007794339173455263,
  0.025117151322085612,
  -0.012404292764051323,
  0.02230304282936083,
  -0.02515417884483454,
  0.005609702758413819,
  0.0076955989575710265,
  -0.013169533511690397,
  0.004014424029647617,
  0.008757060352862809,
  -0.022290699080014423,
  0.004020595438659533,
  -0.01043565193913667,
  0.0203282291984504,
  -0.006337915051981395,
  0.0270302499315543,
  0.00125

In [25]:
# Vector search DB in Pinecone.
# The API key is taken from the environment (PINECONE_API_KEY); never paste it
# into the notebook.
# NOTE(review): an API key was previously committed in a commented-out block in
# this cell -- treat it as compromised and rotate it in the Pinecone console.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Name of the Pinecone index used by the cells that build and query the vector store.
index_name = "langchainvector"

In [39]:
# Build the LangChain vector store on top of the Pinecone index.
# PC.from_documents embeds and uploads every chunk each time it runs, so
# re-executing this cell used to insert the same chunks again (the retrieval
# output further down returns the identical CONTENTS chunk twice). Upload only
# when the index is empty; otherwise attach to what is already stored.
# NOTE(review): an index already polluted with duplicates must be cleared once
# (e.g. delete and recreate it) before this guard helps.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    index = PC.from_documents(chunks, embeddings, index_name=index_name)
else:
    index = PC.from_existing_index(index_name, embeddings)

In [31]:
# Cosine Similarity Retrieve Results from Vector DB
def retrieve_query(query, k=2):
    """Run a similarity search against the module-level ``index``.

    Args:
        query: Text to search for.
        k: Forwarded to ``similarity_search`` as ``k`` (defaults to 2).

    Returns:
        Whatever ``index.similarity_search(query, k=k)`` returns.
    """
    return index.similarity_search(query, k=k)

In [34]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI

In [36]:
# `OpenAI` here is the langchain_openai class: the import in the preceding cell
# rebinds the name first imported from langchain.llms at the top of the notebook.
llm=OpenAI(temperature=0.5)
# Question-answering chain built on that LLM.
# NOTE(review): "stuff" presumably places all retrieved documents into a single
# prompt -- confirm against the LangChain docs if input size becomes a concern.
chain=load_qa_chain(llm, chain_type="stuff")

In [43]:
## Search answers from Vector DB
def retrieve_answers(query):
    """Answer a question using documents retrieved from the vector store.

    Looks up matching documents with ``retrieve_query`` (default ``k``),
    prints them, then runs the module-level QA ``chain`` over them.

    Args:
        query: The question, used both for retrieval and as the chain's
            ``question`` input.

    Returns:
        The value returned by ``chain.run``.
    """
    matched_docs = retrieve_query(query)
    print(matched_docs)
    return chain.run(input_documents=matched_docs, question=query)

In [44]:
# Example question to exercise retrieval and the QA chain end to end.
our_query="What is this document about?"
# retrieve_answers also prints the retrieved documents before returning the answer.
answer=retrieve_answers(our_query)
print(answer)

[Document(page_content='CONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28', metadata={'page': 2.0, 'source': 'budget_speech.pdf'}), Document(page_content='CONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nR