In [1]:
import os

from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

load_dotenv("secret/.env")

Python-dotenv could not parse statement starting at line 1


True

In [2]:
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [3]:
doc = read_doc("documents")
len(doc)

58

In [4]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return docs

In [5]:
documents = chunk_data(doc)
documents[:2]

[Document(metadata={'source': 'documents/budget_speech.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'),
 Document(metadata={'source': 'documents/budget_speech.pdf', 'page': 1}, page_content='')]

In [6]:
embeddings = OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7fc442d1dff0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7fc40b7f8b50>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [7]:
vectors = embeddings.embed_query("How are you ?")
len(vectors)

1536

In [8]:
# index = PineconeVectorStore.from_documents(doc, index_name=os.getenv("PINECONE_INDEX"), embedding=embeddings)
index = PineconeVectorStore(index_name=os.getenv("PINECONE_INDEX"), embedding=embeddings)

In [9]:
def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query, k=k)
    return matching_results

In [10]:
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.5)
chain = load_qa_chain(llm, chain_type="stuff")

In [11]:
def retrieve_answers(query):

    doc_search = retrieve_query(query)
    print(doc_search)
    response = chain.run(input_documents=doc_search, question=query)

    return response

In [12]:
our_query = "How much the agriculture target will be increased and by how many crores ?"
answer = retrieve_answers(our_query)
print(answer)

[Document(metadata={'page': 10.0, 'source': 'documents/budget_speech.pdf'}, page_content="7 \n \n \n farmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.   \n22. Now to make India a global hub for ' Shree Anna' , the Indian Institute \nof Millet Research, Hyderabad  will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to further enable activities of \nfishermen, fish vendors, and micro & small enterprises, improve value chain \nefficiencies, and expand the market. \nCooperation \n25. For farmers, especially small and marginal farmers, and other \nmarginalised sections, the government is promoting coo

  warn_deprecated(


 The agriculture credit target will be increased to ` 20 lakh crore.


In [13]:
our_query = "How is the agriculture doing ?"
answer = retrieve_answers(our_query)
print(answer)

[Document(metadata={'page': 10.0, 'source': 'documents/budget_speech.pdf'}, page_content="7 \n \n \n farmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.   \n22. Now to make India a global hub for ' Shree Anna' , the Indian Institute \nof Millet Research, Hyderabad  will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to further enable activities of \nfishermen, fish vendors, and micro & small enterprises, improve value chain \nefficiencies, and expand the market. \nCooperation \n25. For farmers, especially small and marginal farmers, and other \nmarginalised sections, the government is promoting coo