# QA Bot for PDF Files

In [1]:
!pip install -qU openai
!pip install -qU langchain
!pip install -qU langchain-pinecone
!pip install -qU pypdf
!pip install -qU langchain-google-genai
!pip install -qU langchain-pinecone langchain-openai langchain

In [72]:
# Import necessary libraries
import time
import langchain
import os
from langchain import OpenAI
from langchain_pinecone import Pinecone
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from google.colab import userdata
from pinecone import ServerlessSpec
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

In [73]:
pinecone_api_key = userdata.get("PINECONE_API_KEY")
gemini_api_key =  userdata.get("GOOGLE_API_KEY")
inference_api_key =  userdata.get("HF_API_KEY")
openai_api_key =  userdata.get("OPENAI_API_KEY")
index_name = "nagp"
os.environ["GOOGLE_API_KEY"] = gemini_api_key
os.environ["PINECONE_API_KEY"] = pinecone_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key

In [74]:
llm = OpenAI(temperature=0.9, max_tokens=500)

In [75]:
#Initialize open ai embeddings
embeddings = OpenAIEmbeddings()

In [76]:
# Initialize Conversation Buffer Window Memory
memory = ConversationBufferWindowMemory(window_size=5)

In [77]:
# Function to extract text from PDF
def extract_pages_from_pdf(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()
    return pages

In [78]:
# Function to chunk text using RecursiveCharacterTextSplitter
def chunk_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(text)
    return chunks

In [79]:
# Main workflow
# Extract text from a sample PDF
pdf_text = extract_pages_from_pdf('AssignmentSupportDocument.pdf')

# Chunk the extracted text
text_chunks = chunk_text(pdf_text)

In [80]:
docsearch = Pinecone.from_documents(text_chunks, embeddings, index_name=index_name)
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=docsearch.as_retriever(), memory=memory)
chain.memory.output_key = "answer"
langchain.debug=True

In [81]:
query = "Direct taxes?"
docs = docsearch.similarity_search(query)
print(docs[0])

page_content='25 
 Part B  
Hon’ble Speaker Sir,  
Direct taxes  
87. Over the last ten years, the direct tax collections have 
more than trebled and the return filers swelled to 2.4 times.  
I would like to assure the taxpayers that their contributions have 
been used wisely for the development of the country and 
welfare of its people. I appreciate the tax payers for their 
support.  
88. The Government has reduced and rationalized tax rates. 
Under the new tax scheme, there  is now no tax liability for tax 
payers with income up to ₹ 7 lakh, up from ₹ 2.2 lakh in the 
financial year 2013 -14. The threshold for presumptive taxation 
for retail businesses was increased from ₹ 2 crore to ₹ 3 crore. 
Similarly, the threshold for professionals eligible for presumptive 
taxation was increased from ₹ 50 lakh to  ₹ 75 Lakh.  Also, 
corporate tax rate was decreased from 30 per cent to 22 per cent 
for existing domestic companies and to 15 per cent for certain 
new manufacturing companies.' m

In [82]:
query = "Direct taxes"
chain.invoke({"question": query}, return_only_outputs=True)
# chain.invoke({"question": query, "metadata":"{source:'AssignmentSupportDocument.pdf'}"}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Direct taxes",
  "history": ""
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "25 \n Part B  \nHon’ble Speaker Sir,  \nDirect taxes  \n87. Over the last ten years, the direct tax collections have \nmore than trebled and the return filers swelled to 2.4 times.  \nI would like to assure the taxpayers that their contributions have \nbeen used wisely for the development of the country and \nwelfare of its people. I appreciate the tax payers for their \nsupport.  \n88. The Government has reduced and rationalized tax rates. \nUnder the new tax scheme, there  is now no tax liab

{'answer': ' Direct taxes collections have increased over the last ten years and the government has used them for the development of the country and the welfare of its people. The corporate tax rate has decreased. In the last five years, there has been a focus on improving tax-payer services.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [83]:
query = "Green Energy"
chain.invoke(query, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Green Energy",
  "history": "Human: Direct taxes\nAI:  Direct taxes collections have increased over the last ten years and the government has used them for the development of the country and the welfare of its people. The corporate tax rate has decreased. In the last five years, there has been a focus on improving tax-payer services.\n"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "offshore wind energy potential for initial capacity of \none giga -watt.  \nb. Coal gasification and liquefaction capacity of 100 MT \nwill be set up by 2030. This will also help in reducing

{'answer': ' The mentioned portion of the document does not specifically mention green energy. \n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [84]:
query = "Coal gasification"
chain.invoke(query, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Coal gasification",
  "history": "Human: Direct taxes\nAI:  Direct taxes collections have increased over the last ten years and the government has used them for the development of the country and the welfare of its people. The corporate tax rate has decreased. In the last five years, there has been a focus on improving tax-payer services.\n\nHuman: Green Energy\nAI:  The mentioned portion of the document does not specifically mention green energy. \n"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "offshore wind energy potential for initial capacity of \none giga -watt. 

{'answer': ' By 2030, there are plans to set up a coal gasification and liquefaction capacity of 100 MT which will aid in decreasing the need for imports of natural gas, methanol, and ammonia.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [85]:
query = "how much corporate tax has decreased"
chain.invoke(query, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "how much corporate tax has decreased",
  "history": "Human: Direct taxes\nAI:  Direct taxes collections have increased over the last ten years and the government has used them for the development of the country and the welfare of its people. The corporate tax rate has decreased. In the last five years, there has been a focus on improving tax-payer services.\n\nHuman: Green Energy\nAI:  The mentioned portion of the document does not specifically mention green energy. \n\nHuman: Coal gasification\nAI:  By 2030, there are plans to set up a coal gasification and liquefaction capacity of 100 MT which will aid in decreasing the need for imports of natural gas, methanol, and ammonia.\n"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[cha

{'answer': ' The corporate tax rate has decreased from 30 per cent to 22 per cent and to 15 per cent for certain new manufacturing companies.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [86]:
query = "Direct taxes collections have increased over the last how many years"
chain.invoke(query, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Direct taxes collections have increased over the last how many years",
  "history": "Human: Direct taxes\nAI:  Direct taxes collections have increased over the last ten years and the government has used them for the development of the country and the welfare of its people. The corporate tax rate has decreased. In the last five years, there has been a focus on improving tax-payer services.\n\nHuman: Green Energy\nAI:  The mentioned portion of the document does not specifically mention green energy. \n\nHuman: Coal gasification\nAI:  By 2030, there are plans to set up a coal gasification and liquefaction capacity of 100 MT which will aid in decreasing the need for imports of natural gas, methanol, and ammonia.\n\nHuman: how much corporate tax has decreased\nAI:  The corporate tax rate has decreased from 30 per cent to 22 per cent and to 15 per cent for certain new ma

{'answer': ' The direct tax collections have increased over the last 5 years.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [87]:
query = "The corporate tax rate has decreased from"
chain.invoke(query, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "The corporate tax rate has decreased from",
  "history": "Human: Direct taxes\nAI:  Direct taxes collections have increased over the last ten years and the government has used them for the development of the country and the welfare of its people. The corporate tax rate has decreased. In the last five years, there has been a focus on improving tax-payer services.\n\nHuman: Green Energy\nAI:  The mentioned portion of the document does not specifically mention green energy. \n\nHuman: Coal gasification\nAI:  By 2030, there are plans to set up a coal gasification and liquefaction capacity of 100 MT which will aid in decreasing the need for imports of natural gas, methanol, and ammonia.\n\nHuman: how much corporate tax has decreased\nAI:  The corporate tax rate has decreased from 30 per cent to 22 per cent and to 15 per cent for certain new manufacturing companies.\n\nH

{'answer': ' The corporate tax rate has decreased from 30% to 22% for existing domestic companies and to 15% for certain new manufacturing companies.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [88]:
query = "how much coal gasification and liquefaction capacity will we reach"
chain.invoke(query, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "how much coal gasification and liquefaction capacity will we reach",
  "history": "Human: Green Energy\nAI:  The mentioned portion of the document does not specifically mention green energy. \n\nHuman: Coal gasification\nAI:  By 2030, there are plans to set up a coal gasification and liquefaction capacity of 100 MT which will aid in decreasing the need for imports of natural gas, methanol, and ammonia.\n\nHuman: how much corporate tax has decreased\nAI:  The corporate tax rate has decreased from 30 per cent to 22 per cent and to 15 per cent for certain new manufacturing companies.\n\nHuman: Direct taxes collections have increased over the last how many years\nAI:  The direct tax collections have increased over the last 5 years.\n\nHuman: The corporate tax rate has decreased from\nAI:  The corporate tax rate has decreased from 30% to 22% for existing domestic compan

{'answer': ' We will reach a coal gasification and liquefaction capacity of 100 MT by 2030.\n',
 'sources': 'AssignmentSupportDocument.pdf'}