# QA Bot for PDF Files

In [1]:
!pip install -qU openai
!pip install -qU langchain
!pip install -qU langchain-pinecone
!pip install -qU pypdf
!pip install -qU langchain-google-genai
!pip install -qU langchain-openai

In [2]:
# Import necessary libraries
import time
import langchain
import os
from langchain import OpenAI
from langchain_pinecone import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQAWithSourcesChain
from google.colab import userdata
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings

In [3]:
pinecone_api_key = userdata.get("PINECONE_API_KEY")
gemini_api_key =  userdata.get("GOOGLE_API_KEY")
inference_api_key =  userdata.get("HF_API_KEY")
openai_api_key =  userdata.get("OPENAI_API_KEY")
index_name = "nagp"
os.environ["GOOGLE_API_KEY"] = gemini_api_key
os.environ["PINECONE_API_KEY"] = pinecone_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key

In [4]:
llm = OpenAI(temperature=0.9, max_tokens=500)

  warn_deprecated(


In [5]:
#Initialize open ai embeddings
embeddings = OpenAIEmbeddings()

In [6]:
# Initialize Conversation Buffer Window Memory
memory = ConversationBufferWindowMemory(window_size=5)

In [7]:
# Function to extract text from PDF
def extract_pages_from_pdf(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()
    return pages

In [8]:
# Function to chunk text using RecursiveCharacterTextSplitter
def chunk_text(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(text)
    return chunks

In [9]:
# Main workflow
# Extract text from a sample PDF
pdf_text = extract_pages_from_pdf('AssignmentSupportDocument.pdf')

# Chunk the extracted text
text_chunks = chunk_text(pdf_text)

In [10]:
docsearch = Pinecone.from_documents(text_chunks, embeddings, index_name=index_name)
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=docsearch.as_retriever(), memory=memory)
chain.memory.output_key = "answer"
#langchain.debug=True

In [11]:
query = "Direct taxes?"
docs = docsearch.similarity_search(query)
print(docs[0])

page_content='25 
 Part B  
Hon’ble Speaker Sir,  
Direct taxes  
87. Over the last ten years, the direct tax collections have 
more than trebled and the return filers swelled to 2.4 times.  
I would like to assure the taxpayers that their contributions have 
been used wisely for the development of the country and 
welfare of its people. I appreciate the tax payers for their 
support.  
88. The Government has reduced and rationalized tax rates. 
Under the new tax scheme, there  is now no tax liability for tax 
payers with income up to ₹ 7 lakh, up from ₹ 2.2 lakh in the 
financial year 2013 -14. The threshold for presumptive taxation 
for retail businesses was increased from ₹ 2 crore to ₹ 3 crore. 
Similarly, the threshold for professionals eligible for presumptive 
taxation was increased from ₹ 50 lakh to  ₹ 75 Lakh.  Also, 
corporate tax rate was decreased from 30 per cent to 22 per cent 
for existing domestic companies and to 15 per cent for certain 
new manufacturing companies.' m

In [12]:
query = "Direct taxes"
chain.invoke({"question": query}, return_only_outputs=True)
# chain.invoke({"question": query, "metadata":"{source:'AssignmentSupportDocument.pdf'}"}, return_only_outputs=True)

{'answer': ' The government has reduced and rationalized tax rates, specifically decreasing the corporate tax rate from 30 percent to 22 percent.',
 'sources': ''}

In [13]:
query = "Green Energy"
chain.invoke(query, return_only_outputs=True)

{'answer': ' The potential for offshore wind energy is being considered and the government plans to support the expansion and infrastructure for green energy vehicles. \n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [14]:
query = "Coal gasification"
chain.invoke(query, return_only_outputs=True)

{'answer': ' India plans to set up 100 MT of coal gasification and liquefaction capacity by 2030, aiming to lower imports of natural gas, methanol, and ammonia. \n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [15]:
query = "how much corporate tax has decreased"
chain.invoke(query, return_only_outputs=True)

{'answer': ' The corporate tax rate was decreased from 30% to 22% for existing domestic companies and to 15% for certain new manufacturing companies.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [16]:
query = "Direct taxes collections have increased over the last how many years"
chain.invoke(query, return_only_outputs=True)

{'answer': ' Direct taxes collections have increased over the last 6 years.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [17]:
query = "The corporate tax rate has decreased from"
chain.invoke(query, return_only_outputs=True)

{'answer': ' The corporate tax rate has decreased from 30% to 22% for existing domestic companies and to 15% for certain new manufacturing companies.\n',
 'sources': 'AssignmentSupportDocument.pdf'}

In [18]:
query = "how much coal gasification and liquefaction capacity will we reach"
chain.invoke(query, return_only_outputs=True)

{'answer': ' It is mentioned that a coal gasification and liquefaction capacity of 100 MT will be set up by 2030. However, it is not specified how much capacity will be reached in total. \n',
 'sources': 'AssignmentSupportDocument.pdf'}