In [None]:
#!pip install chromadb 
#!pip install langchain
#!pip install langchain-community
#!pip install langchain-google-genai
#!pip install google-generativeai
#!pip install langchain-pinecone
#!pip install pypdf
#!pip install python-dotenv



In [1]:
from langchain.prompts import PromptTemplate 
from langchain.vectorstores import Chroma,Pinecone   
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.document_loaders import TextLoader,PyPDFLoader 
from langchain.chains import VectorDBQA,RetrievalQA, LLMChain 
from langchain.retrievers.multi_query import MultiQueryRetriever 
from langchain_google_genai import ChatGoogleGenerativeAI 
from langchain_google_genai import GoogleGenerativeAIEmbeddings 
from langchain_pinecone import PineconeVectorStore

In [2]:
loader = PyPDFLoader("budget_speech.pdf")
documents = loader.load()
documents

[Document(metadata={'producer': 'Adobe PDF Library 24.2.159', 'creator': 'Acrobat PDFMaker 24 for Word', 'creationdate': '2024-07-23T00:18:17+05:30', 'author': 'hss', 'company': 'HCL Infosystems Limited', 'moddate': '2024-07-23T00:30:42+05:30', 'sourcemodified': 'D:20240722173931', 'source': 'budget_speech.pdf', 'total_pages': 62, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nJuly 23,  2024'),
 Document(metadata={'producer': 'Adobe PDF Library 24.2.159', 'creator': 'Acrobat PDFMaker 24 for Word', 'creationdate': '2024-07-23T00:18:17+05:30', 'author': 'hss', 'company': 'HCL Infosystems Limited', 'moddate': '2024-07-23T00:30:42+05:30', 'sourcemodified': 'D:20240722173931', 'source': 'budget_speech.pdf', 'total_pages': 62, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF Library 24.2.159', 'creator': 'Acrobat PDFMaker 24 for Word', 'creationdate': '2024-07-

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

In [4]:

# Access your API key
gemini_key = "AIzaSyDCnLmUblLdWOdo7Q_yXuDvOIiaKIW7QSA"
pinecone_key = "pcsk_2hmC5d_T6DYwfoumb6BqPrFuAXuA2CMfVD2TdoSeNxjt7Gr8hrB18rN2nEcz9BtqzrwTsc"


In [5]:
embeddings = GoogleGenerativeAIEmbeddings(
    model='models/embedding-001',
    google_api_key=gemini_key,
    task_type="retrieval_query"
)



from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory

safety_settings = {
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
                    }

chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=gemini_key,
    temperature=0.3,
    safety_settings=safety_settings
)

In [6]:
from langchain_pinecone import PineconeVectorStore
import os
os.environ['PINECONE_API_KEY'] = pinecone_key
index_name = "langchain-test-index-gemini"

In [7]:
vectordb = PineconeVectorStore.from_documents(
    documents=texts,
    embedding=embeddings,
    index_name=index_name
)

In [8]:
prompt_template = """
You are an intelligent assistant specialized in analyzing and summarizing Indian Budget speeches and financial documents.

## Instructions:
Use the **context** provided below to answer the **question** clearly and factually. Focus only on the content from the document — do not generate your own opinions or assumptions.

If the answer is **not found in the context**, respond with:
**"The answer is not available in the provided document."**

## Additional Guidelines:
- Avoid political bias or speculation
- Focus on budgetary figures, schemes, reforms, and policy announcements
- Do not infer or hallucinate numbers or statements

---

### Context:
{context}

### Question:
{question}

### Answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])


In [9]:
retriever = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(search_kwargs={"k": 5}),
    llm=chat_model
)

# 8. Final QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=retriever,
    return_source_documents=True,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

In [10]:
response = qa_chain.invoke({"query": "What are the key findings in this PDF?"})
print(response["result"])

Here's a summary of the key findings from the provided document:

**Agriculture:**

*   A comprehensive review of the agriculture research setup will be undertaken to focus on raising productivity and developing climate-resilient varieties.
*   Funding will be provided in challenge mode, including to the private sector, and domain experts will oversee the research.
*   109 new high-yielding and climate-resilient varieties of 32 field and vegetable crops have been released.
*   Large-scale clusters for vegetable production will be developed closer to major consumption centers.
*   The government will promote Farmer-Producer Organizations, cooperatives, and start-ups for vegetable supply chains.

**Corporate Exit and Insolvency Resolution:**

*   The services of the Centre for Processing Accelerated Corporate Exit (C-PACE) will be extended for the voluntary closure of LLPs to reduce closure time.
*   The IBC has resolved more than 1,000 companies, leading to a direct recovery of over ₹3.