In [41]:
pip install langchain-pinecone


Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.13-py3-none-any.whl.metadata (8.6 kB)
Collecting pinecone<8.0.0,>=6.0.0 (from pinecone[asyncio]<8.0.0,>=6.0.0->langchain-pinecone)
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting langchain-openai>=0.3.11 (from langchain-pinecone)
  Downloading langchain_openai-1.1.6-py3-none-any.whl.metadata (2.6 kB)
Collecting simsimd>=5.9.11 (from langchain-pinecone)
  Downloading simsimd-6.5.12-cp310-cp310-win_amd64.whl.metadata (71 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone<8.0.0,>=6.0.0->pinecone[asyncio]<8.0.0,>=6.0.0->langchain-pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting aiohttp-retry<3.0.0,>=2.9.1 (from pinecone[asyncio]<8.0.0,>=6.0.0->langchain-pinecone)
  Downloading aiohttp_retry-2.9.1-py3-none-any.whl.metadata (8.8 kB)
Collecting tiktoken<1.0.0,>=0.7.0 (from langchain-openai>=0.3.11->langchain-pinecone)
  Downloadin

In [42]:
# Imports and environment setup for the whole notebook.
import os
from dotenv import load_dotenv  # reads key=value pairs from a .env file into the process environment
# Load environment variables from .env file (PINECONE_API_KEY is read via os.getenv in a later cell).
# NOTE(review): this call sits ahead of the remaining imports — presumably so the
# environment is populated before those libraries are loaded; confirm before reordering.
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
# NOTE(review): the LangchainPinecone alias is not referenced by any cell visible in
# this notebook; PineconeVectorStore (imported below) is what the indexing cell uses.
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
import warnings
from langchain_pinecone import PineconeVectorStore

# Silences every warning category for the rest of the session, including
# deprecation notices — narrow this filter when debugging library issues.
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully")

‚úÖ All libraries imported successfully


In [27]:
# Connect to Pinecone and make sure the index used by this notebook exists.
#
# Produces three notebook-level names used by later cells:
#   pc         - the Pinecone client
#   INDEX_NAME - name of the serverless index
#   index      - handle to that index
import time  # local to this cell: only needed for the readiness polling below

INDEX_NAME = "chat-with-pdf"
# Must equal the embedding model's output size (the embeddings cell reports 384
# for sentence-transformers/all-MiniLM-L6-v2).
EMBEDDING_DIMENSION = 384
# Upper bound on how long to wait for a freshly created index to become ready.
INDEX_READY_TIMEOUT_S = 120

print("Initializing Pinecone")
try:
    api_key = os.getenv("PINECONE_API_KEY")
    if not api_key:
        # Fail with an actionable message instead of an opaque client error.
        raise RuntimeError(
            "PINECONE_API_KEY is not set; export it or add it to your .env file"
        )
    pc = Pinecone(api_key=api_key)

    existing_indexes = [idx.name for idx in pc.list_indexes()]

    if INDEX_NAME not in existing_indexes:
        print("creating new index")
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIMENSION,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1",
            ),
        )
        print("waiting for index to be ready...")
        # Actually wait: poll the index status until it reports ready, so the
        # upsert cell does not race index creation.
        deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
        while not pc.describe_index(INDEX_NAME).status["ready"]:
            if time.monotonic() > deadline:
                raise TimeoutError(
                    f"index {INDEX_NAME!r} not ready after {INDEX_READY_TIMEOUT_S}s"
                )
            time.sleep(1)
    else:
        # f-string so the index name is interpolated rather than printed literally.
        print(f"using existing index:{INDEX_NAME}")

    index = pc.Index(INDEX_NAME)
except Exception as e:
    # Surface the cause next to the banner, then re-raise so the cell still fails.
    print(f"pinecone intialization failed: {e}")
    raise

Initializing Pinecone
creating new index
waiting for index to be ready...


In [28]:
# Load PDF document
#
# Produces `documents`: the list returned by PyPDFLoader.load() for PDF_PATH.
# PDF_PATH can be overridden with a PDF_PATH environment variable (or .env entry)
# so the notebook is not tied to one machine's directory layout.

# Raw string: the Windows path contains backslashes (\G, \C) that are invalid
# escape sequences in a normal string literal. The resulting value is unchanged.
PDF_PATH = os.getenv(
    "PDF_PATH",
    r'F:\Gen AI\GenAI-MiniProjects\Chat With PDF/hadoop.pdf',
)
try:
    # Check local paths up front so a missing file reliably reaches the
    # FileNotFoundError handler below, whatever exception type the loader
    # itself would use. URLs are left for the loader to resolve.
    is_url = PDF_PATH.lower().startswith(("http://", "https://"))
    if not is_url and not os.path.isfile(PDF_PATH):
        raise FileNotFoundError(PDF_PATH)

    loader = PyPDFLoader(PDF_PATH)
    documents = loader.load()

    if not documents:
        raise ValueError("no content found in PDF")
    print(f"‚úÖ Loaded {len(documents)} pages from PDF")
    print(f"üìñ First page preview: {documents[0].page_content[:200]}...")

except FileNotFoundError:
    print(f"‚ùå File not found: {PDF_PATH}")
    print("üí° Please place your PDF file in the same directory or update PDF_PATH")
    raise
except Exception as e:
    print(f"‚ùå Error loading PDF: {e}")
    raise

‚úÖ Loaded 3 pages from PDF
üìñ First page preview: Hadoop  is  an  open-source  framework  for  storing  and  processing  large-scale  data  across  
distributed
 
clusters
 
using
 
commodity
 
hardware.
 
The
 
Hadoop
 
Ecosystem
 
is
 
a
 
suite
 
...


In [33]:
# Split text into Chunks
#
# Produces `chunks`: the loaded pages split into overlapping pieces of at most
# 1000 characters (200-character overlap) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    # Separators are tried in order, coarsest first. The empty string must come
    # LAST: it matches everywhere, so placing it before " " meant the splitter
    # never fell back to word boundaries and could cut in the middle of words.
    separators=["\n\n", "\n", " ", ""],
)
chunks = text_splitter.split_documents(documents)

# Fail here, with a clear message, rather than later inside the indexing cell.
if not chunks:
    raise ValueError("text splitting produced no chunks")

print(f"Split {len(documents)} pages into {len(chunks)} chunks")

In [38]:
# Initialize the embedding model
#
# Builds `embeddings` from the sentence-transformers MiniLM checkpoint, running on
# CPU with normalized output vectors, then embeds a probe string to report the
# vector dimension.
_embedding_settings = {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    "model_kwargs": {'device': 'cpu'},
    "encode_kwargs": {'normalize_embeddings': True},
}
embeddings = HuggingFaceEmbeddings(**_embedding_settings)

# Smoke test: embed a single query and print the resulting vector length.
test_embed = embeddings.embed_query("test")
_embedding_dim = len(test_embed)
print(f"‚úÖ Embeddings model loaded (dimension: {_embedding_dim})")


‚úÖ Embeddings model loaded (dimension: 384)


In [43]:
# Store embeddings in Pinecone and build the retriever
#
# Produces:
#   vectorstore - PineconeVectorStore backed by INDEX_NAME
#   retriever   - similarity retriever returning the top 3 chunks per query
import hashlib  # local to this cell: used only to derive stable vector ids

if not chunks:
    raise ValueError("no chunks to index; run the text-splitting cell first")

# Deterministic ids: hashing source, page, position and content means re-running
# this cell overwrites the same vectors instead of inserting duplicates under
# fresh random ids on every run.
chunk_ids = [
    hashlib.sha256(
        "|".join(
            (
                str(chunk.metadata.get("source", "")),
                str(chunk.metadata.get("page", "")),
                str(position),
                chunk.page_content,
            )
        ).encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(chunks)
]

vectorstore = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=INDEX_NAME,
    ids=chunk_ids,
)

print("Embedding stores successfully")

# create retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

print("‚úÖ Retriever created (will fetch top 3 relevant chunks)")


Embedding stores successfully
‚úÖ Retriever created (will fetch top 3 relevant chunks)


In [44]:
# Load the language model
#
# Wraps a local flan-t5-small text2text pipeline as the LangChain LLM `llm`.
# Sampling is enabled (do_sample with temperature/top_p), so repeated runs can
# produce different text for the same prompt.
_generation_settings = {
    "max_length": 200,
    "temperature": 0.7,
    "do_sample": True,
    "top_p": 0.9,
}
hf_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    **_generation_settings,
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Smoke test: one round trip through the model, showing the first 50 characters.
test_response = llm.invoke("Hello, How are you?")
print("‚úÖ Language model loaded successfully")
_preview = test_response[:50]
print(f"üß™ Test response: {_preview}...")

Device set to use cpu


‚úÖ Language model loaded successfully
üß™ Test response: Hello, how are you?...


In [45]:
# Prompt template and chain
#
# The template takes two variables, `context` and `question`, and instructs the
# model to answer only from the supplied context.
_qa_template = """Answer the question based only on the context below. If the answer is not in the context, say "I don't know based on the provided document."
    Context: {context}
    Question:{question}
    Answer:"""

prompt = PromptTemplate(
    template=_qa_template,
    input_variables=['context', 'question'],
)

# LCEL composition: the formatted prompt is piped straight into the LLM.
chain = prompt | llm

print("Chain created successfully")

Chain created successfully


In [48]:
# Helper function for chat

def chat_with_pdf(question: str) -> str:
    """Answer a question using chunks retrieved from the indexed PDF.

    Uses the notebook-level ``retriever`` to fetch relevant chunks and the
    notebook-level ``chain`` (prompt | llm) to generate the answer.

    Args:
        question: The user's question. ``None``, non-string values and
            blank/whitespace-only strings are rejected without calling the
            retriever or the model.

    Returns:
        The stripped model response; a prompt to enter a valid question when
        the input is unusable; or the same "I don't know" sentence the prompt
        template asks the model to use when nothing is retrieved.
    """
    # Guard first: None or non-string input used to crash on .strip().
    if not isinstance(question, str) or not question.strip():
        return "Enter the valid question"

    # Normalize once so retrieval and the prompt see the same text.
    question = question.strip()

    # Retrieve relevant documents.
    docs = retriever.invoke(question)
    if not docs:
        # Kept identical to the fallback sentence in the prompt template so the
        # caller sees one consistent "no answer" message.
        return "I don't know based on the provided document."

    context = "\n\n".join(doc.page_content for doc in docs)

    # Generate the response.
    response = chain.invoke({
        "context": context,
        "question": question,
    })

    # The chain may return a non-string object; coerce before stripping.
    if not isinstance(response, str):
        response = str(response)
    return response.strip()

In [49]:
# One-off sanity check: send a single question through chat_with_pdf and show
# the question alongside the answer it produced.
test_question = "what is the document about?"
answer = chat_with_pdf(test_question)

print(
    f"Question: {test_question}",
    f"Answer: {answer}\n",
    sep="\n",
)

Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors


Question: what is the document about?
Answer: HBase is a NoSQL database in Hadoop ecosystem that supports all data types and handles large datasets efficiently, similar to Google‚Äôs BigTable. It is ideal for fast read/write operations on small portions of data within massive datasets. HBase offers a fault-tolerant and efficient way to store and retrieve data quickly, making it useful for real-time lookups. Other Components Apart from core components, Hadoop also includes important tools like: Solr & Lucene: Used for searching and indexing. Lucene (Java-based) offers features like spell check and Solr acts as its powerful search platform. Zookeeper: Handles coordination and synchronization between Hadoop components, ensuring consistent communication and grouping across the cluster. storage and processing: HDFS HDFS is a core component of Hadoop ecosystem, designed to store large volumes of structured or unstructured data across



In [50]:
# Interactive chat loop
#
# Reads questions from stdin until the user types exit/quit/q. An interrupted
# kernel or closed input stream ends the session cleanly instead of leaving a
# traceback, and a failure on one question does not end the whole session.
while True:
    try:
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Kernel interrupt / no more input: treat it like an explicit exit.
        print("\nüëã Thank you for using Chat with PDF!")
        break

    if query.lower() in ('exit', 'quit', 'q'):
        print("\nüëã Thank you for using Chat with PDF!")
        break

    if not query:
        print("‚ö†Ô∏è Please enter a question\n")
        continue

    # Get answer
    print("\nü§î Thinking...\n")
    try:
        answer = chat_with_pdf(query)
    except Exception as e:
        # Report the failure and keep the session alive for the next question.
        print(f"Bot: Sorry, something went wrong: {e}\n")
    else:
        print(f"Bot: {answer}\n")
    print("-" * 60 + "\n")


ü§î Thinking...

Bot: YARN (Yet Another Resource Negotiator): Manages cluster resources and job scheduling. MapReduce: A programming model for batch data processing. Hive & Pig: High-level tools for querying and analyzing large datasets. HBase: A NoSQL database for real-time read/write access. Mahout & Spark MLlib: Libraries for scalable machine learning. Solr & Lucene: Tools for full-text search and indexing. Zookeeper: Manages coordination and configuration across the cluster. Oozie: A workflow scheduler for managing Hadoop jobs. HadoopEcosystem Key Components of Hadoop Ecosystem

------------------------------------------------------------


üëã Thank you for using Chat with PDF!
