<a href="https://colab.research.google.com/github/vitchierath/NLPtasks/blob/main/pdfchatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
# Install required packages
!pip install langchain langchain-openai faiss-cpu sentence-transformers pymupdf -q

# Import libraries
from langchain_openai import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from google.colab import files
import fitz  # PyMuPDF

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    try:
        # Open the PDF from the uploaded bytes
        doc = fitz.open(stream=pdf_file, filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text("text") or ""
        doc.close()
        return text if text.strip() else "No text found in the PDF."
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

# Function to create a retriever from PDF content
def create_pdf_retriever(pdf_content):
    if "Error" in pdf_content or "No text found" in pdf_content:
        return None

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_text(pdf_content)
    if not docs:
        return None

    # Create documents for vector store
    split_docs = [Document(page_content=doc, metadata={"source": "uploaded_pdf"}) for doc in docs]

    # Use Hugging Face embeddings from sentence-transformers
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Create FAISS vector store
    vectorstore = FAISS.from_documents(split_docs, embeddings)
    return vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Prompt user for OpenRouter API key
api_key = "Your Api Here!!! "

# Initialize LLM with OpenRouter
llm = ChatOpenAI(
    api_key=api_key,
    base_url="https://openrouter.ai/api/v1",
    model="mistralai/mixtral-8x7b-instruct",
    temperature=0.5,
    default_headers={
        "HTTP-Referer": "http://localhost",
        "X-Title": "PDF RAG Chatbot"
    }
)

# # Upload PDF file
print("Please upload a PDF file:")
uploaded = files.upload()
if not uploaded:
    raise ValueError("No PDF file uploaded. Please upload a file to proceed.")

# Extract text from the uploaded PDF
pdf_file = list(uploaded.values())[0]  # Get the first uploaded file
pdf_content = extract_text_from_pdf(pdf_file)

# Create retriever from PDF content
retriever = create_pdf_retriever(pdf_content)
if not retriever:
    raise ValueError("Failed to process PDF content. Check if the PDF contains extractable text.")

# Set up RetrievalQA chain
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

# Chat function
def chat():
    print("Welcome to the PDF RAG Chatbot!")
    print(f"Loaded PDF: {list(uploaded.keys())[0]}")
    print("Type 'exit' to quit.\n")
    while True:
        query = input("You: ")
        if query.lower() == "exit":
            print("Goodbye!")
            break
        try:
            response = rag_chain.invoke({"query": query})
            print(f"Bot: {response['result']}\n")
        except Exception as e:
            print(f"Error: {str(e)}\n")

# Start chatting
chat()

Please upload a PDF file:


Saving 01_The_Lightning_Thief.pdf to 01_The_Lightning_Thief.pdf
Welcome to the PDF RAG Chatbot!
Loaded PDF: 01_The_Lightning_Thief.pdf
Type 'exit' to quit.

You: summarize the plot

You: percy jackson father
Bot:  Percy Jackson's father is Poseidon, the Greek god of the sea. This is revealed to Percy in the course of the story when he meets Poseidon on Olympus. Poseidon acknowledges Percy as his son and gives him a package, which contains a powerful weapon that Percy will need in his future adventures. Percy's mother, Sally Jackson, is aware of her son's divine heritage and has been trying to protect him from the dangerous world of the gods. She is married to Gabe Ugliano, who is abusive to both her and Percy. The circumstances surrounding Sally's disappearance and the discovery of traces of blood in the car and at the scene of the accident suggest that she may have been the victim of foul play. Percy is determined to find out what happened to his mother and bring those responsible to 

KeyboardInterrupt: Interrupted by user

In [None]:
import fitz  # PyMuPDF for PDF processing
import faiss  # FAISS for vector storage
import numpy as np
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Load the Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ------------------ Load PDF from Google Drive ------------------
pdf_path = "01_The_Lightning_Thief.pdf"  # Google Drive path

if not os.path.exists(pdf_path):
    print(f"❌ Error: File not found at {pdf_path}")
    exit()

# ------------------ Extract Text from PDF ------------------
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        return text
    except Exception as e:
        print(f"❌ Error reading PDF: {e}")
        return None

# ------------------ Split Text into Chunks ------------------
def split_text(text, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_text(text)

# ------------------ Store Chunks in FAISS ------------------
def store_in_faiss(chunks):
    embeddings = model.encode(chunks, convert_to_numpy=True)  # Generate embeddings
    dim = embeddings.shape[1]  # Get embedding dimension
    index = faiss.IndexFlatL2(dim)  # Create FAISS index
    index.add(embeddings)  # Add embeddings to FAISS
    return index, chunks

# ------------------ Retrieve Answer from FAISS ------------------
def retrieve_answer(query, index, chunks):
    query_embedding = model.encode([query], convert_to_numpy=True)  # Encode query
    _, indices = index.search(query_embedding, k=1)  # Find top 1 closest match
    return chunks[indices[0][0]]  # Return the best-matching text chunk

# ------------------ Main Execution ------------------
text = extract_text_from_pdf(pdf_path)  # Extract text
if text:
    chunks = split_text(text)  # Split into chunks
    index, stored_chunks = store_in_faiss(chunks)  # Store in FAISS

    # User query for retrieval
    while True:
        query = input("\nAsk a question based on the document (or type 'exit' to quit): ")
        if query.lower() == "exit":
            print("Goodbye! 👋")
            break
        response = retrieve_answer(query, index, stored_chunks)
        print("\n📜 Most relevant answer:\n", response)


Ask a question based on the document (or type 'exit' to quit): who is percy jackson

📜 Most relevant answer:
 Ms. Jackson’s husband, Gabe Ugliano, claims that his stepson, Percy Jackson, is a
troubled child who has been kicked out of numerous boarding schools and has expressed
violent tendencies in the past.
Police would not say whether son Percy is a suspect in his mother’s disappearance, but
they have not ruled out foul play. Below are recent pictures of Sally Jackson and Percy. Police
urge anyone with information to call the following toll-free crime-stoppers hotline.
