# QA Bot for PDF Files

In [3]:
!pip install PyPDF2
!pip install requests
!pip install langchain
!pip install langchain-google-genai
!pip install langchain-pinecone

Collecting langchain-pinecone
  Downloading langchain_pinecone-0.1.1-py3-none-any.whl (8.4 kB)
Collecting pinecone-client<4.0.0,>=3.2.2 (from langchain-pinecone)
  Downloading pinecone_client-3.2.2-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client, langchain-pinecone
  Attempting uninstall: pinecone-client
    Found existing installation: pinecone-client 4.1.2
    Uninstalling pinecone-client-4.1.2:
      Successfully uninstalled pinecone-client-4.1.2
Successfully installed langchain-pinecone-0.1.1 pinecone-client-3.2.2


In [7]:
# Import necessary libraries
import PyPDF2
import pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferWindowMemory
from langchain_pinecone import Pinecone

In [22]:
import os
os.environ["PINECONE_API_KEY"] = "413459c2-3c2b-45f3-a1ee-aff77e8e58d5"
os.environ["GOOGLE_API_KEY"] = 'AIzaSyDSqBRDH6coHZy-BTv055X1sTnM9GYWdcw'

In [23]:
import os

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
gemini_api_key =  os.environ.get("GOOGLE_API_KEY")

import time

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)

In [17]:
# Initialize Pinecone

# Create a new Pinecone index
import time

index_name = "nagp"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [18]:
# Initialize Conversation Buffer Window Memory
memory = ConversationBufferWindowMemory(window_size=5)

In [37]:
# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    pdf = PyPDF2.PdfReader(file_path)
    text = ""
    for page_num in range(len(pdf.pages)):
        text += pdf.pages[page_num].extract_text()
    return text

In [20]:
# Function to chunk text using RecursiveCharacterTextSplitter
def chunk_text(text, chunk_size=500):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
    chunks = text_splitter.split_text(text)
    return chunks

In [25]:
# Initialize Gemini embeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [26]:
# Function to generate embeddings using Gemini API for text chunks
def generate_embeddings(text_chunks, embeddings_model):
    embeddings = [embeddings_model.embed(chunk) for chunk in text_chunks]
    return embeddings

In [27]:
# Function to store embeddings in Pinecone
def store_embeddings_in_pinecone(embeddings, text_chunks):
    for i, (embedding, chunk) in enumerate(zip(embeddings, text_chunks)):
        index.upsert([(str(i), embedding, {'text': chunk})])

In [28]:
# Function to query Pinecone
def query_pinecone(query_embedding, top_k=5):
    return index.query(query_embedding, top_k=top_k, include_metadata=True)

In [29]:
# Function to refine user query using Gemini API and maintain context
def refine_query(user_query, memory, api_key):
    # Append user query to memory
    memory.add_message({'role': 'user', 'content': user_query})
    conversation_context = " ".join([message['content'] for message in memory.buffer])

    chat_template = ChatPromptTemplate.from_template("{conversation}")
    llm_chain = LLMChain(prompt=chat_template, api_key=api_key)
    refined_query = llm_chain.run(conversation=conversation_context)
    return refined_query

In [30]:
# Function to process results with Gemini API
def process_results_with_gemini(results, api_key):
    result_texts = [result['metadata']['text'] for result in results['matches']]
    combined_text = " ".join(result_texts)
    chat_template = ChatPromptTemplate.from_template("{text}")
    llm_chain = LLMChain(prompt=chat_template, api_key=api_key)
    processed_text = llm_chain.run(text=combined_text)
    return processed_text

In [31]:
# Function to handle a user query
def handle_user_query(user_query, api_key):
    refined_query = refine_query(user_query, memory, api_key)
    query_embedding = gemini_embeddings.embed(refined_query)
    results = query_pinecone(query_embedding)
    processed_results = process_results_with_gemini(results, api_key)
    return processed_results

In [38]:
# Main workflow
# Extract text from a sample PDF
pdf_text = extract_text_from_pdf('AssignmentSupportDocument.pdf')

# Chunk the extracted text
text_chunks = chunk_text(pdf_text)

# Generate embeddings for the text chunks
embeddings = generate_embeddings(text_chunks, gemini_embeddings)

# Store embeddings in Pinecone
store_embeddings_in_pinecone(embeddings, text_chunks)

# Example user query
user_query = "What is the main topic of the document?"

# Handle the user query and retrieve results
results = handle_user_query(user_query, gemini_api_key)
print(results)

AttributeError: 'GoogleGenerativeAIEmbeddings' object has no attribute 'embed'