# QA Bot for PDF Files

In [None]:
!pip install PyPDF2
!pip install requests
!pip install pinecone-client
!pip install langchain

In [1]:
# Import necessary libraries
import os

import PyPDF2
import pinecone
from langchain.chains import LLMChain
from langchain.embeddings import GeminiEmbeddings
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Initialize Pinecone.
# The API key is read from the PINECONE_API_KEY environment variable so a real
# credential never has to be saved in the notebook; the original placeholder is
# kept only as a fallback.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY', 'your-pinecone-api-key'))

# Create the index only when it does not exist yet, so that re-running this
# cell (Restart Kernel -> Run All) does not fail on an already-existing index.
index_name = 'pdf-embeddings'
if index_name not in pinecone.list_indexes():
    # dimension must equal the length of the vectors produced by the embedding model
    pinecone.create_index(index_name, dimension=768)
index = pinecone.Index(index_name)

In [3]:
# Initialize Conversation Buffer Window Memory.
# The window length is configured through `k` (number of recent exchanges to
# keep); `window_size` is not a field of ConversationBufferWindowMemory.
memory = ConversationBufferWindowMemory(k=5)

In [4]:
# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        file_path: Location of the PDF; passed straight to ``PyPDF2.PdfReader``.

    Returns:
        str: The text of all pages joined in page order. Pages for which no
        text can be extracted contribute nothing.
    """
    # PdfReader / .pages replace PdfFileReader / getNumPages() / getPage(),
    # which were removed in PyPDF2 3.x.
    reader = PyPDF2.PdfReader(file_path)
    # `or ""` guards against pages whose extract_text() yields None.
    return "".join(page.extract_text() or "" for page in reader.pages)

In [5]:
# Function to chunk text using RecursiveCharacterTextSplitter
def chunk_text(text, chunk_size=500, chunk_overlap=None):
    """Split text into chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size: Maximum size of a chunk, as measured by the splitter.
        chunk_overlap: Overlap between consecutive chunks. When omitted, the
            splitter's usual default of 200 is used for chunk sizes above 200
            (same behavior as before); for smaller chunk sizes half the chunk
            size is used, because an overlap larger than the chunk size is
            rejected by the splitter.

    Returns:
        list: The chunks produced by the splitter; an empty list for empty text.
    """
    if not text:
        return []

    if chunk_overlap is None:
        default_overlap = 200
        chunk_overlap = default_overlap if chunk_size > default_overlap else chunk_size // 2

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [6]:
# Initialize Gemini embeddings.
# The key is read from the GEMINI_API_KEY environment variable so a real
# credential never has to be saved in the notebook; the original placeholder is
# kept only as a fallback.
gemini_api_key = os.environ.get('GEMINI_API_KEY', 'your-gemini-api-key')
# NOTE(review): confirm that the installed langchain actually exports
# `GeminiEmbeddings` and that it accepts `api_key` -- this could not be
# verified from the notebook itself.
gemini_embeddings = GeminiEmbeddings(api_key=gemini_api_key)

In [7]:
# Function to generate embeddings using Gemini API for text chunks
def generate_embeddings(text_chunks, embeddings_model):
    """Embed every text chunk with the given embeddings model.

    Args:
        text_chunks: Iterable of strings to embed.
        embeddings_model: Object exposing ``embed_documents(list_of_texts)``
            (the LangChain embeddings interface). Models that only provide a
            per-text ``embed(text)`` method are still supported.

    Returns:
        list: One embedding per chunk, in the same order as ``text_chunks``.
    """
    chunks = list(text_chunks)
    if not chunks:
        # Nothing to embed; avoid calling the model at all.
        return []

    # Prefer the batch call: one request for all chunks instead of one per chunk.
    embed_documents = getattr(embeddings_model, "embed_documents", None)
    if callable(embed_documents):
        return list(embed_documents(chunks))

    # Fallback for models that only offer a single-text `embed` method.
    return [embeddings_model.embed(chunk) for chunk in chunks]

In [8]:
# Function to store embeddings in Pinecone
def store_embeddings_in_pinecone(embeddings, text_chunks, batch_size=100):
    """Upsert embeddings and their source text into the module-level ``index``.

    Each vector is stored under the string form of its position ("0", "1", ...)
    with the chunk kept as ``{'text': chunk}`` metadata, so storing a second
    document through this function overwrites vectors with the same positions.

    Args:
        embeddings: Embedding vectors, one per chunk.
        text_chunks: The chunks the embeddings were computed from, same order.
        batch_size: Number of vectors sent per upsert request.

    Raises:
        ValueError: If the two inputs differ in length or ``batch_size`` < 1.
    """
    embeddings = list(embeddings)
    text_chunks = list(text_chunks)
    if len(embeddings) != len(text_chunks):
        # zip() would silently drop the surplus items and hide the mismatch.
        raise ValueError(
            f"Got {len(embeddings)} embeddings for {len(text_chunks)} text chunks"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    vectors = [
        (str(position), embedding, {'text': chunk})
        for position, (embedding, chunk) in enumerate(zip(embeddings, text_chunks))
    ]
    # Send the vectors in batches instead of one network request per vector.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

In [9]:
# Function to query Pinecone
def query_pinecone(query_embedding, top_k=5):
    """Query the module-level ``index`` for the nearest stored vectors.

    Args:
        query_embedding: Embedding vector of the query.
        top_k: Number of matches to request.

    Returns:
        The response of ``index.query``, requested with metadata included.
    """
    # The vector is passed by keyword so the call does not depend on the
    # positional parameter order of the installed Pinecone client.
    return index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

In [10]:
# Function to refine user query using Gemini API and maintain context
def refine_query(user_query, memory, api_key, llm=None):
    """Record the user query in memory and run the recent conversation through an LLM chain.

    Args:
        user_query: The question the user just asked.
        memory: Conversation memory exposing ``chat_memory`` (with
            ``add_user_message`` and ``messages``) and the window length ``k``,
            e.g. a ``ConversationBufferWindowMemory``.
        api_key: API key forwarded to ``LLMChain`` when no ``llm`` is given.
        llm: Optional language model to run the chain with.

    Returns:
        The output of the chain for the joined conversation text.
    """
    # Append the user query to the conversation history.
    memory.chat_memory.add_user_message(user_query)

    # Keep only the most recent messages. The slice of 2 * k messages follows
    # the "k exchanges" meaning of the window -- confirm against the installed
    # langchain version. At least the current query is always kept.
    history = memory.chat_memory.messages
    window = max(memory.k * 2, 1)
    conversation_context = " ".join(message.content for message in history[-window:])

    chat_template = ChatPromptTemplate.from_template("{conversation}")
    if llm is not None:
        llm_chain = LLMChain(llm=llm, prompt=chat_template)
    else:
        # NOTE(review): LLMChain is documented as requiring an `llm`; this
        # original call (prompt + api_key only) is kept for callers that do not
        # pass one, but it should be confirmed to work at all.
        llm_chain = LLMChain(prompt=chat_template, api_key=api_key)
    return llm_chain.run(conversation=conversation_context)

In [11]:
# Function to process results with Gemini API
def process_results_with_gemini(results, api_key, llm=None):
    """Run the text of the retrieved matches through an LLM chain.

    Args:
        results: Query response whose ``'matches'`` entries carry the chunk
            text under ``['metadata']['text']``.
        api_key: API key forwarded to ``LLMChain`` when no ``llm`` is given.
        llm: Optional language model to run the chain with.

    Returns:
        The chain output for the space-joined match texts, or an empty string
        when no match carries any text (the chain is not called in that case).
    """
    passages = []
    for match in results['matches']:
        # A match may come back without metadata; skip it instead of crashing.
        metadata = match['metadata'] or {}
        passage = metadata.get('text')
        if passage:
            passages.append(passage)

    if not passages:
        # Nothing was retrieved, so there is nothing meaningful to send to the model.
        return ""

    combined_text = " ".join(passages)
    chat_template = ChatPromptTemplate.from_template("{text}")
    if llm is not None:
        llm_chain = LLMChain(llm=llm, prompt=chat_template)
    else:
        # NOTE(review): LLMChain is documented as requiring an `llm`; this
        # original call (prompt + api_key only) is kept for callers that do not
        # pass one, but it should be confirmed to work at all.
        llm_chain = LLMChain(prompt=chat_template, api_key=api_key)
    return llm_chain.run(text=combined_text)

In [12]:
# Function to handle a user query
def handle_user_query(user_query, api_key):
    """Answer a user query from the indexed PDF chunks.

    The query is refined with the module-level ``memory``, embedded with the
    module-level ``gemini_embeddings``, looked up in Pinecone, and the matches
    are post-processed by the LLM.

    Args:
        user_query: The question asked by the user.
        api_key: API key forwarded to the refinement and post-processing steps.

    Returns:
        The value returned by ``process_results_with_gemini``.
    """
    refined_query = refine_query(user_query, memory, api_key)

    # LangChain embedding models embed a single query via `embed_query`;
    # fall back to `embed` for models that only provide that method.
    embed_query = getattr(gemini_embeddings, "embed_query", None)
    if callable(embed_query):
        query_embedding = embed_query(refined_query)
    else:
        query_embedding = gemini_embeddings.embed(refined_query)

    matches = query_pinecone(query_embedding)
    return process_results_with_gemini(matches, api_key)

In [13]:
# Main workflow: ingest one PDF into the Pinecone index, then answer a sample question.
# Extract text from a sample PDF (the relative path 'sample.pdf' is passed as-is)
pdf_text = extract_text_from_pdf('sample.pdf')

# Chunk the extracted text (default chunk_size of chunk_text is used)
text_chunks = chunk_text(pdf_text)

# Generate one embedding per text chunk with the Gemini embeddings model
embeddings = generate_embeddings(text_chunks, gemini_embeddings)

# Store the embeddings, together with their chunk text, in Pinecone
store_embeddings_in_pinecone(embeddings, text_chunks)

# Example user query
user_query = "What is the main topic of the document?"

# Handle the user query (refine -> embed -> search -> post-process) and show the result
results = handle_user_query(user_query, gemini_api_key)
print(results)