# In [None]:
import os
import PyPDF2
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text of all pages in page order, separated by
        newlines. Pages that yield no text contribute an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page whose extraction result is falsy
        # (None/empty), which would otherwise break string concatenation.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page does not fuse with
    # the first word of the next when the text is later split on whitespace.
    return "\n".join(page_texts)

def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are the whitespace-separated tokens of *text*; each chunk is its
    words re-joined with single spaces. The final chunk may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, in original order (empty if *text* has
        no words).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

# Module-level embedding client, constructed once at import time and shared
# by create_embeddings() and similarity_search().
# NOTE(review): presumably picks up OpenAI credentials from the environment —
# confirm before importing this module in a context without them.
embedding_model = OpenAIEmbeddings()

def create_embeddings(chunks):
    """Embed every text chunk with the module-level ``embedding_model``.

    Args:
        chunks: Iterable of text chunks to embed.

    Returns:
        A list containing one embedding vector per chunk, in input order.
        An empty input yields an empty list without calling the model.
    """
    texts = list(chunks)
    if not texts:
        # Nothing to embed; avoid a pointless request to the model.
        return []
    # embed_documents is the batch API for lists of texts; embed_query is
    # meant for a single search query and costs one call per chunk.
    return embedding_model.embed_documents(texts)

def store_embeddings_in_faiss(embeddings, chunks):
    """Build a FAISS vector store from precomputed chunk embeddings.

    Args:
        embeddings: Embedding vectors, one per entry of *chunks*.
        chunks: The text chunks, in the same order as *embeddings*.

    Returns:
        A FAISS vector store holding each chunk alongside its vector.

    Raises:
        ValueError: If no chunks are given, or if *embeddings* and
            *chunks* differ in length.
    """
    vectors = list(embeddings)
    texts = list(chunks)
    if len(vectors) != len(texts):
        raise ValueError(
            f"got {len(vectors)} embeddings for {len(texts)} chunks"
        )
    if not texts:
        raise ValueError("cannot build a FAISS index from zero chunks")
    # The FAISS constructor expects an already-built index and docstore;
    # from_embeddings builds both from (text, vector) pairs.
    return FAISS.from_embeddings(
        text_embeddings=list(zip(texts, vectors)),
        embedding=embedding_model,
    )

def similarity_search(query, vector_db, k=5):
    """Return the *k* stored entries most similar to *query*.

    Args:
        query: The user's query text.
        vector_db: Vector store to search.
        k: Number of results to return (defaults to 5).

    Returns:
        Whatever ``vector_db.similarity_search_by_vector`` returns for the
        embedded query; for the LangChain FAISS store this is a list of
        Document objects whose text is in ``page_content``.
    """
    query_embedding = embedding_model.embed_query(query)
    # The query is already embedded, so use the by-vector search;
    # plain similarity_search() expects the raw query string instead.
    return vector_db.similarity_search_by_vector(query_embedding, k=k)

def initialize_llm():
    """Create and return the language-model client used for answering.

    Returns:
        An ``OpenAI`` instance configured with ``model_name="gpt-4"``.
    """
    # NOTE(review): "gpt-4" is a chat model — confirm that the
    # completion-style OpenAI wrapper accepts it in the installed
    # LangChain version.
    return OpenAI(model_name="gpt-4")

def generate_response(llm, query, retrieved_chunks):
    """Ask *llm* to answer *query* using the retrieved chunks as context.

    The chunks are joined with single spaces and substituted, together
    with the query, into a fixed prompt template. The formatted prompt is
    passed to ``llm`` by calling it directly.

    Args:
        llm: Callable language model that accepts a prompt string.
        query: The user's question.
        retrieved_chunks: Iterable of chunk strings to use as context.

    Returns:
        Whatever ``llm(prompt)`` returns.
    """
    template_text = """
        User Query: {query}
        Relevant Information: {retrieved_chunks}
        Answer the query based on the above context with exact details.
        """
    answer_prompt = PromptTemplate(
        template=template_text,
        input_variables=["query", "retrieved_chunks"],
    )
    joined_context = ' '.join(retrieved_chunks)
    filled_prompt = answer_prompt.format(
        retrieved_chunks=joined_context,
        query=query,
    )
    return llm(filled_prompt)

def main():
    """Run the interactive PDF question-answering pipeline.

    Prompts for a comma-separated list of PDF paths and a query, extracts
    and chunks the text of each PDF, embeds the chunks into a FAISS store,
    retrieves the chunks most similar to the query, and prints the
    language model's answer based on them.
    """
    raw_paths = input("Enter the paths of PDF files, separated by commas: ").split(',')
    user_query = input("Enter your query: ")

    # Strip surrounding whitespace and drop empty entries (for example
    # from a trailing comma) so they are not handed to open().
    pdf_paths = [path.strip() for path in raw_paths if path.strip()]

    all_chunks = []
    for pdf_path in pdf_paths:
        text = extract_text_from_pdf(pdf_path)
        all_chunks.extend(chunk_text(text))

    if not all_chunks:
        # Without any text there is nothing to embed or index.
        print("No text could be extracted from the given PDF files.")
        return

    embeddings = create_embeddings(all_chunks)
    vector_db = store_embeddings_in_faiss(embeddings, all_chunks)

    retrieved_chunks = similarity_search(user_query, vector_db)
    # Search results are Document objects; the chunk text lives in
    # page_content (they are not indexable tuples).
    retrieved_texts = [doc.page_content for doc in retrieved_chunks]

    llm = initialize_llm()
    response = generate_response(llm, user_query, retrieved_texts)

    print("Response:", response)

# Run the interactive pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
