In [None]:
# The following packages are needed for this project 
# --------------------------------------------------------------------------------------------------------------
# langchain_community.document_loaders:  Contains PyPDFLoader which is essential for 
#                                        loading and processing PDF documents.
# --------------------------------------------------------------------------------------------------------------
# langchain.text_splitter:               Provides RecursiveCharacterTextSplitter, useful for 
#                                        splitting text into manageable chunks for processing.
# --------------------------------------------------------------------------------------------------------------
# langchain_community.vectorstores:      Includes FAISS, a library for efficient similarity 
#                                        search and clustering of dense vectors, necessary 
#                                        for building vector stores.
# --------------------------------------------------------------------------------------------------------------
# langchain_openai:                      Provides OpenAIEmbeddings for generating embeddings 
#                                        using OpenAI's models, useful in vector search and 
#                                        retrieval tasks.
# --------------------------------------------------------------------------------------------------------------
# langchain.chains:                      Contains RetrievalQA, a module for creating question 
#                                        and answer chains based on retrieval methods.
# --------------------------------------------------------------------------------------------------------------
# langchain.prompts:                     Includes ChatPromptTemplate, useful for structuring 
#                                        and formatting prompts for chat models.
# --------------------------------------------------------------------------------------------------------------
# langchain_core.output_parsers:         Provides StrOutputParser, necessary for parsing 
#                                        string outputs from various operations.
# --------------------------------------------------------------------------------------------------------------
# langchain_core.runnables:              Contains RunnablePassthrough, a utility for handling 
#                                        pass-through operations in workflows.
# --------------------------------------------------------------------------------------------------------------
!pip install langchain_community langchain langchain_openai

In [2]:
# Importing the previously presented packages
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

import os



In [6]:
# Reads the key from the OPENAI_API_KEY environment variable so the real key never has
# to be written into the notebook; the placeholder is only used when it is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", 'ADD YOUR KEY')

In [7]:
# Extracts and splits text from a PDF file into chunks of specified size and overlap.
def extract_splits_from_pdf(pdf_path, chunk_size=1000, chunk_overlap=200):
    """Load one PDF file and split its content into overlapping chunks.

    Args:
        pdf_path: Path of the PDF file to load.
        chunk_size: `chunk_size` handed to RecursiveCharacterTextSplitter
            (defaults to 1000, the value previously hard-coded).
        chunk_overlap: `chunk_overlap` handed to RecursiveCharacterTextSplitter
            (defaults to 200, the value previously hard-coded).

    Returns:
        The result of `split_documents` on the loaded documents.
    """
    # Load the file the caller asked for; the path used to be hard-coded to
    # "documents/Article-1.pdf", which made every call return the same chunks.
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)

# Collects the text chunks of every PDF file found directly inside a folder.
def process_pdfs(pdf_folder):
    """Run extract_splits_from_pdf on each ".pdf" entry of `pdf_folder`.

    Args:
        pdf_folder: Directory whose entries are listed with os.listdir.

    Returns:
        One flat list holding the chunks of all processed files, in the order
        the files were listed.
    """
    collected = []
    for entry in os.listdir(pdf_folder):
        # Skip anything that does not carry the ".pdf" suffix.
        if not entry.endswith(".pdf"):
            continue
        collected.extend(extract_splits_from_pdf(os.path.join(pdf_folder, entry)))
    return collected

# Embeds the text splits with OpenAI embeddings and saves the FAISS index to disk.
def store_splits_in_faiss(splits, openai_api_key):
    """Build a FAISS vector store from `splits` and save it locally.

    Args:
        splits: Documents passed to FAISS.from_documents.
        openai_api_key: API key handed to OpenAIEmbeddings.

    The index is written to 'vectorstore/db_faiss'; nothing is returned.
    """
    index = FAISS.from_documents(
        documents=splits,
        embedding=OpenAIEmbeddings(api_key=openai_api_key),
    )
    index.save_local('vectorstore/db_faiss')

In [9]:
# Chunks every PDF found in the folder and saves the resulting FAISS index to disk.
# store_splits_in_faiss embeds the chunks with OpenAIEmbeddings, so this cell needs
# a valid `openai_api_key`.
pdf_folder = "documents/" 
all_splits = process_pdfs(pdf_folder)
store_splits_in_faiss(all_splits, openai_api_key)

In [10]:
# Re-opens the FAISS index that was saved under 'vectorstore/db_faiss'.
def load_knowledgeBase(openai_api_key):
    """Load the local FAISS vector store with OpenAI embeddings.

    Args:
        openai_api_key: API key handed to OpenAIEmbeddings.

    Returns:
        The vector store returned by FAISS.load_local.
    """
    # NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in;
    # only load index files that you created yourself.
    return FAISS.load_local(
        'vectorstore/db_faiss',
        OpenAIEmbeddings(api_key=openai_api_key),
        allow_dangerous_deserialization=True,
    )

# Creates the OpenAI chat model (gpt-3.5-turbo) used to generate the answers.
def load_llm(openai_api_key):
    """Instantiate ChatOpenAI with fixed generation settings.

    Args:
        openai_api_key: API key handed to ChatOpenAI.

    Returns:
        A ChatOpenAI instance configured with model_name "gpt-3.5-turbo",
        temperature 0 and max_tokens 500.
    """
    from langchain_openai import ChatOpenAI

    return ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
        max_tokens=500,
        api_key=openai_api_key,
    )

# Builds the chat prompt template that combines retrieved context and the question.
def load_prompt():
    """Return a ChatPromptTemplate with `context` and `question` placeholders.

    The template text also tells the model which fixed sentence to reply with
    when the answer is not in the provided content.
    """
    template_text = """ You need to answer the question in the sentence as same as in the  pdf content. 
        Given below is the context and question of the user.
        context = {context}
        question = {question}
        if the answer is not in the pdf answer "I am sorry but based on the provided information I can't answer you your question."
         """
    return ChatPromptTemplate.from_template(template_text)

# Merges the text of several documents into one string, separated by blank lines.
def format_docs(docs):
    """Join the `page_content` of each item in `docs` with "\\n\\n".

    Args:
        docs: Iterable of objects exposing a `page_content` string.

    Returns:
        A single string; empty when `docs` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

In [11]:
# Loads the saved FAISS index, the chat model and the prompt template once so that
# the following cells can reuse them.
knowledgeBase = load_knowledgeBase(openai_api_key)
llm = load_llm(openai_api_key)
prompt = load_prompt()

In [16]:
# The question used for both the similarity search and the RAG chain below.
query = "Who is Christophe Atten?"

In [17]:
# Retrieves the chunks from the knowledge base that are most similar to the query.
# The result is a list of documents, so it gets its own name instead of being
# overwritten by the vector store built below.
similar_docs = knowledgeBase.similarity_search(query)

# Builds a new FAISS vector store that contains only the retrieved chunks.
# NOTE: this embeds those chunks again with OpenAIEmbeddings. The name
# `similar_embeddings` is kept because the following cell calls `.as_retriever()` on it.
similar_embeddings = FAISS.from_documents(
    documents=similar_docs,
    embedding=OpenAIEmbeddings(api_key=openai_api_key),
)

In [18]:
# Wraps the FAISS vector store held in `similar_embeddings` in a retriever object,
# which is the form the RAG chain below consumes.
retriever = similar_embeddings.as_retriever()

In [19]:
# Creates a Retrieval-Augmented Generation (RAG) chain by defining the data flow:
# - "context" is retrieved by `retriever` and turned into one string by format_docs,
# - "question" is passed through as-is,
# - The prompt template is applied,
# - The LLM generates a response,
# - The output is parsed into a string format.
# Note that `retriever` only searches the vector store built from the chunks that
# were already selected for `query` in the cells above.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Invokes the RAG chain with the query to generate a response.
response = rag_chain.invoke(query)
print(response)

I am sorry but based on the provided information I can't answer your question.
