## Let's see if we can implement RAG!

#### We are going to build a conversational chatbot using the OpenAI API. We will first load a pre-trained model, which consults contextual documents retrieved via RAG (retrieval-augmented generation) before giving an answer.

In [None]:
# mount Google Drive at /content/drive so that the files stored there can be read
# from this Colab session
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# install the necessary libraries
!pip install langchain
!pip install openai
!pip install tiktoken
!pip install faiss-gpu
!pip install langchain_experimental
!pip install "langchain[docarray]"
!pip install -U langchain-openai

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.11-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.4/337.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.83-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsm

In [None]:
# import the necessary libraries

# document loader for loading data from source
from langchain.document_loaders import TextLoader
# text splitter for chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Facebook AI similarity search, FAISS, vector store
# efficient similarity search and clustering amongst our embeddings
from langchain_community.vectorstores import FAISS
# allows us to work with OpenAI chat models
from langchain_openai import ChatOpenAI
# allows for the storing of conversational history
from langchain.memory import ConversationBufferMemory
# support for follow-up questions in a RAG chain
from langchain.chains import ConversationalRetrievalChain
# import the PyPDFLoader library for loading and splitting PDF-formatted documents
from langchain.document_loaders import PyPDFLoader
# module whose associated function allows us to search for files that match a specific file
# pattern or name
import glob
# import the embeddings model for our vector store
from langchain_openai import OpenAIEmbeddings

In [None]:
# make the OpenAI API key available to the OpenAI/LangChain clients through the
# OPENAI_API_KEY environment variable, without ever writing the key into the notebook
import os
from getpass import getpass

# SECURITY: never paste a key as a string literal here - a key saved in a notebook
# must be treated as leaked and revoked in the OpenAI dashboard
# reuse a key that is already set in the environment; otherwise ask for it
# interactively (getpass does not echo the input, so it is not stored in the output)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
# STEP 1: DATA LOADING
# Let's prepare our data first
# put the PDF files to be indexed into a single folder on the mounted Google Drive
# copy the filepath of that folder and save it as follows
# remember to add the forward slash at the end
# (when running on Kaggle with a dataset named "rpa-docs", the equivalent path
# would be "/kaggle/input/rpa-docs/")
data_root = "/content/drive/MyDrive/LLM_RAG/PDF/"

In [None]:
# read in all the PDF filenames found directly inside data_root
# os.path.join (os is imported in the API-key cell above) builds the search pattern,
# so the result is the same whether or not data_root ends with a slash
filenames = glob.glob(os.path.join(data_root, "*.pdf"))
filenames

['/content/drive/MyDrive/LLM_RAG/PDF/D2CP05465E.pdf',
 '/content/drive/MyDrive/LLM_RAG/PDF/1-s2.0-S0167732223001885-main.pdf',
 '/content/drive/MyDrive/LLM_RAG/PDF/ChemBioChem - 2024 - Wang - ATOMISTIC CHARACTERIZATION OF HEALTHY AND DAMAGED HAIR SURFACES  A MOLECULAR DYNAMICS STUDY OF.pdf',
 '/content/drive/MyDrive/LLM_RAG/PDF/sanders-et-al-2023-exploring-the-effects-of-wetting-and-free-fatty-acid-deposition-on-an-atomistic-hair-fiber-surface.pdf']

Install pypdf, which is needed to extract the text from the PDF files

In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m204.8/290.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


In [None]:
# gather the contents of every PDF file into one flat list
documents = []

# visit each filepath in turn
for pdf_path in filenames:
    # build a PyPDFLoader for this file, load its contents with load(),
    # and append everything that was loaded to our list in one go
    documents.extend(PyPDFLoader(pdf_path).load())

In [None]:
# check the size of our list
# the contents of each page is extracted as one element in the list, so this number
# is the total page count across the four files (71 in the recorded run)
# we can check the page counts of the PDFs for ourselves
len(documents) # Number of pages

71

In [None]:
# check the result by accessing the first element of the list,
# i.e. the first page (metadata 'page': 0) of the first file that was loaded
documents[0]

Document(metadata={'source': '/content/drive/MyDrive/LLM_RAG/PDF/D2CP05465E.pdf', 'page': 0}, page_content='1768 |  Phys. Chem. Chem. Phys., 2023, 25, 1768–1780 This journal is © the Owner Societies 2023\nCite this: Phys. Chem. Chem. Phys.,\n2023, 25, 1768Shearing friction behaviour of synthetic polymers\ncompared to a functionalized polysaccharide onbiomimetic surfaces: models for the prediction\nof performance of eco-designed formulations †\nBenjamin J. Coscia,\naJohn C. Shelley,\naAndrea R. Browning,a\nJeﬀrey M. Sanders,\nbRobin Chaudret,cRoger Rozot,\ndFabien Le ´onforte,\n *d\nMathew D. Halls\neand Gustavo S. Luengo\n *d\nThe substitution of natural, bio-based and/or biodegradable polymers for those of petrochemical origin\nin consumer formulations has become an active area of research and development as the sourcing anddestiny of material components becomes a more critical factor in product design. These polymers oftendiﬀer from their petroleum-based counterparts in topology, raw

In [None]:
# STEP 2: CHUNKING
# with the external data store loaded, the next step is to chunk the documents
# in their original form the documents are too long to fit into the LLM's context
# window, so they are cut into smaller pieces
# LangChain offers many text splitters for this job

# chunking parameters, measured in characters
chunk_size = 512       ## optimize chunk_size
chunk_overlap = 100    # characters shared between neighbouring chunks

# build the text splitter from the parameters above
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# run the splitter over our documents with split_documents(), which chunks them
# according to those parameters
data = text_splitter.split_documents(documents)

In [None]:
# let's check the length to see how many chunks the splitter has produced
# (689 in the recorded run)
len(data)

689

In [None]:
# check the result of the first chunk; each chunk keeps the metadata
# (source file and page number) of the page it was cut from
data[0]

Document(metadata={'source': '/content/drive/MyDrive/LLM_RAG/PDF/D2CP05465E.pdf', 'page': 0}, page_content='1768 |  Phys. Chem. Chem. Phys., 2023, 25, 1768–1780 This journal is © the Owner Societies 2023\nCite this: Phys. Chem. Chem. Phys.,\n2023, 25, 1768Shearing friction behaviour of synthetic polymers\ncompared to a functionalized polysaccharide onbiomimetic surfaces: models for the prediction\nof performance of eco-designed formulations †\nBenjamin J. Coscia,\naJohn C. Shelley,\naAndrea R. Browning,a\nJeﬀrey M. Sanders,\nbRobin Chaudret,cRoger Rozot,\ndFabien Le ´onforte,\n *d\nMathew D. Halls\neand Gustavo S. Luengo\n *d')

In [None]:
# STEP 3: STORING
# after chunking, to enable semantic search across the text chunks, we need to generate the vector
# embeddings for each chunk, and then store these embeddings

# here we define our embeddings model, to generate the embeddings
# no model name is passed, so the library's default OpenAI embedding model is used
embeddings = OpenAIEmbeddings()

In [None]:
# convert our chunked documents into vector embeddings, and load the embeddings into the
# FAISS vector store
# note: this step sends every chunk to the embeddings model defined above
vectorstore = FAISS.from_documents(data, embedding = embeddings)

In [None]:
# STEP 4: DOCUMENT RETRIEVAL
# run the code below to use our vector store as a retriever
# this enables us to fetch additional context based on the semantic similarity between user query
# and the chunk embeddings
# no arguments are passed here, so the retriever uses its default search settings
retriever = vectorstore.as_retriever()

In [None]:
# import the ChatPromptTemplate class, allowing us to create flexible templated prompts
# it is imported from langchain_core, where the class is defined; the older
# `langchain.prompts` path is a legacy alias
from langchain_core.prompts import ChatPromptTemplate

# to augment the question from the user with the additional context (to form our augmented prompt),
# we need to prepare a prompt template; the template can readily be customised as shown
# by the following lines of code

# here, we specify the template such that a combination of both the context from our vector store
# and the existing knowledge base of the model can be used to answer the question
# rather than just one or the other
template = """You are a scientist for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If the context is not relevant,please answer the question by using your own knowledge about the topic

Context: {context}

Question: {question}
"""
# load in our template above to create our templated prompt
prompt = ChatPromptTemplate.from_template(template)
print(prompt)

# Here the input to the prompt we specified above is expected to be a map with
# keys “context” and “question”

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='You are a scientist for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf the context is not relevant,please answer the question by using your own knowledge about the topic\n\nContext: {context}\n\nQuestion: {question}\n'))]


In [None]:
# import the class that allows us to pass on the user's input to the model unchanged
from langchain_core.runnables import RunnablePassthrough
# converts the output from our LLM into a string format
# (imported from langchain_core, where the parser is defined; the older
# `langchain.schema.output_parser` path is a legacy alias)
from langchain_core.output_parsers import StrOutputParser

# specify our LLM using the ChatOpenAI class we imported earlier
# gpt-3.5-turbo is more cost efficient than gpt-4
# setting temperature = 0 results in more deterministic responses
# we can fiddle around with this parameter (from 0 to 2) to experiment with the responses we get
# an alternative (commented out) has been included for our experimentation
llm = ChatOpenAI(model_name = "gpt-3.5-turbo", temperature = 0)
# llm = ChatOpenAI(model_name="gpt-4", temperature=0.7, )

# now we can build a chain for the RAG pipeline, chaining together the retriever, the prompt template
# and the LLM
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# as mentioned above, the input to the prompt we specified earlier is expected to be a map with
# keys “context” and “question”
# "question" is just the user input; thus we just need to get the context using our retriever and
# passthrough the user input under the “question” key
# In this case, RunnablePassthrough() allows us to pass on the user’s question to the prompt

# once our RAG chain is defined, we can invoke it
# here, the vector store will be queried by the user prompt and the most relevant
# data will be retrieved to augment the prompt, and used to call the model
# the model will now respond based on the augmented prompt
query = "Is the contribution of Coulombic interaction energies larger than LJ in total energy between FMEA and protein?"
rag_chain.invoke(query)

# you can see that the response is taken within context; check it against the
# source PDFs that were loaded in STEP 1

'Yes, the contribution of Coulombic interaction energies is larger than LJ in total energy between FMEA and protein. This is mainly due to the proximity of the polar head group with the amino acids on the protein surface.'

In [None]:
# ask a second in-context question through the same RAG chain
query = "what is the Thickness of FMEA in Wet condition"
rag_chain.invoke(query)

'The thickness of FMEA in wet condition is visibly larger compared to dry condition, as the fatty acid layers take on an extended conformation to repel water. The transition from thicker (~3nm) to thinner (~1nm) fatty acid layers is demonstrated with the depletion of FMEA, which influences the water penetration.'

Re-ranking of the retrieval results

In [None]:
# let's contrast the results when we introduce re-ranking of the retrieval results from the vector store
# here, we specify that we will use the MMR (maximal marginal relevance) re-ranking algorithm
# the top three results will be retrieved (k = 3)
# and we set a diversity factor, lambda_mult, whose values range between 0 and 1
# where 0 is maximum diversity and 1 is minimum diversity
# here, we set lambda_mult = 0.1 for a relatively high degree of diversity in the results
# the key must be spelled exactly 'lambda_mult': a misspelled key is not recognised
# by the search and the default diversity factor is used instead

retriever = vectorstore.as_retriever(
    search_type = 'mmr',  # re-ranking algorithm
    search_kwargs = {'k': 3, 'lambda_mult': 0.1},
)

# rebuild the RAG chain with the redefined retriever, and invoke it to answer our query
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# compare the answer with the one obtained from the default retriever earlier,
# and check both against the source PDF
query = "Is the contribution of Coulombic interaction energies larger than LJ in total energy between FMEA and protein?"
rag_chain.invoke(query)

'Yes, according to the retrieved context, the contribution of Coulombic interaction energies is larger than LJ in total energy between FMEA and protein surfaces. This is mainly due to the proximity of the polar head group with the amino acids on the protein surface.'

In [None]:
# the model can also answer out of context questions
# the chain still sends the retrieved chunks along with the question, but for a
# question like this they are not relevant to it
# the prompt template tells the model to use its own knowledge in that case,
# so the response is given based on the existing knowledge base of the model
query = "Who won the Chemistry Nobel prize in 2009?"
rag_chain.invoke(query)

# Let's return to our slides

'The Chemistry Nobel Prize in 2009 was awarded to Venkatraman Ramakrishnan, Thomas A. Steitz, and Ada E. Yonath for their studies on the structure and function of the ribosome.'

Acknowledgements: This notebook was adapted from the following sources:

https://scalexi.medium.com/implementing-a-retrieval-augmented-generation-rag-system-with-openais-api-using-langchain-ab39b60b4d9f

https://towardsdatascience.com/retrieval-augmented-generation-rag-from-theory-to-langchain-implementation-4e9bd5f6a4f2