# Retrieval Augmented Generation (RAG)

## Import libraries and secrets

In [22]:
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from dotenv import load_dotenv, find_dotenv
import ast
import os
import pandas as pd

_ = load_dotenv(find_dotenv("secrets.env"))
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

## Setup Pinecone

In [5]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "deeplearning-ai-rag"

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
	pass
else:
	pinecone.create_index(
		name=INDEX_NAME,
		dimension=1536,
		metric="cosine",
		spec=ServerlessSpec(
			cloud="aws",
			region="us-west-2", #TODO update when europe available
		),
	)

index = pinecone.Index(INDEX_NAME)

## Load the dataset

In [86]:
# download the dataset if necessary
# !wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

# !unzip lesson2-wiki.csv.zip

# Load it
CWD = os.getcwd()
path = CWD + "/files/wiki.csv"
MAX_NUM_ARTICLES = 10000 #TODO Play with this hyperparameter later on to see the influence on the quality of the results

df = pd.read_csv(path, nrows=MAX_NUM_ARTICLES)
df.head(1)

Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."


## Prepare the embeddings

In [87]:
def uploadDataToIndex(df, index):

	# prepare a list of embeddings to upload in the index
	package = []

	# populate the index
	for i, row in tqdm(df.iterrows(), total=df.shape[0]): # iterate through the rows of the dataframe
		# fill up the package
		package.append({
			"id": row["id"],
			"values": ast.literal_eval(row["values"]),
			"metadata": ast.literal_eval(row["metadata"]),
		})

		# once the package is big enough, upload it in the index
		if len(package) >= 250:
			index.upsert(package) # upload the package
			package = [] # reset it

	if len(package) > 0:
		index.upsert(package) # upload the package
		package = [] # reset it

uploadDataToIndex(df=df, index=index)


  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [07:04<00:00, 23.56it/s]


## Setup openAI

In [88]:
# set up openAI client
openAI = OpenAI(api_key=OPENAI_API_KEY) # openAI client to interact with it's API


def getEmbeddings(query:str):
	'''
	Return the embedding of the query.
	'''
	model = "text-embedding-3-small"
	return openAI.embeddings.create(input=query, model=model).data[0].embedding


## Retrieve context

In [89]:
# retrieve context from index
def retrieval(query):
	'''
	return a list of 3 passages from index closest to the query
	'''
	emd = getEmbeddings(query)
	response =  index.query(top_k=3, vector=emd, include_metadata=True)
	infoList = []
	for match in response.matches:
		infoList.append(match.metadata["text"])
	return infoList

# test
query = "what is the berlin wall?"
retrieval(query=query)

['Killing a living thing is when someone or something ends that life and makes the living thing die. It means causing a death. When a human being kills another human being, it is called murder or homicide, such as manslaughter.  \n\nPesticides and herbicides are poisons for killing bad wild small animals or plants, respectively.\n\nWhen a soldier kills another in war, it is called "combat".  When the state kills a convict sentenced to capital punishment, it is called execution. When someone kills a  powerful person it is called assassination. When a person who wants to die kills himself it is suicide, or euthanasia if killed by another. When people kill other people to eat them, it is called cannibalism.\n\nRelated pages \n Cain and Abel\n\nCrime',
 'Observances\n Earliest day that Martin Luther King, Jr. Day can fall, while January 21 is the latest, on the third Monday in January (United States)\n Armed Forces Day (Nigeria)\n Army Day (India)\n Tree Planting Day (Egypt)\n\nDays of the

## Structure the prompt

In [90]:
def structurePrompt(query):
	'''
	Structure the prompt by adding context from our index to the query.
	'''

	# retrieve contect
	context = retrieval(query=query)

	# Structure the prompt
	promptStart = "\
	Answer the question based on the context below.\n\
	--------\n\
	Context:\n\
	--------\n"

	promptEnd = f"\
	\n--------\n\
	Question:\n\
	{query}\n\
	--------\n\
	Answer:\n"

	prompt = promptStart + f"\n--------\n".join(context) + promptEnd
	
	# return the structured prompt
	return prompt

## RAG in action

In [93]:
# Put everything together to get a working RAG
def RAG(query):
	prompt = structurePrompt(query=query)
	messages = [{
		"role": "user",
		"content": prompt
		}]
	model = "gpt-3.5-turbo-0125"
	response = openAI.chat.completions.create(messages=messages, model=model, temperature=0, )
	responseContent = response.choices[0].message.content
	responseContent = responseContent if responseContent != None else ""
	return responseContent

# choose your query
query = "What are colors?"

# see RAG in action!
RAG(query=query)

'Colors are visual perceptions that are created by the way light interacts with objects. They are typically described in terms of their hue, saturation, and brightness.'