# Retrieval Augmented Generation (RAG)

## Import libraries and secrets

In [6]:
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from dotenv import load_dotenv, find_dotenv
import ast
import os
import pandas as pd

_ = load_dotenv(find_dotenv("secrets.env"))
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

## Setup Pinecone

In [7]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "deeplearning-ai-rag"

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
	pass
else:
	pinecone.create_index(
		name=INDEX_NAME,
		dimension=1536,
		metric="cosine",
		spec=ServerlessSpec(
			cloud="aws",
			region="us-west-2", #TODO update when europe available
		),
	)

index = pinecone.Index(INDEX_NAME)

## Load the dataset

In [8]:
# # download the dataset if necessary
# !wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

# !unzip lesson2-wiki.csv.zip

# Load it
CWD = os.getcwd()
path = CWD + "/files/wiki.csv"
MAX_NUM_ARTICLES = 10000 #TODO Play with this hyperparameter later on to see the influence on the quality of the results

df = pd.read_csv(path, nrows=MAX_NUM_ARTICLES)
df.head(1)

Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."


## Prepare the embeddings

In [9]:
def uploadDataToIndex(df, index):

	# prepare a list of embeddings to upload in the index
	package = []

	# populate the index
	for i, row in tqdm(df.iterrows(), total=df.shape[0]): # iterate through the rows of the dataframe
		# fill up the package
		package.append({
			"id": row["id"],
			"values": ast.literal_eval(row["values"]),
			"metadata": ast.literal_eval(row["metadata"]),
		})

		# once the package is big enough, upload it in the index
		if len(package) >= 250:
			index.upsert(package) # upload the package
			package = [] # reset it

	if len(package) > 0:
		index.upsert(package) # upload the package
		package = [] # reset it

uploadDataToIndex(df=df, index=index)


  0%|          | 0/10000 [00:00<?, ?it/s]

## Setup openAI

In [10]:
# set up openAI client
openAI = OpenAI(api_key=OPENAI_API_KEY) # openAI client to interact with it's API


def getEmbeddings(query:str):
	'''
	Return the embedding of the query.
	'''
	model = "text-embedding-3-small"
	return openAI.embeddings.create(input=query, model=model).data[0].embedding


## Retrieve context

In [11]:
# retrieve context from index
def retrieval(query):
	'''
	return a list of 3 passages from index closest to the query
	'''
	emd = getEmbeddings(query)
	response =  index.query(top_k=3, vector=emd, include_metadata=True)
	infoList = []
	for match in response.matches:
		infoList.append(match.metadata["text"])
	return infoList

# test
query = "what is the berlin wall?"
retrieval(query=query)

['Observances\n Earliest day that Martin Luther King, Jr. Day can fall, while January 21 is the latest, on the third Monday in January (United States)\n Armed Forces Day (Nigeria)\n Army Day (India)\n Tree Planting Day (Egypt)\n\nDays of the year',
 "Observances \n Independence Day (Namibia)\n Human Rights Day (South Africa)\n Benito Juarez' birthday (Mexico)\n Spring Equinox – Northern Hemisphere\n Autumn Equinox – Southern Hemisphere\n International Day against racial discrimination (after Sharpeville massacre)\n World Poetry Day\n Youth Day (Tunisia)\n Harmony Day (Australia)\n World Down syndrome Day\n Mother's Day (Most of the Arab World)\n World Puppetry Day\n International Colour Day\n International Day of Forests\n\nDays of the year",
 "Child abuse scandals\nDuring John Paul's time as pope, the church was involved in a large number of claims about child sexual abuse by priests. There are many people who believe that the Church, and therefore the Pope, knew about these claims an

## Structure the prompt

In [12]:
def structurePrompt(query):
	'''
	Structure the prompt by adding context from our index to the query.
	'''

	# retrieve contect
	context = retrieval(query=query)

	# Structure the prompt
	promptStart = "\
	Answer the question based on the context below.\n\
	--------\n\
	Context:\n\
	--------\n"

	promptEnd = f"\
	\n--------\n\
	Question:\n\
	{query}\n\
	--------\n\
	Answer:\n"

	prompt = promptStart + f"\n--------\n".join(context) + promptEnd
	
	# return the structured prompt
	return prompt

## RAG in action

In [14]:
# Put everything together to get a working RAG
def RAG(query):
	prompt = structurePrompt(query=query)
	messages = [{
		"role": "user",
		"content": prompt
		}]
	model = "gpt-3.5-turbo-0125"
	response = openAI.chat.completions.create(messages=messages, model=model, temperature=0, )
	responseContent = response.choices[0].message.content
	responseContent = responseContent if responseContent != None else ""
	print(prompt)
	print("-"*30)
	print(responseContent)
	# return responseContent

# choose your query
query = "Who proposed the theory of general relativity?"

# see RAG in action!
RAG(query=query)

	Answer the question based on the context below.
	--------
	Context:
	--------
The team won their first trophy under Ferguson, the 1990 FA Cup, against Crystal Palace in the replay after a 3-3 draw. The next season, United won the UEFA Cup Winners Cup. In 1993, Manchester United won the very first Premier League. In 1999, Manchester United won the treble, made of the Premier League, FA Cup, and UEFA Champions League. Manchester United won the league 7 times again until Sir Alex Ferguson retired after the 2012–13 season. From the 2006–07 season to the 2008–09 season, Man United won the league 3 times in a row. The club also won its third UEFA Champions League in 2008 against Chelsea F.C., and almost won the next season as well but lost to FC Barcelona in the final. They won their first FIFA Club World Cup in December 2008.

Recent history

2013 - 2018 
David Moyes was made manager in 2013. In April 2014, he was sacked by the club and club legend Ryan Giggs became player-manager (when a 