# RAG

In this notebook, we will setup a single Retrieval Augmented Generation model on the wiki-text dataset using DSPy, Chroma DB for vector similiarity search and OPENAI API for text generation.

In [1]:
import dspy
import os

from langchain.text_splitter import SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction, SentenceTransformerEmbeddingFunction

In [2]:
os.chdir('../')

In [3]:
from src.utils import *

In [24]:
os.environ['OPENAI_API_KEY'] = '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'

In [5]:
# Load the model
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

In [6]:
# Read the text
with open('./data/raw/test.txt', 'r') as f:
    text = f.read().strip()

dspy.settings.configure(lm=turbo)

----

## ChromaDB

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=256,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text(text)

print(f"\nTotal chunks: {len(character_split_texts)}\n")


Total chunks: 7850



In [8]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

print(f"\nTotal chunks: {len(token_split_texts)}")




Total chunks: 7850


In [9]:
token_split_texts[1]

'robert boulter is an english film, television and theatre actor. he had a guest @ - @ starring role on the television series the bill in 2000'

In [10]:
embedding_function = SentenceTransformerEmbeddingFunction()


print("Length of embedding:")
print(len(embedding_function([token_split_texts[0]])[0]))


Length of embedding:
384


In [11]:
chroma_client = chromadb.PersistentClient("chroma.db")

In [12]:
# Create a new collection

chroma_collection = chroma_client.get_or_create_collection("test-overlap-0", embedding_function=embedding_function)

ids = [str(i) for i in range(len(token_split_texts))]

In [13]:
chroma_collection.add(ids=ids, documents=token_split_texts)

In [14]:
chroma_client.list_collections()

[Collection(id=5608e2f9-06db-4685-91d2-6a976624f259, name=test),
 Collection(id=bed74561-5648-4f54-a1d1-6cfa23bbe2a4, name=test-overlap-0)]

In [15]:
chroma_collection.peek(1)

{'ids': ['0'],
 'embeddings': [[-0.044287629425525665,
   -0.03816476836800575,
   -0.07295431196689606,
   0.027609042823314667,
   -0.005572275258600712,
   -0.062427498400211334,
   0.06487233936786652,
   0.009721304289996624,
   0.077167809009552,
   -0.03820230811834335,
   -0.022042330354452133,
   0.01082814671099186,
   0.05214729160070419,
   -0.04285610467195511,
   -0.05621715635061264,
   0.041746363043785095,
   -0.05734526365995407,
   0.011496894992887974,
   0.010609745047986507,
   -0.010122554376721382,
   -0.08386776596307755,
   0.05410262569785118,
   -0.008147316053509712,
   0.11363305896520615,
   -0.029890311881899834,
   -0.003911106381565332,
   0.055664364248514175,
   -0.010348550975322723,
   -0.03902805596590042,
   0.026531070470809937,
   0.027695612981915474,
   -0.0019249292090535164,
   -0.00351926451548934,
   -0.039213281124830246,
   0.010268800891935825,
   0.0024343470577150583,
   0.008421522565186024,
   0.12012020498514175,
   0.024217680096

----

In [16]:
query = "Who was Robert Boulter?"

results = chroma_collection.query(query_texts=[query], n_results=2)
retrieved_documents = results['documents'][0]

print(f"Query: {query}")

print(f"\nRetrieved {len(retrieved_documents)} documents\n")

for docs in retrieved_documents:
    print(word_wrap(docs))


Query: Who was Robert Boulter?

Retrieved 2 documents

= robert boulter =
. in a review of the production for the daily telegraph, theatre critic
charles spencer noted, " robert boulter brings a touching vulnerability
to the stage as william. "


In [17]:
query = "Who was Du Fu?"

results = chroma_collection.query(query_texts=[query], n_results=2)
retrieved_documents = results['documents'][0]

print(f"Query: {query}")

print(f"\nRetrieved {len(retrieved_documents)} documents\n")

for docs in retrieved_documents:
    print(word_wrap(docs))

Query: Who was Du Fu?

Retrieved 2 documents

. du fu's conscientiousness compelled him to try to make use of it : he
caused trouble for himself by protesting the removal of his friend and
patron fang guan on a petty charge. he was arrested but was pardoned in
june
= = works = = criticism of du fu's works has focused on his strong
sense of history, his moral engagement, and his technical excellence. =
= = history = = =


In [18]:
query = "When was Robert Boulter active?"

results = chroma_collection.query(query_texts=[query], n_results=2)
retrieved_documents = results['documents'][0]

print(f"Query: {query}")

print(f"\nRetrieved {len(retrieved_documents)} documents\n")

for docs in retrieved_documents:
    print(word_wrap(docs))

Query: When was Robert Boulter active?

Retrieved 2 documents

= robert boulter =
. in a 2006 interview, fellow actor ben whishaw identified boulter as
one of his favorite co @ - @ stars : " i loved working with a guy
called robert boulter, who was in the triple bill of burn, chatroom and
citizenship at the national


In [25]:
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

dspy.settings.configure(lm=turbo)

In [26]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Explain with words between 1 and 5 words")

In [27]:
# Modifying the default RAG module because it doesn't work with the SentenceTransformerEmbeddingFunction
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        self.chroma_collection = chroma_client.get_collection("test")
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.num_passages = num_passages
    
    def forward(self, question):
        context = self.chroma_collection.query(query_texts=[question], n_results=self.num_passages)
        context = context['documents']
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [36]:
os.environ['LANGFUSE_SECRET_KEY'] = 'sk-lf-6f3542d6-53e7-4fd2-b417-e6e2fc0512a0'
os.environ['LANGFUSE_PUBLIC_KEY'] = 'pk-lf-3d36f7c6-2840-40d1-b129-63e075e24226'
os.environ["LANGFUSE_HOST"] = 'https://us.cloud.langfuse.com'

In [37]:
rag = RAG(num_passages=3)

In [38]:
question = "Who was Robert Boulter?"
rag(question)

Prediction(
    context=[['= robert boulter =', '. in a review of the production for the daily telegraph, theatre critic charles spencer noted, " robert boulter brings a touching vulnerability to the stage as william. "', '. in a 2006 interview, fellow actor ben whishaw identified boulter as one of his favorite co @ - @ stars : " i loved working with a guy called robert boulter, who was in the triple bill of burn, chatroom and citizenship at the national']],
    answer='Actor'
)

In [39]:
turbo.inspect_history(n=1)




Answer questions with short factoid answers.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: Explain with words between 1 and 5 words

---

Context: «['= robert boulter =', '. in a review of the production for the daily telegraph, theatre critic charles spencer noted, " robert boulter brings a touching vulnerability to the stage as william. "', '. in a 2006 interview, fellow actor ben whishaw identified boulter as one of his favorite co @ - @ stars : " i loved working with a guy called robert boulter, who was in the triple bill of burn, chatroom and citizenship at the national']»

Question: Who was Robert Boulter?

Reasoning: Let's think step by step in order to Answer: Actor

Answer: Actor





'\n\n\nAnswer questions with short factoid answers.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produce the answer}. We ...\n\nAnswer: Explain with words between 1 and 5 words\n\n---\n\nContext: «[\'= robert boulter =\', \'. in a review of the production for the daily telegraph, theatre critic charles spencer noted, " robert boulter brings a touching vulnerability to the stage as william. "\', \'. in a 2006 interview, fellow actor ben whishaw identified boulter as one of his favorite co @ - @ stars : " i loved working with a guy called robert boulter, who was in the triple bill of burn, chatroom and citizenship at the national\']»\n\nQuestion: Who was Robert Boulter?\n\nReasoning: Let\'s think step by step in order to Answer: Actor\n\nAnswer:\x1b[32m Actor\x1b[0m\n\n\n'

----

### Using the modified ChromaDBRM

In [40]:
from src import chromadb_rm

In [41]:
chroma_rm = chromadb_rm.ChromadbRM(collection_name="test", persist_directory="chroma.db", local_embed_model="sentence-transformers/paraphrase-MiniLM-L6-v2",
                                   openai_api_key=os.environ["OPENAI_API_KEY"])

Collection Count: 7850




config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [29]:
dspy.settings.configure(lm=turbo, rm=chroma_rm)

In [30]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [31]:
rag = RAG(num_passages=3)
question = "Who was Robert Boulter?"
rag(question)

Prediction(
    context=['= robert boulter =', 'robert boulter is an english film, television and theatre actor. he had a guest @ - @ starring role on the television series the bill in 2000. this was followed by a starring role in the play herons written by simon stephens, which was performed in 2001 at the royal court theatre. he had a guest role in the television series judge john deed in 2002', 'in 2006 boulter starred in the play citizenship written by mark ravenhill. the play was part of a series which featured different playwrights, titled burn / chatroom / citizenship. in a 2006 interview, fellow actor ben whishaw identified boulter as one of his favorite co @ - @ stars : " i loved working with a guy called robert boulter, who was in the triple bill of burn, chatroom and citizenship at the national. he played my brother in mercury fur'],
    answer='English actor'
)

In [32]:
from dspy.datasets import HotPotQA

# Load the dataset.
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

len(trainset), len(devset)

  table = cls._concat_blocks(blocks, axis=0)


(20, 50)

In [33]:
from dspy.teleprompt import BootstrapFewShot

# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]

Bootstrapped 0 full traces after 20 examples in round 0.





In [34]:
# Ask any question you like to this simple RAG program.
my_question = "Who was Robert Boulter?"

# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(my_question)

# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: Who was Robert Boulter?
Predicted Answer: English actor
Retrieved Contexts (truncated): ['= robert boulter =...', 'robert boulter is an english film, television and theatre actor. he had a guest @ - @ starring role on the television series the bill in 2000. this was followed by a starring role in the play herons w...', 'in 2006 boulter starred in the play citizenship written by mark ravenhill. the play was part of a series which featured different playwrights, titled burn / chatroom / citizenship. in a 2006 interview...']


In [35]:
turbo.inspect_history(n=1)





Answer questions with short factoid answers.

---

Question: Which magazine has published articles by Scott Shaw, Tae Kwon Do Times or Southwest Art?
Answer: Tae Kwon Do Times

Question: This American guitarist best known for her work with the Iron Maidens is an ancestor of a composer who was known as what?
Answer: The Waltz King

Question: On the coast of what ocean is the birthplace of Diogal Sakho?
Answer: Atlantic

Question: The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?
Answer: 1950

Question: The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas founded in what year?
Answer: 2010

Question: Which company distributed this 1977 American animated film produced by Walt Disney Productions for which Sherman Brothers wrote songs?
Answer: Buena Vista Distribution

Question: Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field

In [36]:
for name, parameter in compiled_rag.named_predictors():
    print(name)
    print(parameter.demos[0])
    print()

generate_answer
Example({'question': 'Which magazine has published articles by Scott Shaw, Tae Kwon Do Times or Southwest Art?', 'answer': 'Tae Kwon Do Times'}) (input_keys={'question'})



----