In [1]:
from dspy.datasets import HotPotQA
import dspy
dataset = HotPotQA(train_seed=1, train_size=1000, eval_seed=0, test_size=0)


  table = cls._concat_blocks(blocks, axis=0)


In [2]:
# Tell DSPy that the `question` field is the input. Any other fields are labels.
dataset = [x.with_inputs('question') for x in dataset.train]
print(len(dataset))

1000


In [3]:
print(dataset[0])

Example({'question': 'At My Window was released by which American singer-songwriter?', 'answer': 'John Townes Van Zandt'}) (input_keys={'question'})


In [4]:
from dspy.retrieve.chromadb_rm import ChromadbRM
import chromadb
from chromadb.utils import embedding_functions


chroma_client = chromadb.PersistentClient(path='./hotpotqa')
default_ef = embedding_functions.DefaultEmbeddingFunction()

docs = [x.question + " -> " + x.answer for x in dataset]
ids = [f"id{i}" for i in range(len(docs))]
collection = chroma_client.get_or_create_collection(name='hotpotqa', embedding_function=default_ef)

In [5]:
collection.add(
    documents=docs,
    ids=ids
)

Add of existing embedding ID: id0
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id3
Add of existing embedding ID: id4
Add of existing embedding ID: id5
Add of existing embedding ID: id6
Add of existing embedding ID: id7
Add of existing embedding ID: id8
Add of existing embedding ID: id9
Add of existing embedding ID: id10
Add of existing embedding ID: id11
Add of existing embedding ID: id12
Add of existing embedding ID: id13
Add of existing embedding ID: id14
Add of existing embedding ID: id15
Add of existing embedding ID: id16
Add of existing embedding ID: id17
Add of existing embedding ID: id18
Add of existing embedding ID: id19
Add of existing embedding ID: id20
Add of existing embedding ID: id21
Add of existing embedding ID: id22
Add of existing embedding ID: id23
Add of existing embedding ID: id24
Add of existing embedding ID: id25
Add of existing embedding ID: id26
Add of existing embedding ID: id27
Add of existing embedding ID: 

In [6]:
retrieval_model = ChromadbRM(
    collection_name='hotpotqa',
    persist_directory='./hotpotqa',
    embedding_function=default_ef,
    k=3,
    client=chroma_client
)

In [49]:
print(retrieval_model("Woozles"))

[{'id': 'id860', 'score': 1.1315628290176392, 'long_text': '"Heffalumps and Woozles" is a song from a full-length animated film distributed by who? -> Buena Vista Distribution', 'metadatas': None}, {'id': 'id291', 'score': 1.6422317028045654, 'long_text': 'Are both Jimmie Ross and Jang Hyun-seung Americans? -> no', 'metadatas': None}, {'id': 'id905', 'score': 1.652337670326233, 'long_text': 'What Boo Ji-Young film was about employees of a retail supermarket who band together when the contract workers are laid off? -> Cart', 'metadatas': None}]


In [8]:
ollama_model = dspy.OllamaLocal(
    model='phi3',
    model_type='text',
    max_tokens=350,
    temperature=0.7,
    top_p=0.9,
    frequency_penalty=1.17,
    top_k=40
)

In [9]:
ollama_model("Tell me about the weather on jupiter?")

[' Jupiter, being a gas giant, does not have a solid surface like Earth. Instead, it has layers of clouds composed mainly of hydrogen compounds such as ammonia and water ice. The "weather" on Jupiter is fascinating and quite extreme compared to anything we experience here on our home planet.\n\n1. Stratosphere: Above the dense cloud layer lies a stratosphere, where temperature actually increases with altitude due to absorption of ultraviolet radiation by methane in this region. This creates an unstable environment that can lead to turbulence and strong winds when mixed with other atmospheric layers below it.\n\n2. Troposphere: The lowest layer, the troposphere, is where most weather activity occurs on Jupiter like storm systems and cloud formations are found here as well due to convection currents driven by internal heat from the planet\'s core (~180 million Kelvin). \n\n3. Storm Systems: One of the most prominent features in Jovian weather is its massive, persistent storm system known

In [74]:
# Get Top passages

In [10]:
dspy.settings.configure(rm=retrieval_model, lm=ollama_model)

In [11]:
dev_example = dataset[100]
dev_example

Example({'question': "What was a previous unoffical name for the high performance variant of Audi's compact executive car?", 'answer': 'Audi Ur-S4'}) (input_keys={'question'})

In [50]:
def get_top_passages(question):
    retrieve = dspy.Retrieve(k=3)
    topK_passages = retrieve(question, 3).passages
    print(f"Top {retrieve.k} passages for question : {question} \n", '-'*30, '\n')
    for idx, passage in enumerate(topK_passages):
        print(f"{idx+1}]", passage, '\n')
        

In [51]:
get_top_passages(dev_example.question)

Top 3 passages for question : What was a previous unoffical name for the high performance variant of Audi's compact executive car? 
 ------------------------------ 

1] What was a previous unoffical name for the high performance variant of Audi's compact executive car? -> Audi Ur-S4 

2] What engine designed to power medium duty trucks and heavy cars was also used in Chevy's intermediate and pony car? -> The Chevrolet "Big Block" 

3] Until 2006, the Nissan Sentra was a rebadged export version of a Japanese car that was first built in what year? -> 1966 



In [14]:
# Define Signatures for Input and Output

In [31]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""
    context = dspy.InputField(desc="may or maynot contain relevant facts or answer keywords")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="an answer between 10 to 20 words")


In [32]:
ga = GenerateAnswer(context="My name is Santa and I give out gift to children",
                    question='What is my name?',
                    answer='Santa')

In [34]:
print(ga.model_construct)

<bound method BaseModel.model_construct of GenerateAnswer(context, question -> answer
    instructions='Answer questions with short factoid answers.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'may or maynot contain relevant facts or answer keywords', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'an answer between 10 to 20 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)>


In [35]:
# Create a DSPY Cot Module

In [36]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)


In [37]:
uncompiled_rag = RAG()

In [38]:
dataset[91],dataset[100],dataset[6]


(Example({'question': "Who has been on a British television music competition show and was was most popular in the 80's with the pop band 'Culture Club'?", 'answer': "George Alan O'Dowd"}) (input_keys={'question'}),
 Example({'question': "What was a previous unoffical name for the high performance variant of Audi's compact executive car?", 'answer': 'Audi Ur-S4'}) (input_keys={'question'}),
 Example({'question': 'Which is taller, the Empire State Building or the Bank of America Tower?', 'answer': 'The Empire State Building'}) (input_keys={'question'}))

In [39]:
# Some Queries

In [40]:
test_question = "Was George Alan O'Dowd the most popular in the late 2000s with his rock band?"

response = uncompiled_rag(test_question)
print(response.answer)


No, George Alan O'Dowd was not most popular in the late 2000s with Culture Club as they were prominent in the 80s.


In [41]:
ollama_model.inspect_history(n=1)




Answer questions with short factoid answers.

---

Follow the following format.

Context: may or maynot contain relevant facts or answer keywords

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: an answer between 10 to 20 words

---

Context:
[1] «Who has been on a British television music competition show and was was most popular in the 80's with the pop band 'Culture Club'? -> George Alan O'Dowd»
[2] «Alan Forbes has done posters for an American rock band that formed in 1996 in what city in California? -> Palm Desert»
[3] «Who was dubbed the father of the type of rock music that emerged from post-punk in the late 1970s? -> Brian Healy»

Question: Was George Alan O'Dowd the most popular in the late 2000s with his rock band?

Reasoning: Let's think step by step in order to Context indicates George Alan O'Dowd was famous in the 80's. Reasoning shows we need a fact about him and 'Culture Club'. Answer: No, he wasn't popular

"\n\n\nAnswer questions with short factoid answers.\n\n---\n\nFollow the following format.\n\nContext: may or maynot contain relevant facts or answer keywords\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: an answer between 10 to 20 words\n\n---\n\nContext:\n[1] «Who has been on a British television music competition show and was was most popular in the 80's with the pop band 'Culture Club'? -> George Alan O'Dowd»\n[2] «Alan Forbes has done posters for an American rock band that formed in 1996 in what city in California? -> Palm Desert»\n[3] «Who was dubbed the father of the type of rock music that emerged from post-punk in the late 1970s? -> Brian Healy»\n\nQuestion: Was George Alan O'Dowd the most popular in the late 2000s with his rock band?\n\nReasoning: Let's think step by step in order to Context indicates George Alan O'Dowd was famous in the 80's. Reasoning shows we need a fact about him and 'Culture Club'. An

In [42]:
# ADVANCEDm

In [44]:
# compile

dataset_dev = [dataset[91],
dataset[100],
dataset[6]]

from dspy.teleprompt import BootstrapFewShot

# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=dataset_dev)

100%|██████████| 3/3 [00:04<00:00,  1.41s/it]


In [45]:
# Ask any question you like to this simple RAG program.

my_question = "Was George Alan O'Dowd the most popular in the early 1980s with his pop band?"
# my_question = "which segment of Audi's car was named as Ur-S4?"
# my_question = "is Bank of America Tower taller than empire state building?"

# get_top_passages(my_question)

# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(my_question)

# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: Was George Alan O'Dowd the most popular in the early 1980s with his pop band?
Predicted Answer: Yes, George Alan O'Dowd was prominent in the early 1980s through his association with Culture Club.
Retrieved Contexts (truncated): ["Who has been on a British television music competition show and was was most popular in the 80's with the pop band 'Culture Club'? -> George Alan O'Dowd...", 'The Fridge had booked such acts as the English singer and DJ who was the lead singer of what pop band? -> Culture Club...', 'Alan Forbes has done posters for an American rock band that formed in 1996 in what city in California? -> Palm Desert...']


In [47]:
print(ollama_model.inspect_history(1))





Answer questions with short factoid answers.

---

Question: Which is taller, the Empire State Building or the Bank of America Tower?
Answer: The Empire State Building

Question: Who has been on a British television music competition show and was was most popular in the 80's with the pop band 'Culture Club'?
Answer: George Alan O'Dowd

---

Follow the following format.

Context: may or maynot contain relevant facts or answer keywords

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: an answer between 10 to 20 words

---

Context:
[1] «What was a previous unoffical name for the high performance variant of Audi's compact executive car? -> Audi Ur-S4»
[2] «What engine designed to power medium duty trucks and heavy cars was also used in Chevy's intermediate and pony car? -> The Chevrolet "Big Block"»
[3] «Until 2006, the Nissan Sentra was a rebadged export version of a Japanese car that was first built in what year? -> 1966»

