In [1]:
# > https://www.youtube.com/watch?v=41EfOY0Ldkc

In [2]:
# Set open ai api key
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")

In [3]:
import dspy

In [4]:
turbo = dspy.OpenAI(model='gpt-3.5-turbo', api_key=OPEN_AI_API_KEY)
colbertv2_wiki17_abstract = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstract)

In [5]:
from dspy.datasets import HotPotQA

# Load the dataset.
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

len(trainset), len(devset)

README.md:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

hotpot_qa.py:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/566M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

(20, 50)

In [6]:
train_example = trainset[1]
print(f"Question: {train_example.question}")
print(f"Answer: {train_example.answer}")

Question: which  American actor was Candace Kita  guest starred with 
Answer: Bill Murray


In [7]:
dev_example = devset[18]
print(f"Question: {dev_example.question}")
print(f"Answer: {dev_example.answer}")
print(f"Relevant Wikipedia Titles: {dev_example.gold_titles}")

Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
Answer: English
Relevant Wikipedia Titles: {'Robert Irvine', 'Restaurant: Impossible'}


In [8]:
print(f"For this dataset, training examples have input keys {train_example.inputs().keys()}")
print(f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and labels")

For this dataset, training examples have input keys ['question']
For this dataset, dev examples have input keys ['question'] and labels


In [9]:
class BasicQA(dspy.Signature):
    """Answers with short factoid answers."""

    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [11]:
# Define the predictor.
generate_answer = dspy.Predict(BasicQA)

# Call the predictor on a particular label
pred = generate_answer(question=dev_example.question)

# Print the input and the prediction.
print(f"Question: {dev_example.question}")
print(f"Answer: {pred.answer}")

 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb


Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
Answer: American


In [12]:
turbo.inspect_history(n=1)




Answers with short factoid answers.

---

Follow the following format.

Question: ${question}
Answer: often between 1 and 5 words

---

Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
Answer:[32m American[0m





'\n\n\nAnswers with short factoid answers.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\nAnswer: often between 1 and 5 words\n\n---\n\nQuestion: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?\nAnswer:\x1b[32m American\x1b[0m\n\n\n'

In [13]:
# Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged
generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)

# Call the predictor on the same input.
pred = generate_answer_with_chain_of_thought(question=dev_example.question)

# Print the input, the chain of thought, and the prediction.
print(f"Question: {dev_example.question}")
print(f"Thought: {pred.rationale.split('.', 1)[1].strip()}")
print(f"Predicted Answer: {pred.answer}")

Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
Thought: We know that the chef and restaurateur featured in Restaurant: Impossible is British.
Predicted Answer: British


In [14]:
turbo.inspect_history(n=1)




Answers with short factoid answers.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: often between 1 and 5 words

---

Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
Reasoning: Let's think step by step in order to[32m produce the answer. We know that the chef and restaurateur featured in Restaurant: Impossible is British.
Answer: British[0m





"\n\n\nAnswers with short factoid answers.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\nAnswer: often between 1 and 5 words\n\n---\n\nQuestion: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?\nReasoning: Let's think step by step in order to\x1b[32m produce the answer. We know that the chef and restaurateur featured in Restaurant: Impossible is British.\nAnswer: British\x1b[0m\n\n\n"

In [16]:
retrieve = dspy.Retrieve(k=3)
topK_passages = retrieve(dev_example.question).passages

print(f"Top {retrieve.k} passages for question: {dev_example.question} \n", "-" * 30, '\n')

for idx, passage in enumerate(topK_passages):
    print(f"{idx+1}", passage, '\n')

Top 3 passages for question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible? 
 ------------------------------ 

1 Restaurant: Impossible | Restaurant: Impossible is an American reality television series, featuring chef and restaurateur Robert Irvine, that aired on Food Network from 2011 to 2016. 

2 Jean Joho | Jean Joho is a French-American chef and restaurateur. He is chef/proprietor of Everest in Chicago (founded in 1986), Paris Club Bistro & Bar and Studio Paris in Chicago, The Eiffel Tower Restaurant in Las Vegas, and Brasserie JO in Boston. 

3 List of Restaurant: Impossible episodes | This is the list of the episodes for the American cooking and reality television series "Restaurant Impossible", produced by Food Network. The premise of the series is that within two days and on a budget of $10,000, celebrity chef Robert Irvine renovates a failing American restaurant with the goal of helping to restore it to profitability and prominence. Ir

In [17]:
retrieve("When was the first FIFA world cup held?").passages[0]

'History of the FIFA World Cup | The FIFA World Cup was first held in 1930, when FIFA president Jules Rimet decided to stage an international football tournament. The inaugural edition, held in 1930, was contested as a final tournament of only thirteen teams invited by the organization. Since then, the World Cup has experienced successive expansions and format remodeling to its current 32-team final tournament preceded by a two-year qualifying process, involving over 200 teams from around the world.'

In [18]:
class GenerateAnswer(dspy.Signature):
    """Answers with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [19]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought("question, context -> answer")

    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [20]:
from dspy.teleprompt import BootstrapFewShot

# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_FM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_FM

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

 60%|██████    | 12/20 [00:09<00:06,  1.24it/s]

Bootstrapped 4 full traces after 12 examples for up to 1 rounds, amounting to 12 attempts.





In [21]:
# Ask any question you like to this simple RAG program.
my_question = "What castle did David Gregory inherit?"

# Get the prediction. This contains 'pred.context' and 'pred.'
pred = compiled_rag(my_question)

# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: What castle did David Gregory inherit?
Predicted Answer: Kinnairdy Castle
Retrieved Contexts (truncated): ['David Gregory (physician) | David Gregory (20 December 1625 – 1720) was a Scottish physician and inventor. His surname is sometimes spelt as Gregorie, the original Scottish spelling. He inherited Kinn...', 'Gregory Tarchaneiotes | Gregory Tarchaneiotes (Greek: Γρηγόριος Ταρχανειώτης , Italian: "Gregorio Tracanioto" or "Tracamoto" ) was a "protospatharius" and the long-reigning catepan of Italy from 998 t...', 'David Gregory (mathematician) | David Gregory (originally spelt Gregorie) FRS (? 1659 – 10 October 1708) was a Scottish mathematician and astronomer. He was professor of mathematics at the University ...']


In [22]:
turbo.inspect_history(n=1)




Given the fields `question`, `context`, produce the fields `answer`.

---

Question: At My Window was released by which American singer-songwriter?
Answer: John Townes Van Zandt

Question: "Everything Has Changed" is a song from an album released under which record label ?
Answer: Big Machine Records

Question: The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?
Answer: 1950

Question: Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?
Answer: Aleem Sarwar Dar

Question: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?
Answer: "Outfield of Dreams"

Question: Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?
Answer: Aleksandr Danilovich Aleksandrov

Question: The Organisation that allows a community to influence their operation or use and

'\n\n\nGiven the fields `question`, `context`, produce the fields `answer`.\n\n---\n\nQuestion: At My Window was released by which American singer-songwriter?\nAnswer: John Townes Van Zandt\n\nQuestion: "Everything Has Changed" is a song from an album released under which record label ?\nAnswer: Big Machine Records\n\nQuestion: The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?\nAnswer: 1950\n\nQuestion: Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?\nAnswer: Aleem Sarwar Dar\n\nQuestion: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?\nAnswer: "Outfield of Dreams"\n\nQuestion: Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?\nAnswer: Aleksandr Danilovich Aleksandrov\n\nQuestion: The Organisation that allows a community to influence 

In [23]:
for name, parameter in compiled_rag.named_predictors():
    print(name)
    print(parameter.demos[0])
    print()

generate_answer
Example({'augmented': True, 'context': ['Tae Kwon Do Times | Tae Kwon Do Times is a magazine devoted to the martial art of taekwondo, and is published in the United States of America. While the title suggests that it focuses on taekwondo exclusively, the magazine also covers other Korean martial arts. "Tae Kwon Do Times" has published articles by a wide range of authors, including He-Young Kimm, Thomas Kurz, Scott Shaw, and Mark Van Schuyver.', "Kwon Tae-man | Kwon Tae-man (born 1941) was an early Korean hapkido practitioner and a pioneer of the art, first in Korea and then in the United States. He formed one of the earliest dojang's for hapkido in the United States in Torrance, California, and has been featured in many magazine articles promoting the art.", 'Hee Il Cho | Cho Hee Il (born October 13, 1940) is a prominent Korean-American master of taekwondo, holding the rank of 9th "dan" in the martial art. He has written 11 martial art books, produced 70 martial art tra

In [24]:
from dspy.evaluate.evaluate import Evaluate

# Set up the 'evaluate_on_hotpotqa' function. We'll use this many
evaluate_on_hotpotqa = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=True)

# Evaluate the 'compiled_rag' program with the 'answer_exact_match' mettric.
metric = dspy.evaluate.answer_exact_match
evaluate_on_hotpotqa(compiled_rag, metric=metric)

Average Metric: 21.00 / 50 (42.0%): 100%|██████████| 50/50 [00:44<00:00,  1.13it/s]

2025/01/18 23:01:03 INFO dspy.evaluate.evaluate: Average Metric: 21 / 50 (42.0%)





Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,answer_exact_match
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{Qionghai, Cangzhou}",['Cangzhou | Cangzhou () is a prefecture-level city in eastern Heb...,"No, only Cangzhou is in the Hebei province of China.",
1,Who conducts the draft in which Marc-Andre Fleury was drafted to t...,National Hockey League,"{2017–18 Pittsburgh Penguins season, 2017 NHL Expansion Draft}",['2017–18 Pittsburgh Penguins season | The 2017–18 Pittsburgh Peng...,National Hockey League,✔️ [True]
2,"The Wings entered a new era, following the retirement of which Can...",Steve Yzerman,"{2006–07 Detroit Red Wings season, Steve Yzerman}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, ...",Steve Yzerman,✔️ [True]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{Crichton Collegiate Church, Crichton Castle}","[""Crichton Collegiate Church | Crichton Collegiate Church is situa...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by ...,King Alfred the Great,"{Ealhswith, Æthelweard (son of Alfred)}","[""Æthelweard of East Anglia | Æthelweard (died 854) was a 9th-cent...",Alfred the Great,
5,The Newark Airport Exchange is at the northern edge of an airport ...,Port Authority of New York and New Jersey,"{Newark Airport Interchange, Newark Liberty International Airport}",['Newark Airport Interchange | The Newark Airport Interchange is a...,Port Authority of New York and New Jersey,✔️ [True]
6,Where did an event take place resulting in a win during a domestic...,Bundesliga,"{Claudio Pizarro, 2005–06 FC Bayern Munich season}",['List of Peru international footballers | Peru took part in the i...,Maracanã Stadium,
7,Are both Chico Municipal Airport and William R. Fairchild Internat...,no,"{Chico Municipal Airport, William R. Fairchild International Airport}",['William R. Fairchild International Airport | William R. Fairchil...,"No, only Chico Municipal Airport is in California.",
8,In which Maine county is Fort Pownall located?,"Waldo County, Maine","{Stockton Springs, Maine, Fort Pownall}","[""Fort Pownall | Fort Pownall was a British fortification built du...",Waldo County,
9,"Which 90s rock band has more recently reformed, Gene or The Afghan...",The Afghan Whigs,"{Gene (band), The Afghan Whigs}",['The Afghan Whigs | The Afghan Whigs are an American rock band fr...,The Afghan Whigs,✔️ [True]


42.0

In [25]:
def gold_passages_retrieved(example, pred, trace=None):
    gold_titles = set(map(dspy.evaluate.normalize_text, example['gold_titles']))
    found_titles = set(map(dspy.evaluate.normalize_text, [c.split(' | ')[0] for c in pred.context]))

    return gold_titles.issubset(found_titles)

compiled_rag_retrieval_score = evaluate_on_hotpotqa(compiled_rag, metric=gold_passages_retrieved)

Average Metric: 13.00 / 50 (26.0%): 100%|██████████| 50/50 [00:00<00:00, 1301.04it/s]

2025/01/18 23:02:06 INFO dspy.evaluate.evaluate: Average Metric: 13 / 50 (26.0%)





Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,gold_passages_retrieved
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{Qionghai, Cangzhou}",['Cangzhou | Cangzhou () is a prefecture-level city in eastern Heb...,"No, only Cangzhou is in the Hebei province of China.",
1,Who conducts the draft in which Marc-Andre Fleury was drafted to t...,National Hockey League,"{2017–18 Pittsburgh Penguins season, 2017 NHL Expansion Draft}",['2017–18 Pittsburgh Penguins season | The 2017–18 Pittsburgh Peng...,National Hockey League,✔️ [True]
2,"The Wings entered a new era, following the retirement of which Can...",Steve Yzerman,"{2006–07 Detroit Red Wings season, Steve Yzerman}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, ...",Steve Yzerman,✔️ [True]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{Crichton Collegiate Church, Crichton Castle}","[""Crichton Collegiate Church | Crichton Collegiate Church is situa...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by ...,King Alfred the Great,"{Ealhswith, Æthelweard (son of Alfred)}","[""Æthelweard of East Anglia | Æthelweard (died 854) was a 9th-cent...",Alfred the Great,
5,The Newark Airport Exchange is at the northern edge of an airport ...,Port Authority of New York and New Jersey,"{Newark Airport Interchange, Newark Liberty International Airport}",['Newark Airport Interchange | The Newark Airport Interchange is a...,Port Authority of New York and New Jersey,✔️ [True]
6,Where did an event take place resulting in a win during a domestic...,Bundesliga,"{Claudio Pizarro, 2005–06 FC Bayern Munich season}",['List of Peru international footballers | Peru took part in the i...,Maracanã Stadium,
7,Are both Chico Municipal Airport and William R. Fairchild Internat...,no,"{Chico Municipal Airport, William R. Fairchild International Airport}",['William R. Fairchild International Airport | William R. Fairchil...,"No, only Chico Municipal Airport is in California.",✔️ [True]
8,In which Maine county is Fort Pownall located?,"Waldo County, Maine","{Stockton Springs, Maine, Fort Pownall}","[""Fort Pownall | Fort Pownall was a British fortification built du...",Waldo County,
9,"Which 90s rock band has more recently reformed, Gene or The Afghan...",The Afghan Whigs,"{Gene (band), The Afghan Whigs}",['The Afghan Whigs | The Afghan Whigs are an American rock band fr...,The Afghan Whigs,


In [26]:
# Multi-Hop Search

In [46]:
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # answer = dspy.OutputField()
    query = dspy.OutputField(desc="a query to search for the answer")

In [47]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        # self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        self.generate_query = dspy.ChainOfThought(GenerateSearchQuery)
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops

    def forward(self, question):
        context = []

        for hop in range(self.max_hops):
            query = self.generate_query(context=context, question=question).query
            passages = self.retrieve(query).passages
            context = deduplicate(context + passages)

        pred = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=pred.answer)

In [45]:
# Ask any question you like to this simple RAG program.
my_question = "How many storeys are in the castle that David Gregory inherited?"

# Get the prediction. This contains 'pred.context' and 'pred.'
uncompiled_baleen = SimplifiedBaleen()  # uncompiled (i.e. zero-shot) program
pred = uncompiled_baleen(my_question)

# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '~~~' for c in pred.context]}")

"David Gregory castle inherited number of storeys"
"Kinnairdy Castle description"
Question: How many storeys are in the castle that David Gregory inherited?
Predicted Answer: five
Retrieved Contexts (truncated): ['David Gregory (physician) | David Gregory (20 December 1625 – 1720) was a Scottish physician and inventor. His surname is sometimes spelt as Gregorie, the original Scottish spelling. He inherited Kinn~~~', 'David Gregory (footballer, born 1970) | Born in Polstead, Gregory began his career at Ipswich Town, making 32 appearances between 1987–1995. He made two appearances on loan at Hereford United and thre~~~', 'Roughan Castle | Roughan Castle is a castle a mile outside Newmills, County Tyrone, Northern Ireland, on the Dungannon to Stewartstown road. It was built about 1618 by Sir Andrew Stewart (d.1639), 2nd~~~', 'Kinnairdy Castle | Kinnairdy Castle is a tower house, having five storeys and a garret, two miles south of Aberchirder, Aberdeenshire, Scotland. The alternative name

In [48]:
turbo.inspect_history(n=3)




Write a simple search query that will help answer a complex question.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the query}. We ...

Query: a query to search for the answer

---

Context: N/A

Question: How many storeys are in the castle that David Gregory inherited?

Reasoning: Let's think step by step in order to[32m produce the query. We know that David Gregory inherited a castle, and we are specifically interested in the number of storeys it has. To find this information, we need to search for details about the castle's architecture.

Query: "David Gregory castle inherited number of storeys"[0m





Write a simple search query that will help answer a complex question.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the query}. We ...

Query: a query to search for 

'\n\n\nWrite a simple search query that will help answer a complex question.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produce the query}. We ...\n\nQuery: a query to search for the answer\n\n---\n\nContext: N/A\n\nQuestion: How many storeys are in the castle that David Gregory inherited?\n\nReasoning: Let\'s think step by step in order to\x1b[32m produce the query. We know that David Gregory inherited a castle, and we are specifically interested in the number of storeys it has. To find this information, we need to search for details about the castle\'s architecture.\n\nQuery: "David Gregory castle inherited number of storeys"\x1b[0m\n\n\n\n\n\nWrite a simple search query that will help answer a complex question.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produ

In [49]:
def validate_context_and_answer_and_hops(example, pred, trace=None):
    if not dspy.evaluate.answer_exact_match(example, pred): return False
    if not dspy.evaluate.answer_passage_match(example, pred): return False

    hops = [example.question] + [outputs.query for *_, outputs in trace if 'query' in outputs]

    if max([len(h) for h in hops]) > 100: return False
    if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(hops)): return False

    return True

In [50]:
teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops)
compiled_baleen = teleprompter.compile(SimplifiedBaleen(), teacher=SimplifiedBaleen(passages_per_hop=2), trainset=trainset)

 25%|██▌       | 5/20 [00:12<00:37,  2.51s/it]2025/01/18 23:12:12 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'question': 'In what year was the club founded that played Manchester City in the 1972 FA Charity Shield', 'answer': '1874'}) (input_keys={'question'}) with <function validate_context_and_answer_and_hops at 0x32d9cfe20> due to 'list' object cannot be interpreted as an integer.
 30%|███       | 6/20 [00:14<00:34,  2.45s/it]2025/01/18 23:12:15 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'question': 'Which is taller, the Empire State Building or the Bank of America Tower?', 'answer': 'The Empire State Building'}) (input_keys={'question'}) with <function validate_context_and_answer_and_hops at 0x32d9cfe20> due to 'list' object cannot be interpreted as an integer.
 85%|████████▌ | 17/20 [00:44<00:07,  2.51s/it]2025/01/18 23:12:44 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'quest

Bootstrapped 0 full traces after 19 examples for up to 1 rounds, amounting to 20 attempts.





In [51]:
uncompiled_baleen_retrieval_score = evaluate_on_hotpotqa(uncompiled_baleen, metric=gold_passages_retrieved)

  0%|          | 0/50 [00:00<?, ?it/s]"Location of Cangzhou China" AND "Location of Qionghai China"
"Location of Qionghai China"
Average Metric: 1.00 / 50 (2.0%):   2%|▏         | 1/50 [00:02<02:21,  2.89s/it]"2017 NHL Expansion Draft Vegas Golden Knights"
"2017 NHL Expansion Draft Marc-Andre Fleury draft conductor"
Average Metric: 2.00 / 50 (4.0%):   4%|▍         | 2/50 [00:05<02:00,  2.52s/it]"Canadian retired professional ice hockey player current general manager Tampa Bay Lightning"
"Retired Canadian ice hockey player and current general manager of the Tampa Bay Lightning" -Steve Yzerman
Average Metric: 2.00 / 50 (4.0%):   6%|▌         | 3/50 [00:08<02:06,  2.70s/it]"rivers near Crichton Collegiate Church"
River near Crichton Castle Scotland
Average Metric: 3.00 / 50 (6.0%):   8%|▊         | 4/50 [00:10<02:01,  2.65s/it]"English kings in the 10th Century A.D."
"Lineage of King Alfred the Great of Wessex"
Average Metric: 3.00 / 50 (6.0%):  10%|█         | 5/50 [00:12<01:54,  2.54s/i

2025/01/18 23:15:47 INFO dspy.evaluate.evaluate: Average Metric: 30 / 50 (60.0%)





Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,gold_passages_retrieved
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{Qionghai, Cangzhou}",['Cangzhou | Cangzhou () is a prefecture-level city in eastern Heb...,No,✔️ [True]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to t...,National Hockey League,"{2017–18 Pittsburgh Penguins season, 2017 NHL Expansion Draft}","[""2017 NHL Expansion Draft | The 2017 NHL Expansion Draft was an e...",National Hockey League,✔️ [True]
2,"The Wings entered a new era, following the retirement of which Can...",Steve Yzerman,"{2006–07 Detroit Red Wings season, Steve Yzerman}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, ...",Steve Yzerman,
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{Crichton Collegiate Church, Crichton Castle}","[""Crichton Collegiate Church | Crichton Collegiate Church is situa...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by ...,King Alfred the Great,"{Ealhswith, Æthelweard (son of Alfred)}",['10th century in England | Events from the 10th century in the Ki...,Alfred the Great,
5,The Newark Airport Exchange is at the northern edge of an airport ...,Port Authority of New York and New Jersey,"{Newark Airport Interchange, Newark Liberty International Airport}",['Newark Liberty International Airport | Newark Liberty Internatio...,Port Authority of New York and New Jersey,✔️ [True]
6,Where did an event take place resulting in a win during a domestic...,Bundesliga,"{Claudio Pizarro, 2005–06 FC Bayern Munich season}",['2015 South American Championships in Athletics | The 2015 South ...,Videna Stadium,
7,Are both Chico Municipal Airport and William R. Fairchild Internat...,no,"{Chico Municipal Airport, William R. Fairchild International Airport}",['William R. Fairchild International Airport | William R. Fairchil...,No.,✔️ [True]
8,In which Maine county is Fort Pownall located?,"Waldo County, Maine","{Stockton Springs, Maine, Fort Pownall}","[""Fort Pownall | Fort Pownall was a British fortification built du...",Waldo County,✔️ [True]
9,"Which 90s rock band has more recently reformed, Gene or The Afghan...",The Afghan Whigs,"{Gene (band), The Afghan Whigs}",['Unbreakable: A Retrospective 1990–2006 | Unbreakable: A Retrospe...,The Afghan Whigs,✔️ [True]


In [52]:
compiled_baleen_retrieval_score = evaluate_on_hotpotqa(compiled_baleen, metric=gold_passages_retrieved)

Average Metric: 30.00 / 50 (60.0%): 100%|██████████| 50/50 [00:35<00:00,  1.42it/s]

2025/01/18 23:16:23 INFO dspy.evaluate.evaluate: Average Metric: 30 / 50 (60.0%)





Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,gold_passages_retrieved
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{Qionghai, Cangzhou}",['Cangzhou | Cangzhou () is a prefecture-level city in eastern Heb...,No,✔️ [True]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to t...,National Hockey League,"{2017–18 Pittsburgh Penguins season, 2017 NHL Expansion Draft}","[""2017 NHL Expansion Draft | The 2017 NHL Expansion Draft was an e...",National Hockey League,✔️ [True]
2,"The Wings entered a new era, following the retirement of which Can...",Steve Yzerman,"{2006–07 Detroit Red Wings season, Steve Yzerman}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, ...",Steve Yzerman,
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{Crichton Collegiate Church, Crichton Castle}","[""Crichton Collegiate Church | Crichton Collegiate Church is situa...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by ...,King Alfred the Great,"{Ealhswith, Æthelweard (son of Alfred)}",['10th century in England | Events from the 10th century in the Ki...,Alfred the Great,
5,The Newark Airport Exchange is at the northern edge of an airport ...,Port Authority of New York and New Jersey,"{Newark Airport Interchange, Newark Liberty International Airport}",['Newark Liberty International Airport | Newark Liberty Internatio...,Port Authority of New York and New Jersey,✔️ [True]
6,Where did an event take place resulting in a win during a domestic...,Bundesliga,"{Claudio Pizarro, 2005–06 FC Bayern Munich season}",['2015 South American Championships in Athletics | The 2015 South ...,2015 Torneo del Inca,
7,Are both Chico Municipal Airport and William R. Fairchild Internat...,no,"{Chico Municipal Airport, William R. Fairchild International Airport}",['William R. Fairchild International Airport | William R. Fairchil...,No,✔️ [True]
8,In which Maine county is Fort Pownall located?,"Waldo County, Maine","{Stockton Springs, Maine, Fort Pownall}","[""Fort Pownall | Fort Pownall was a British fortification built du...",Waldo County,✔️ [True]
9,"Which 90s rock band has more recently reformed, Gene or The Afghan...",The Afghan Whigs,"{Gene (band), The Afghan Whigs}",['Unbreakable: A Retrospective 1990–2006 | Unbreakable: A Retrospe...,The Afghan Whigs,✔️ [True]


In [53]:
print(f"## Retrieval Score for RAG: {compiled_baleen_retrieval_score}")  # note that for RAG, c
print(f"## Retrieval Score for uncompiled Baleen: {uncompiled_baleen_retrieval_score}")
print(f"## Retrieval Score for compiled Baleen: {compiled_baleen_retrieval_score}")

## Retrieval Score for RAG: 60.0
## Retrieval Score for uncompiled Baleen: 60.0
## Retrieval Score for compiled Baleen: 60.0
