In [6]:
import dspy
from decouple import config
from typing import Literal, List

OPENROUTER_API_KEY = config("OPENROUTER_API_KEY")

In [7]:
lm = dspy.LM("openrouter/qwen/qwen-2.5-72b-instruct", api_key=OPENROUTER_API_KEY, api_base="https://openrouter.ai/api/v1",)
dspy.configure(lm=lm)

In [8]:
class Answer(dspy.Signature):
    """Answer a question based on the provided documents."""

    question: str = dspy.InputField()
    context: str = dspy.InputField()
    correct_answers: List[str] = dspy.OutputField()
    explanation: str = dspy.OutputField()

In [9]:
answer = dspy.Predict(Answer)

### Processing data into dspy examples

In [10]:
import ujson

file_path = "/home/darth/Documents/code/ramdocs/RAMDocs/src/data/split_data/RAMDocs_test_train.jsonl"

data = []
with open(file_path) as f:
    for line in f:
        line = ujson.loads(line)
        line_data = {
            'question': line['question'],
            'context': "\n".join(["Document " + str(i+1) + ": " + doc['text'] for i, doc in enumerate(line['documents'])]),
            "documents": line['documents'],
            "disambig_entity": line['disambig_entity'],
            "gold_answers": line['gold_answers'],
            "wrong_answers": line['wrong_answers']
        }
        data.append(line_data)
    # data = [ujson.loads(line) for line in f]

In [11]:
data[0]

{'question': 'What is the profession of C. Mayer?',
 'context': 'Document 1: Christa Mayer Christa Mayer is a German operatic mezzo-soprano. She is particularly known for her portrayal of Erda in Richard Wagner\'s "Ring Cycle"; a role which she has performed several times at the Bayreuth Festival and recorded on the BBC Legends Record Label. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from the Bavarian Academy of Singing, she pursued further studies at the Munich Academy of Music where she was a pupil of tenor Thomas Moser. She won several notable singing competitions, including prizes in the ARD International Music Competition in Munich and the International Robert Schumann\nDocument 2: Christa Mayer is a German professional basketball player. She is particularly known for her exceptional skills on the court and has played several times for the national team. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from 

In [12]:
data = [dspy.Example(**d).with_inputs('question', 'context') for d in data]

# Let's pick an `example` here from the data.
example = data[2]
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [13]:
import random

random.Random(0).shuffle(data)
trainset, devset = data[:200], data[200:1000]

len(trainset), len(devset)

(200, 50)

### Baseline evaluation

In [14]:
pred = answer(**example.inputs())

In [15]:
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [16]:
pred

Prediction(
    correct_answers=['1,234'],
    explanation='The population of Sandusky Township, Crawford County, Ohio, as of the 2020 census, was 1,234. This information is provided in Document 2 and Document 3, with Document 3 confirming the population figure from the 2010 census as well.'
)

In [17]:
import unicodedata
import re
import string

def normalize_text(s):
    s = unicodedata.normalize("NFD", s)

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def evaluation_metric(example, pred):
    correct_count = 0
    llm_answers = [normalize_text(answer) for answer in pred.correct_answers]
    gold_answers = [normalize_text(answer) for answer in example.gold_answers]
    wrong_answers = [normalize_text(answer) for answer in example.wrong_answers]
    
    # Check if llm_answers contains all gold answers and no wrong answers
    all_gold_included = all(gold in llm_answers for gold in gold_answers)
    no_wrong_included = all(wrong not in llm_answers for wrong in wrong_answers)
    
    if all_gold_included and no_wrong_included:
        score = 1
    else:
        score = 0
    return score

evaluation_metric(example, pred)

0

In [18]:
evaluate = dspy.Evaluate(devset=devset, metric=evaluation_metric, num_threads=24,
                         display_progress=True, display_table=2)

evaluate(answer)

Average Metric: 8.00 / 50 (16.0%): 100%|██████████| 50/50 [00:18<00:00,  2.75it/s]

2025/06/29 22:09:46 INFO dspy.evaluate.evaluate: Average Metric: 8 / 50 (16.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,correct_answers,explanation,evaluation_metric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]","[Boys, Girls]",The gender composition of Wirral Grammar School includes both boys...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"[December 14, 1846]","The correct answer is December 14, 1846, as stated in Document 1, ...",


16.0

### Optimizer

In [19]:
tp = dspy.MIPROv2(metric=evaluation_metric, auto="medium", num_threads=24)  # use fewer threads if your rate limit is small

optimized_answer = tp.compile(answer, trainset=trainset,
                           max_bootstrapped_demos=2, max_labeled_demos=2,
                           requires_permission_to_run=False)

2025/06/29 22:36:57 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: True
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 160

2025/06/29 22:36:57 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/06/29 22:36:57 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/06/29 22:36:57 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


  0%|          | 0/40 [00:00<?, ?it/s]2025/06/29 22:37:01 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'question': 'Who directed the film "The Charge of the Light Brigade"?', 'context': 'Document 1: The Charge of the Light Brigade (1912 film) The Charge of the Light Brigade is a 1912 American short war film directed by J. Searle Dawley and starring James Gordon, Richard Neill and Charles Sutton. It portrays the Charge of the Light Brigade when a brigade of British light cavalry charged entrenched batteries of Russian artillery at the Battle of Balaklava in 1854 during the Crimean War. The plot follows that of Tennyson\'s poem "The Charge of the Light Brigade". The film was made by Edison Studios and shot in Wyoming using 800 American cavalry troopers to play the parts\nDocument 2: Lord Tennyson\'s poem was the basis for a 1903 Biograph film and a 1912 Edison film directed by Steven Spielberg and starring Ben Wilson and Richard Neill. In 1968, director 

Error getting source code: unhashable type: 'dict'.

Running without program aware proposer.


2025/06/29 22:38:47 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...

2025/06/29 22:39:02 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/06/29 22:39:02 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Answer a question based on the provided documents.

2025/06/29 22:39:02 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given the provided documents, answer the question by carefully considering the context and details. Ensure your response is factually accurate and clearly disambiguated if necessary.

2025/06/29 22:39:02 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Given the provided documents, answer the question accurately, taking into account any disambiguation required. Ensure your answer is based on the factual information presented in the documents and avoid including any incorrect details.

2025/06/29 22:39:02 INFO dspy.teleprompt.mipro_optimizer_v2: 3: You are a historical fact checker for a prestigious academic journal. Your ta

Average Metric: 27.00 / 160 (16.9%): 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]

2025/06/29 22:39:35 INFO dspy.evaluate.evaluate: Average Metric: 27 / 160 (16.9%)
2025/06/29 22:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 16.88






2025/06/29 22:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 23 - Minibatch ==


Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:11<00:00,  3.03it/s]

2025/06/29 22:39:49 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/06/29 22:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/06/29 22:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29]
2025/06/29 22:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88]
2025/06/29 22:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 23 - Minibatch ==



Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:11<00:00,  3.10it/s]

2025/06/29 22:40:00 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5'].
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57]
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88]
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:00<00:00, 1359.93it/s]

2025/06/29 22:40:00 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0'].
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0]
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88]
2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:09<00:00,  3.81it/s]

2025/06/29 22:40:10 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/29 22:40:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4'].
2025/06/29 22:40:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0]
2025/06/29 22:40:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88]
2025/06/29 22:40:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:08<00:00,  4.29it/s]

2025/06/29 22:40:18 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2'].
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0]
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88]
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 23 - Full Evaluation =====
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 20.0) from minibatch trials...



Average Metric: 27.00 / 160 (16.9%): 100%|██████████| 160/160 [00:00<00:00, 1493.68it/s]

2025/06/29 22:40:18 INFO dspy.evaluate.evaluate: Average Metric: 27 / 160 (16.9%)
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88]
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88
2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/29 22:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 23 - Minibatch ==



Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:08<00:00,  4.28it/s]

2025/06/29 22:40:27 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2'].
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29]
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88]
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:00<00:00, 1656.01it/s]

2025/06/29 22:40:27 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0'].
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0]
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88]
2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 23 - Minibatch ==



Average Metric: 8.00 / 35 (22.9%): 100%|██████████| 35/35 [00:06<00:00,  5.38it/s]

2025/06/29 22:40:33 INFO dspy.evaluate.evaluate: Average Metric: 8 / 35 (22.9%)
2025/06/29 22:40:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2'].
2025/06/29 22:40:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86]
2025/06/29 22:40:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88]
2025/06/29 22:40:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:14<00:00,  2.48it/s]

2025/06/29 22:40:48 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/06/29 22:40:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3'].
2025/06/29 22:40:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14]
2025/06/29 22:40:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88]
2025/06/29 22:40:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 23 - Minibatch ==



Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:07<00:00,  4.58it/s]  

2025/06/29 22:40:56 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/06/29 22:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2'].
2025/06/29 22:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57]
2025/06/29 22:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88]
2025/06/29 22:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.88


2025/06/29 22:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 23 - Full Evaluation =====
2025/06/29 22:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 20.0) from minibatch trials...



Average Metric: 30.00 / 160 (18.8%): 100%|██████████| 160/160 [00:25<00:00,  6.35it/s]

2025/06/29 22:41:21 INFO dspy.evaluate.evaluate: Average Metric: 30 / 160 (18.8%)
2025/06/29 22:41:21 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 18.75
2025/06/29 22:41:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75]
2025/06/29 22:41:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75
2025/06/29 22:41:21 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/29 22:41:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:12<00:00,  2.86it/s]

2025/06/29 22:41:33 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/06/29 22:41:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3'].
2025/06/29 22:41:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14]
2025/06/29 22:41:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75]
2025/06/29 22:41:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:41:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 23 - Minibatch ==



Average Metric: 10.00 / 35 (28.6%): 100%|██████████| 35/35 [00:08<00:00,  3.96it/s]

2025/06/29 22:41:42 INFO dspy.evaluate.evaluate: Average Metric: 10 / 35 (28.6%)
2025/06/29 22:41:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/06/29 22:41:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14, 28.57]
2025/06/29 22:41:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75]
2025/06/29 22:41:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:41:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 23 - Minibatch ==



Average Metric: 10.00 / 35 (28.6%): 100%|██████████| 35/35 [00:10<00:00,  3.38it/s]

2025/06/29 22:41:53 INFO dspy.evaluate.evaluate: Average Metric: 10 / 35 (28.6%)
2025/06/29 22:41:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/06/29 22:41:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14, 28.57, 28.57]
2025/06/29 22:41:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75]
2025/06/29 22:41:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:41:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:06<00:00,  5.66it/s] 

2025/06/29 22:41:59 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/29 22:41:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/06/29 22:41:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14, 28.57, 28.57, 20.0]
2025/06/29 22:41:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75]
2025/06/29 22:41:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:41:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:06<00:00,  5.02it/s] 

2025/06/29 22:42:06 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/29 22:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/06/29 22:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14, 28.57, 28.57, 20.0, 20.0]
2025/06/29 22:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75]
2025/06/29 22:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 23 - Full Evaluation =====
2025/06/29 22:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 22.286) from minibatch trials...



Average Metric: 28.00 / 160 (17.5%): 100%|██████████| 160/160 [00:16<00:00,  9.60it/s]

2025/06/29 22:42:23 INFO dspy.evaluate.evaluate: Average Metric: 28 / 160 (17.5%)
2025/06/29 22:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75, 17.5]
2025/06/29 22:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75
2025/06/29 22:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/29 22:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:00<00:00, 1447.65it/s]

2025/06/29 22:42:24 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/29 22:42:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/06/29 22:42:24 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14, 28.57, 28.57, 20.0, 20.0, 20.0]
2025/06/29 22:42:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75, 17.5]
2025/06/29 22:42:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:42:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 23 - Minibatch ==



Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:09<00:00,  3.75it/s]

2025/06/29 22:42:33 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/06/29 22:42:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5'].
2025/06/29 22:42:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14, 28.57, 28.57, 20.0, 20.0, 20.0, 14.29]
2025/06/29 22:42:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75, 17.5]
2025/06/29 22:42:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:42:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 23 - Minibatch ==



Average Metric: 9.00 / 35 (25.7%): 100%|██████████| 35/35 [00:10<00:00,  3.31it/s] 

2025/06/29 22:42:44 INFO dspy.evaluate.evaluate: Average Metric: 9 / 35 (25.7%)
2025/06/29 22:42:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 25.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2'].
2025/06/29 22:42:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 8.57, 20.0, 20.0, 20.0, 14.29, 20.0, 22.86, 17.14, 8.57, 17.14, 28.57, 28.57, 20.0, 20.0, 20.0, 14.29, 25.71]
2025/06/29 22:42:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75, 17.5]
2025/06/29 22:42:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75


2025/06/29 22:42:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 23 - Full Evaluation =====
2025/06/29 22:42:44 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 18.286) from minibatch trials...



Average Metric: 29.00 / 160 (18.1%): 100%|██████████| 160/160 [00:12<00:00, 12.80it/s] 

2025/06/29 22:42:56 INFO dspy.evaluate.evaluate: Average Metric: 29 / 160 (18.1%)
2025/06/29 22:42:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.88, 16.88, 18.75, 17.5, 18.12]
2025/06/29 22:42:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 18.75
2025/06/29 22:42:56 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/29 22:42:56 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 18.75!





In [20]:
evaluate(optimized_answer)

Average Metric: 9.00 / 50 (18.0%): 100%|██████████| 50/50 [00:12<00:00,  3.86it/s]

2025/06/29 22:47:43 INFO dspy.evaluate.evaluate: Average Metric: 9 / 50 (18.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,correct_answers,explanation,evaluation_metric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]","[Boys, Girls]",The gender composition of Wirral Grammar School is split into two ...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"[December 14, 1846, June 4, 1873, January 10, 1901]",There are three different individuals named Harry Harvey mentioned...,✔️ [1]


18.0