In [1]:
import dspy
from decouple import config
from typing import Literal, List

# Load the OpenRouter API key through python-decouple's `config` instead of
# hard-coding the secret in the notebook.
OPENROUTER_API_KEY = config("OPENROUTER_API_KEY")

In [2]:
# Language model used by every DSPy call in this notebook:
# Qwen 2.5 72B Instruct, reached through the OpenRouter endpoint.
lm = dspy.LM(
    "openrouter/qwen/qwen-2.5-72b-instruct",
    api_base="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

# Register it as the default LM for all DSPy modules.
dspy.configure(lm=lm)

In [3]:
# DSPy signature for the QA task: a question plus the concatenated documents go
# in; a list of answers and a free-text explanation come out.
# NOTE: the class docstring is runtime text, not just documentation. It appears
# verbatim as candidate instruction 0 in the MIPROv2 log, so edit it with care.
class Answer(dspy.Signature):
    """Answer a question based on the provided documents."""

    question: str = dspy.InputField()
    # Numbered "Document N: ..." texts joined by newlines (built in the data-loading cell).
    context: str = dspy.InputField()
    # Every answer the model considers correct; the evaluation metric compares
    # this list against the example's gold_answers / wrong_answers.
    correct_answers: List[str] = dspy.OutputField()
    explanation: str = dspy.OutputField()

In [4]:
# Baseline program: the Answer signature wrapped in ChainOfThought. Predictions
# from it carry a `reasoning` field in addition to the signature's outputs
# (see the Prediction printed further down).
cot = dspy.ChainOfThought(Answer)

### Processing data into DSPy examples

In [5]:
import os

import ujson

# Path to the RAMDocs JSONL split. Set the RAMDOCS_DATA_PATH environment
# variable to point at your own copy; the fallback is the original local path,
# so the notebook keeps working unchanged on the author's machine.
file_path = os.environ.get(
    "RAMDOCS_DATA_PATH",
    "/home/darth/Documents/code/ramdocs/RAMDocs/src/data/split_data/RAMDocs_test_train.jsonl",
)

data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(file_path, encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # Tolerate blank or trailing empty lines instead of failing in the JSON parser.
            continue
        record = ujson.loads(raw_line)
        # Flatten the documents into one prompt string: "Document 1: ...", one per line.
        context = "\n".join(
            f"Document {doc_number}: {doc['text']}"
            for doc_number, doc in enumerate(record["documents"], start=1)
        )
        data.append(
            {
                "question": record["question"],
                "context": context,
                # Raw fields are kept alongside the prompt inputs; the metric
                # reads gold_answers and wrong_answers from them.
                "documents": record["documents"],
                "disambig_entity": record["disambig_entity"],
                "gold_answers": record["gold_answers"],
                "wrong_answers": record["wrong_answers"],
            }
        )

In [6]:
# Peek at the first parsed record to sanity-check the fields built above.
data[0]

{'question': 'What is the profession of C. Mayer?',
 'context': 'Document 1: Christa Mayer Christa Mayer is a German operatic mezzo-soprano. She is particularly known for her portrayal of Erda in Richard Wagner\'s "Ring Cycle"; a role which she has performed several times at the Bayreuth Festival and recorded on the BBC Legends Record Label. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from the Bavarian Academy of Singing, she pursued further studies at the Munich Academy of Music where she was a pupil of tenor Thomas Moser. She won several notable singing competitions, including prizes in the ARD International Music Competition in Munich and the International Robert Schumann\nDocument 2: Christa Mayer is a German professional basketball player. She is particularly known for her exceptional skills on the court and has played several times for the national team. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from 

In [7]:
# Wrap every parsed record in a dspy.Example and mark `question` and `context`
# as the input fields; the remaining keys stay on the example for the metric.
data = [
    dspy.Example(**record).with_inputs("question", "context")
    for record in data
]

# Keep one example around for the manual spot checks further down.
# (It is picked before the shuffle/split cell runs.)
example = data[2]
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [8]:
import random

# Shuffle a COPY with a fixed seed. The split is reproducible and the cell is
# idempotent: re-running it no longer reshuffles an already-shuffled `data`,
# which used to produce a different train/dev split on every re-execution.
shuffled = list(data)
random.Random(0).shuffle(shuffled)

# First 200 examples feed the optimizer; everything after that (capped at
# index 1000) is held out as the dev set.
trainset, devset = shuffled[:200], shuffled[200:1000]

len(trainset), len(devset)

(200, 50)

### Baseline evaluation

In [9]:
# Run the un-optimized program on the hand-picked example, passing only the
# fields that were marked as inputs (question and context).
pred = cot(**example.inputs())

In [10]:
# Show the example that was just fed to the program.
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [11]:
# Show the program's output for that example.
pred

Prediction(
    reasoning='The population of Sandusky Township, Ohio, is provided in two different documents. Document 1 states the population was 459 as of the 2010 census, while Document 3 and Document 2 both indicate the population was 1,234 as of the 2020 census. The most recent and accurate population figure is 1,234 from the 2020 census.',
    correct_answers=['1,234'],
    explanation='The population of Sandusky Township, Ohio, as of the 2020 census, is 1,234. This figure is the most recent and accurate based on the provided documents.'
)

In [12]:
import unicodedata
import re
import string
from dspy.primitives import Module

class EvaluationMetric(Module):
    """Strict exact-match metric for multi-answer question answering.

    A prediction scores 1 only when, after text normalization, its
    ``correct_answers`` contain every entry of ``example.gold_answers`` and
    none of ``example.wrong_answers``; otherwise it scores 0.
    """

    # Compiled once instead of on every normalize_text call.
    _ARTICLES = re.compile(r"\b(a|an|the)\b")
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        # Run the base Module initialisation instead of silently skipping it.
        super().__init__()

    def normalize_text(self, s):
        """Return a canonical form of ``s`` for exact-match comparison.

        Steps, in order: NFD Unicode normalization, lowercasing, removal of
        ASCII punctuation, removal of the articles a/an/the, and collapsing of
        whitespace runs. Non-string values (e.g. numeric answers) are
        converted with ``str`` first instead of raising ``TypeError``.
        """
        text = unicodedata.normalize("NFD", str(s)).lower()
        text = "".join(ch for ch in text if ch not in self._PUNCTUATION)
        text = self._ARTICLES.sub(" ", text)
        return " ".join(text.split())

    def _normalized_set(self, answers):
        """Normalize a collection of answers into a set.

        ``None`` is treated as "no answers". A bare string is treated as a
        single answer rather than being iterated character by character.
        """
        if answers is None:
            return set()
        if isinstance(answers, str):
            answers = [answers]
        return {self.normalize_text(answer) for answer in answers}

    def forward(self, example, pred, trace=None):
        """Score one prediction against one example.

        Args:
            example: object exposing ``gold_answers`` and ``wrong_answers``.
            pred: object exposing ``correct_answers`` (the model's answer list).
            trace: accepted for compatibility with DSPy's metric call
                convention; not used.

        Returns:
            1 if all gold answers are present and no wrong answer is present
            in the prediction, else 0.
        """
        llm_answers = self._normalized_set(getattr(pred, "correct_answers", None))
        gold_answers = self._normalized_set(example.gold_answers)
        wrong_answers = self._normalized_set(example.wrong_answers)

        # Subset test == "every gold answer was given";
        # disjointness == "no known-wrong answer was given".
        all_gold_included = gold_answers <= llm_answers
        no_wrong_included = llm_answers.isdisjoint(wrong_answers)
        return int(all_gold_included and no_wrong_included)


# Single metric instance; the same object is passed to dspy.Evaluate and to
# MIPROv2 in the cells below.
evaluation_metric = EvaluationMetric()

# Spot check on the hand-picked example: in the recorded run the prediction
# ['1,234'] scores 0.
evaluation_metric(example, pred)

0

In [18]:
# Dev-set evaluator, reused for both the baseline and the optimized program.
evaluate = dspy.Evaluate(
    devset=devset,
    metric=evaluation_metric,
    num_threads=24,
    display_progress=True,
    display_table=True,
)

# Baseline score of the un-optimized chain-of-thought program.
evaluate(cot)

Average Metric: 9.00 / 50 (18.0%): 100%|██████████| 50/50 [00:00<00:00, 1270.19it/s]

2025/06/30 01:32:15 INFO dspy.evaluate.evaluate: Average Metric: 9 / 50 (18.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,reasoning,correct_answers,explanation,EvaluationMetric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]",The context provided includes information about two schools: Wirra...,"[Boys and Girls, but in separate schools]",Wirral Grammar School consists of two separate institutions: Wirra...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"The question asks for the birth date of Harry Harvey. However, the...","[December 14, 1846, June 4, 1873, January 10, 1901]",There are three different Harry Harveys mentioned in the documents...,✔️ [1]
2,What is the location of the Central Square Historic District?,"Document 1: Central Square Historic District (Bristol, New Hampshi...","[{'text': 'Central Square Historic District (Bristol, New Hampshir...","[Central Square Historic District (Bristol, New Hampshire), Centra...","[Bristol, New Hampshire, Stoneham, Massachusetts, Waltham, Massach...",[],The question asks for the location of the Central Square Historic ...,"[Bristol, New Hampshire, Stoneham, Massachusetts]","Both Bristol, New Hampshire, and Stoneham, Massachusetts, have a C...",
3,What type of aircraft was involved in Flight 109?,"Document 1: Aeroflot Flight 109 Aeroflot Flight 109 ( ""Reys 109 Ae...","[{'text': 'Aeroflot Flight 109 Aeroflot Flight 109 ( ""Reys 109 Aer...","[Aeroflot Flight 109, Sudan Airways Flight 109]","[Tupolev Tu-104A, Airbus A310-324]",[],The question asks about the type of aircraft involved in Flight 10...,[Tupolev Tu-104A],The aircraft involved in Aeroflot Flight 109 was a Tupolev Tu-104A...,
4,When was Princess Elisabeth of Thurn and Taxis born?,Document 1: Princess Elisabeth Helene of Thurn and Taxis Princess ...,[{'text': 'Princess Elisabeth Helene of Thurn and Taxis Princess E...,"[Princess Elisabeth Helene of Thurn and Taxis, Princess Elisabeth ...","[15 December 1903, 24 March 1982, 28 May 1860]",[],The question asks for the birth date of Princess Elisabeth of Thur...,[15 December 1903],"The correct birth date of Princess Elisabeth of Thurn and Taxis, a...",
5,What is the length of the Moravica river?,Document 1: Golijska Moravica The Golijska Moravica or simply Mora...,[{'text': 'Golijska Moravica The Golijska Moravica or simply Morav...,[Golijska Moravica],[98 km],[],The length of the Moravica river is consistently reported as 98 km...,[98 km],"The Moravica river, also known as Golijska Moravica, has a length ...",✔️ [1]
6,When did Akarin debut?,Document 1: Akari Suda Suda passed SKE48's 3rd generation audition...,[{'text': 'Akari Suda Suda passed SKE48\'s 3rd generation audition...,"[Akari Suda, Akari Yoshida, Akari Hayami]","[8 December 2009, October 9, 2010, 2008]",[15 January 2010],The context provided in Document 1 and Document 2 both mention Aka...,[15 January 2010],"The correct debut date for Akarin (Akari Suda) is 15 January 2010,...",
7,When was Joseph Lafontaine born?,Document 1: Joseph Lafontaine (Berthier MLA) Joseph Lafontaine (No...,"[{'text': ""Joseph Lafontaine (Berthier MLA) Joseph Lafontaine (Nov...",[Joseph Lafontaine (Berthier MLA)],"[November 25, 1865]","[March 15, 1870]",The birth date of Joseph Lafontaine is mentioned in both Document ...,"[November 25, 1865, March 15, 1870]",Both documents provide different birth dates for Joseph Lafontaine...,
8,What is the founding date of the Revolutionary Left Movement?,Document 1: Revolutionary Left Movement (Bolivia) The Revolutionar...,[{'text': 'Revolutionary Left Movement (Bolivia) The Revolutionary...,"[Revolutionary Left Movement (Bolivia), Revolutionary Left Movemen...","[1971, 1962, 12 October 1965]",[],The question asks for the founding date of the Revolutionary Left ...,[7 September 1971],The founding date of the Revolutionary Left Movement (MIR) in Boli...,
9,What is the location of Mooretown?,"Document 1: Moore Town, Jamaica Moore Town is a Maroon settlement ...","[{'text': 'Moore Town, Jamaica Moore Town is a Maroon settlement l...","[Moore Town, Jamaica, Mooretown Rancheria of Maidu Indians]","[Portland, Jamaica, Butte County, California]",[],The question asks for the location of Mooretown. From the provided...,"[Moore Town, Jamaica, Mooretown Rancheria, California]",There are two locations named Mooretown mentioned in the context. ...,


18.0

### Optimizer

In [14]:
# MIPROv2 bootstraps few-shot demo sets and proposes instruction candidates,
# then searches their combinations using the same metric as the evaluation.
tp = dspy.MIPROv2(
    metric=evaluation_metric,
    auto="medium",
    num_threads=24,  # use fewer threads if your rate limit is small
)

optimized_answer = tp.compile(
    cot,
    trainset=trainset,
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
    requires_permission_to_run=False,
)

2025/06/30 00:56:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: True
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 160

2025/06/30 00:56:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/06/30 00:56:15 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/06/30 00:56:15 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


  from .autonotebook import tqdm as notebook_tqdm
 35%|███▌      | 14/40 [02:12<04:05,  9.44s/it]


Bootstrapped 2 full traces after 14 examples for up to 1 rounds, amounting to 14 attempts.
Bootstrapping set 4/12


  5%|▌         | 2/40 [00:13<04:23,  6.93s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/12


 12%|█▎        | 5/40 [00:13<01:36,  2.76s/it]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 6/12


  2%|▎         | 1/40 [00:00<00:00, 373.79it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 7/12


  5%|▌         | 2/40 [00:15<04:55,  7.77s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 8/12


  2%|▎         | 1/40 [00:05<03:32,  5.44s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 9/12


 10%|█         | 4/40 [00:12<01:48,  3.01s/it]


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 10/12


  8%|▊         | 3/40 [00:13<02:46,  4.51s/it]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 11/12


  5%|▌         | 2/40 [00:11<03:33,  5.63s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 12/12


  2%|▎         | 1/40 [00:00<00:00, 520.64it/s]
2025/06/30 00:59:53 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/06/30 00:59:53 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/06/30 00:59:53 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Error getting source code: unhashable type: 'dict'.

Running without program aware proposer.


2025/06/30 01:00:07 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/06/30 01:00:07 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Answer a question based on the provided documents.

2025/06/30 01:00:07 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are a journalist on a tight deadline writing a critical article about a famous sports figure or artist. Your editor demands accuracy, and any incorrect information could lead to a retraction and damage your reputation. Answer the following question based on the provided documents, ensuring that you carefully consider all the information and potential disambiguations.

2025/06/30 01:00:07 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a knowledgeable researcher. Answer the question based on the provided documents, ensuring that your response is accurate and well-reasoned.

2025/06/30 01:00:07 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Answer the question using the provided context, ensuring the answer

Average Metric: 25.00 / 160 (15.6%): 100%|██████████| 160/160 [01:07<00:00,  2.38it/s]

2025/06/30 01:01:14 INFO dspy.evaluate.evaluate: Average Metric: 25 / 160 (15.6%)
2025/06/30 01:01:14 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 15.62

2025/06/30 01:01:14 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 23 - Minibatch ==



Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:18<00:00,  1.93it/s]

2025/06/30 01:01:33 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/06/30 01:01:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/06/30 01:01:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29]
2025/06/30 01:01:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/06/30 01:01:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:01:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:19<00:00,  1.77it/s]

2025/06/30 01:01:53 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/06/30 01:01:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 2'].
2025/06/30 01:01:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14]
2025/06/30 01:01:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/06/30 01:01:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:01:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 23 - Minibatch ==



Average Metric: 8.00 / 35 (22.9%): 100%|██████████| 35/35 [00:18<00:00,  1.85it/s]

2025/06/30 01:02:12 INFO dspy.evaluate.evaluate: Average Metric: 8 / 35 (22.9%)
2025/06/30 01:02:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/06/30 01:02:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86]
2025/06/30 01:02:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/06/30 01:02:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:02:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:18<00:00,  1.89it/s]

2025/06/30 01:02:30 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/30 01:02:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/06/30 01:02:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0]
2025/06/30 01:02:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/06/30 01:02:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:02:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 23 - Minibatch ==



Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:19<00:00,  1.79it/s] 

2025/06/30 01:02:50 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/06/30 01:02:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2025/06/30 01:02:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57]
2025/06/30 01:02:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/06/30 01:02:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:02:50 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 23 - Full Evaluation =====
2025/06/30 01:02:50 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 22.86) from minibatch trials...



Average Metric: 20.00 / 160 (12.5%): 100%|██████████| 160/160 [00:47<00:00,  3.37it/s]

2025/06/30 01:03:38 INFO dspy.evaluate.evaluate: Average Metric: 20 / 160 (12.5%)
2025/06/30 01:03:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/06/30 01:03:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62
2025/06/30 01:03:38 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/30 01:03:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:16<00:00,  2.11it/s]

2025/06/30 01:03:54 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/30 01:03:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 6'].
2025/06/30 01:03:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0]
2025/06/30 01:03:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/06/30 01:03:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:03:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:13<00:00,  2.55it/s]

2025/06/30 01:04:08 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)





2025/06/30 01:04:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1'].
2025/06/30 01:04:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14]
2025/06/30 01:04:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/06/30 01:04:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:04:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 23 - Minibatch ==


Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:15<00:00,  2.20it/s] 

2025/06/30 01:04:24 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/06/30 01:04:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/06/30 01:04:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57]
2025/06/30 01:04:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/06/30 01:04:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:04:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:15<00:00,  2.26it/s]

2025/06/30 01:04:40 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/06/30 01:04:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9'].
2025/06/30 01:04:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14]
2025/06/30 01:04:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/06/30 01:04:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:04:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:16<00:00,  2.06it/s]

2025/06/30 01:04:57 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)





2025/06/30 01:04:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/06/30 01:04:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14]
2025/06/30 01:04:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/06/30 01:04:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:04:57 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 23 - Full Evaluation =====
2025/06/30 01:04:57 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 20.0) from minibatch trials...


Average Metric: 24.00 / 160 (15.0%): 100%|██████████| 160/160 [00:44<00:00,  3.61it/s]

2025/06/30 01:05:42 INFO dspy.evaluate.evaluate: Average Metric: 24 / 160 (15.0%)
2025/06/30 01:05:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/06/30 01:05:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62
2025/06/30 01:05:42 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/30 01:05:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 23 - Minibatch ==



Average Metric: 11.00 / 35 (31.4%): 100%|██████████| 35/35 [00:23<00:00,  1.52it/s]

2025/06/30 01:06:05 INFO dspy.evaluate.evaluate: Average Metric: 11 / 35 (31.4%)
2025/06/30 01:06:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 31.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/06/30 01:06:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43]
2025/06/30 01:06:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/06/30 01:06:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:06:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:22<00:00,  1.55it/s]

2025/06/30 01:06:28 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/06/30 01:06:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 5'].
2025/06/30 01:06:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0]
2025/06/30 01:06:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/06/30 01:06:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:06:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 23 - Minibatch ==



Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:16<00:00,  2.08it/s]

2025/06/30 01:06:45 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29]
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 23 - Minibatch ==



Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:00<00:00, 1615.73it/s]

2025/06/30 01:06:45 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29]
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:06:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 23 - Minibatch ==



Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:15<00:00,  2.22it/s] 

2025/06/30 01:07:01 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/06/30 01:07:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 8'].
2025/06/30 01:07:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57]
2025/06/30 01:07:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/06/30 01:07:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:07:01 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 23 - Full Evaluation =====
2025/06/30 01:07:01 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 22.86) from minibatch trials...



Average Metric: 21.00 / 160 (13.1%): 100%|██████████| 160/160 [00:37<00:00,  4.23it/s]

2025/06/30 01:07:39 INFO dspy.evaluate.evaluate: Average Metric: 21 / 160 (13.1%)
2025/06/30 01:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/06/30 01:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62
2025/06/30 01:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/30 01:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 23 - Minibatch ==



Average Metric: 1.00 / 35 (2.9%): 100%|██████████| 35/35 [00:22<00:00,  1.54it/s]

2025/06/30 01:08:02 INFO dspy.evaluate.evaluate: Average Metric: 1 / 35 (2.9%)
2025/06/30 01:08:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 2.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 7'].
2025/06/30 01:08:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57, 2.86]
2025/06/30 01:08:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/06/30 01:08:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:08:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 23 - Minibatch ==



Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:28<00:00,  1.23it/s] 

2025/06/30 01:08:30 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/06/30 01:08:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/06/30 01:08:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57, 2.86, 8.57]
2025/06/30 01:08:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/06/30 01:08:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:08:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 23 - Minibatch ==



Average Metric: 4.00 / 35 (11.4%): 100%|██████████| 35/35 [00:22<00:00,  1.55it/s]

2025/06/30 01:08:53 INFO dspy.evaluate.evaluate: Average Metric: 4 / 35 (11.4%)
2025/06/30 01:08:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 11.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 11'].
2025/06/30 01:08:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57, 2.86, 8.57, 11.43]
2025/06/30 01:08:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/06/30 01:08:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/06/30 01:08:53 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 23 - Full Evaluation =====
2025/06/30 01:08:53 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 20.0) from minibatch trials...



Average Metric: 22.00 / 160 (13.8%): 100%|██████████| 160/160 [00:51<00:00,  3.12it/s]

2025/06/30 01:09:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 160 (13.8%)
2025/06/30 01:09:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12, 13.75]
2025/06/30 01:09:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62
2025/06/30 01:09:44 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/30 01:09:44 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 15.62!





In [19]:
# Score the optimized program with the `evaluate` callable defined earlier in
# the notebook (presumably a dspy.Evaluate instance -- confirm against its
# definition). The call is left as the bare last expression so its return
# value is shown as the cell output.
# NOTE(review): this cell's execution count (19) is higher than the cells that
# follow it (16, 17), so the notebook was run out of order; re-run top to
# bottom on a fresh kernel to confirm the reported score.
evaluate(optimized_answer)

Average Metric: 9.00 / 50 (18.0%): 100%|██████████| 50/50 [00:00<00:00, 1512.88it/s]

2025/06/30 01:32:26 INFO dspy.evaluate.evaluate: Average Metric: 9 / 50 (18.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,reasoning,correct_answers,explanation,EvaluationMetric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]",The context provided includes information about two schools: Wirra...,"[Boys and Girls, but in separate schools]",Wirral Grammar School consists of two separate institutions: Wirra...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"The question asks for the birth date of Harry Harvey. However, the...","[December 14, 1846, June 4, 1873, January 10, 1901]",There are three different Harry Harveys mentioned in the documents...,✔️ [1]
2,What is the location of the Central Square Historic District?,"Document 1: Central Square Historic District (Bristol, New Hampshi...","[{'text': 'Central Square Historic District (Bristol, New Hampshir...","[Central Square Historic District (Bristol, New Hampshire), Centra...","[Bristol, New Hampshire, Stoneham, Massachusetts, Waltham, Massach...",[],The question asks for the location of the Central Square Historic ...,"[Bristol, New Hampshire, Stoneham, Massachusetts]","Both Bristol, New Hampshire, and Stoneham, Massachusetts, have a C...",
3,What type of aircraft was involved in Flight 109?,"Document 1: Aeroflot Flight 109 Aeroflot Flight 109 ( ""Reys 109 Ae...","[{'text': 'Aeroflot Flight 109 Aeroflot Flight 109 ( ""Reys 109 Aer...","[Aeroflot Flight 109, Sudan Airways Flight 109]","[Tupolev Tu-104A, Airbus A310-324]",[],The question asks about the type of aircraft involved in Flight 10...,[Tupolev Tu-104A],The aircraft involved in Aeroflot Flight 109 was a Tupolev Tu-104A...,
4,When was Princess Elisabeth of Thurn and Taxis born?,Document 1: Princess Elisabeth Helene of Thurn and Taxis Princess ...,[{'text': 'Princess Elisabeth Helene of Thurn and Taxis Princess E...,"[Princess Elisabeth Helene of Thurn and Taxis, Princess Elisabeth ...","[15 December 1903, 24 March 1982, 28 May 1860]",[],The question asks for the birth date of Princess Elisabeth of Thur...,[15 December 1903],"The correct birth date of Princess Elisabeth of Thurn and Taxis, a...",
5,What is the length of the Moravica river?,Document 1: Golijska Moravica The Golijska Moravica or simply Mora...,[{'text': 'Golijska Moravica The Golijska Moravica or simply Morav...,[Golijska Moravica],[98 km],[],The length of the Moravica river is consistently reported as 98 km...,[98 km],"The Moravica river, also known as Golijska Moravica, has a length ...",✔️ [1]
6,When did Akarin debut?,Document 1: Akari Suda Suda passed SKE48's 3rd generation audition...,[{'text': 'Akari Suda Suda passed SKE48\'s 3rd generation audition...,"[Akari Suda, Akari Yoshida, Akari Hayami]","[8 December 2009, October 9, 2010, 2008]",[15 January 2010],The context provided in Document 1 and Document 2 both mention Aka...,[15 January 2010],"The correct debut date for Akarin (Akari Suda) is 15 January 2010,...",
7,When was Joseph Lafontaine born?,Document 1: Joseph Lafontaine (Berthier MLA) Joseph Lafontaine (No...,"[{'text': ""Joseph Lafontaine (Berthier MLA) Joseph Lafontaine (Nov...",[Joseph Lafontaine (Berthier MLA)],"[November 25, 1865]","[March 15, 1870]",The birth date of Joseph Lafontaine is mentioned in both Document ...,"[November 25, 1865, March 15, 1870]",Both documents provide different birth dates for Joseph Lafontaine...,
8,What is the founding date of the Revolutionary Left Movement?,Document 1: Revolutionary Left Movement (Bolivia) The Revolutionar...,[{'text': 'Revolutionary Left Movement (Bolivia) The Revolutionary...,"[Revolutionary Left Movement (Bolivia), Revolutionary Left Movemen...","[1971, 1962, 12 October 1965]",[],The question asks for the founding date of the Revolutionary Left ...,[7 September 1971],The founding date of the Revolutionary Left Movement (MIR) in Boli...,
9,What is the location of Mooretown?,"Document 1: Moore Town, Jamaica Moore Town is a Maroon settlement ...","[{'text': 'Moore Town, Jamaica Moore Town is a Maroon settlement l...","[Moore Town, Jamaica, Mooretown Rancheria of Maidu Indians]","[Portland, Jamaica, Butte County, California]",[],The question asks for the location of Mooretown. From the provided...,"[Moore Town, Jamaica, Mooretown Rancheria, California]",There are two locations named Mooretown mentioned in the context. ...,


18.0

In [16]:
# Write the optimized program to disk via its save() method at the JSON path
# below.
# NOTE(review): the path is absolute and machine-specific; consider deriving
# it from a configurable base directory. Assumes the `saved_states` directory
# already exists -- TODO confirm.
optimized_answer.save("/home/darth/Documents/code/ramdocs/RAMDocs/src/dspy/saved_states/miro_v2_cot.json")

In [17]:
# Print the most recent call recorded on `lm` (n=1): the system message with
# the field layout and the user message that was sent, for manual inspection
# of the prompt.
# NOTE(review): debugging aid with very long output; consider clearing the
# output before sharing the notebook.
lm.inspect_history(n=1)





[2025-06-30T01:21:15.106917]

System message:

Your input fields are:
1. `question` (str): 
2. `context` (str):
Your output fields are:
1. `reasoning` (str): 
2. `correct_answers` (list[str]): 
3. `explanation` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## context ## ]]
{context}

[[ ## reasoning ## ]]
{reasoning}

[[ ## correct_answers ## ]]
{correct_answers}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## explanation ## ]]
{explanation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Answer a question based on the provided documents.


User message:

[[ ## question ## ]]
When was Cove Fort built?

[[ ## context ## ]]
Document 1: Cove Fort Cove Fort is a fort and historical site located in Millard County, Utah. It was founded in 1867 by Ira Hinckley (