In [1]:
import dspy
from decouple import config
from typing import Literal, List

# Pull the OpenRouter key through python-decouple's `config` lookup instead of
# writing the secret into the notebook itself.
OPENROUTER_API_KEY = config("OPENROUTER_API_KEY")

In [2]:
# Default language model for every dspy module in this notebook:
# Qwen 2.5 72B Instruct served through OpenRouter.
lm = dspy.LM(
    "openrouter/qwen/qwen-2.5-72b-instruct",
    api_key=OPENROUTER_API_KEY,
    api_base="https://openrouter.ai/api/v1",
)
dspy.configure(lm=lm)

# Alternative: self-hosted endpoint on RunPod.
# SECURITY: a literal API key used to be pasted here. Secrets must be loaded
# through `config(...)` (environment / .env) and never committed; the key that
# was previously exposed in this cell should be revoked and rotated.
# lm = dspy.LM("openai/qwen/qwen-2.5-72b-instruct",
#              api_base="https://api.runpod.ai/v2/kxktu8q9taf7nk/run",  # ensure this points to your port
#              api_key=config("RUNPOD_API_KEY"))
# dspy.configure(lm=lm)

In [3]:
from typing import Literal

# Minimal sentiment signature used only as a smoke test that the configured LM
# responds and that typed outputs are parsed.
class Classify(dspy.Signature):
    """Classify sentiment of a given sentence."""
    # NOTE(review): in dspy the signature docstring is presumably sent to the LM
    # as the task instruction, so treat it as runtime text rather than a comment.

    sentence: str = dspy.InputField()
    # The Literal type restricts the label to these three values.
    sentiment: Literal["positive", "negative", "neutral"] = dspy.OutputField()
    # Produced by the LM itself (an output field), not a calibrated probability.
    confidence: float = dspy.OutputField()

# Build a plain predictor for the signature and run it once on a mixed-sentiment
# sentence; the returned Prediction is displayed as the cell output.
classify = dspy.Predict(Classify)
classify(sentence="This book was super fun to read, though not the last chapter.")

Prediction(
    sentiment='positive',
    confidence=0.85
)

In [4]:
# Signature for the multi-document QA task evaluated in this notebook.
class Answer(dspy.Signature):
    """Answer a question based on the provided documents."""
    # NOTE: the docstring above is runtime text. It appears verbatim as the
    # objective in the prompt shown by `lm.inspect_history`, so editing it
    # changes what the model is asked to do.

    question: str = dspy.InputField()
    # Single string holding all retrieved documents, formatted as
    # "Document i: <text>" lines by the data-loading cell.
    context: str = dspy.InputField()
    # List of answers; the metric compares it against gold_answers / wrong_answers.
    correct_answers: List[str] = dspy.OutputField()
    # Free-text justification; not used by the metric.
    explanation: str = dspy.OutputField()

In [5]:
# Baseline program: chain-of-thought over the Answer signature. The predictions
# shown later carry an extra `reasoning` field ahead of the signature's outputs.
answer = dspy.ChainOfThought(Answer)

### Processing data into dspy examples

In [6]:
import ujson

# Location of the RAMDocs split (JSON Lines, one question per line).
# It can be overridden with a RAMDOCS_DATA_PATH setting (environment variable or
# .env, read through python-decouple) so the notebook is not tied to a single
# machine; the default is the original hard-coded location.
file_path = config(
    "RAMDOCS_DATA_PATH",
    default="/home/darth/Documents/code/ramdocs/RAMDocs/src/data/split_data/RAMDocs_test_train.jsonl",
)

data = []
# Explicit UTF-8: the documents contain non-ASCII characters, and the platform
# default encoding is not guaranteed to decode them.
with open(file_path, encoding="utf-8") as f:
    for raw_line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing inside the JSON parser.
        if not raw_line.strip():
            continue
        record = ujson.loads(raw_line)
        data.append({
            'question': record['question'],
            # Flatten the retrieved documents into one prompt-friendly string,
            # numbering them from 1: "Document 1: ...\nDocument 2: ...".
            'context': "\n".join(
                f"Document {position}: {doc['text']}"
                for position, doc in enumerate(record['documents'], start=1)
            ),
            # The remaining fields are carried along untouched for evaluation.
            "documents": record['documents'],
            "disambig_entity": record['disambig_entity'],
            "gold_answers": record['gold_answers'],
            "wrong_answers": record['wrong_answers'],
        })

In [7]:
# Peek at the first parsed record to check the fields built by the loader.
data[0]

{'question': 'What is the profession of C. Mayer?',
 'context': 'Document 1: Christa Mayer Christa Mayer is a German operatic mezzo-soprano. She is particularly known for her portrayal of Erda in Richard Wagner\'s "Ring Cycle"; a role which she has performed several times at the Bayreuth Festival and recorded on the BBC Legends Record Label. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from the Bavarian Academy of Singing, she pursued further studies at the Munich Academy of Music where she was a pupil of tenor Thomas Moser. She won several notable singing competitions, including prizes in the ARD International Music Competition in Munich and the International Robert Schumann\nDocument 2: Christa Mayer is a German professional basketball player. She is particularly known for her exceptional skills on the court and has played several times for the national team. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from 

In [8]:
# Wrap every raw record in a dspy.Example. Only `question` and `context` are
# declared as inputs; the other keys stay on the example as labels/metadata.
data = list(
    map(lambda record: dspy.Example(**record).with_inputs('question', 'context'), data)
)

# Keep one fixed example around for the single-call experiments further down.
example = data[2]
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [9]:
import random

# Shuffle a *copy* with a fixed seed. Shuffling `data` in place made this cell
# non-idempotent: running it a second time reshuffled the already-shuffled list
# and silently produced a different train/dev split.
shuffled_data = list(data)
random.Random(0).shuffle(shuffled_data)

# First 200 examples train the optimizer; the rest (capped at index 1000) is the
# dev set. The slice simply yields whatever remains when fewer are available.
trainset, devset = shuffled_data[:200], shuffled_data[200:1000]

len(trainset), len(devset)

(200, 50)

### Baseline evaluation

In [10]:
# Run the baseline program on the fixed example, passing only its input fields
# (`question`, `context`) as keyword arguments.
pred = answer(**example.inputs())

In [11]:
# Show the example again so it can be read next to the prediction below.
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [12]:
# Baseline prediction for `example`: reasoning, answer list and explanation.
pred

Prediction(
    reasoning='The population of Sandusky Township, Ohio, is provided in two different documents. Document 1 states the population was 459 as of the 2010 census, while Document 3 and Document 2 both indicate the population was 1,234 as of the 2020 census. The most recent and accurate population figure is 1,234 from the 2020 census.',
    correct_answers=['1,234'],
    explanation='The population of Sandusky Township, Ohio, as of the 2020 census, is 1,234. This figure is the most recent and accurate based on the provided documents.'
)

In [13]:
import unicodedata
import re
import string
from dspy.primitives import Module

# def normalize_text(s):
#     s = unicodedata.normalize("NFD", s)

#     def remove_articles(text):
#         return re.sub(r"\b(a|an|the)\b", " ", text)

#     def white_space_fix(text):
#         return " ".join(text.split())

#     def remove_punc(text):
#         exclude = set(string.punctuation)
#         return "".join(ch for ch in text if ch not in exclude)

#     def lower(text):
#         return text.lower()

#     return white_space_fix(remove_articles(remove_punc(lower(s))))

# def evaluation_metric(example, pred):
#     llm_answers = [normalize_text(answer) for answer in pred.correct_answers]
#     gold_answers = [normalize_text(answer) for answer in example.gold_answers]
#     wrong_answers = [normalize_text(answer) for answer in example.wrong_answers]
    
#     # Check if llm_answers contains all gold answers and no wrong answers
#     all_gold_included = all(gold in llm_answers for gold in gold_answers)
#     no_wrong_included = all(wrong not in llm_answers for wrong in wrong_answers)
    
#     if all_gold_included and no_wrong_included:
#         score = 1
#     else:
#         score = 0
#     return score

class EvaluationMetric(Module):
    """Strict multi-answer metric for the document QA task.

    A prediction scores 1 only when, after text normalisation, it contains
    every gold answer and none of the known wrong answers; otherwise 0.
    """

    # Whole-word English articles dropped during normalisation (compiled once).
    _ARTICLES = re.compile(r"\b(a|an|the)\b")
    # Translation table that deletes every ASCII punctuation character.
    _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

    def __init__(self):
        # Run the base-class initialisation rather than skipping it, so the
        # instance has whatever state dspy's Module expects.
        super().__init__()

    def normalize_text(self, s):
        """Return a canonical form of ``s`` for exact-match comparison.

        Steps, in order: Unicode NFD normalisation, lower-casing, removal of
        ASCII punctuation, removal of the articles a/an/the, and collapsing of
        all whitespace runs to single spaces.

        Args:
            s: The answer string to normalise.

        Returns:
            The normalised string (possibly empty).
        """
        text = unicodedata.normalize("NFD", s).lower()
        text = text.translate(self._STRIP_PUNCTUATION)
        text = self._ARTICLES.sub(" ", text)
        return " ".join(text.split())

    def _normalized_set(self, answers):
        """Normalise a collection of answers into a set of comparable strings.

        ``None`` is treated as "no answers" and a bare string as a single
        answer; without this, a string would be iterated character by character
        and never match anything. Non-string items are converted with ``str``.
        """
        if answers is None:
            return set()
        if isinstance(answers, str):
            answers = [answers]
        return {self.normalize_text(str(answer)) for answer in answers}

    def forward(self, example, pred, trace=None):
        """Score one prediction against one example.

        Args:
            example: Object exposing ``gold_answers`` and ``wrong_answers``.
            pred: Object exposing ``correct_answers`` (the model's answer list).
                A missing or ``None`` value counts as an empty answer list.
            trace: Accepted for compatibility with the metric calling
                convention; not used.

        Returns:
            1 if all gold answers are present and no wrong answer is, else 0.
        """
        predicted = self._normalized_set(getattr(pred, "correct_answers", None))
        gold = self._normalized_set(example.gold_answers)
        wrong = self._normalized_set(example.wrong_answers)

        # Every gold answer must be predicted, and no wrong answer may be.
        all_gold_included = gold <= predicted
        no_wrong_included = predicted.isdisjoint(wrong)

        return 1 if (all_gold_included and no_wrong_included) else 0


# One shared metric instance; the same callable is handed to dspy.Evaluate and
# to the optimizer in later cells.
evaluation_metric = EvaluationMetric()

# Sanity check: score the single baseline prediction made above.
evaluation_metric(example, pred)

0

In [14]:
# Reusable evaluator over the dev set. `display_table=2` limits the printed
# result table to two rows; lower `num_threads` if the provider rate-limits.
evaluate = dspy.Evaluate(devset=devset, metric=evaluation_metric, num_threads=24,
                         display_progress=True, display_table=2)

# Baseline score of the un-optimized ChainOfThought program.
evaluate(answer)

Average Metric: 9.00 / 50 (18.0%): 100%|██████████| 50/50 [00:00<00:00, 1459.10it/s]

2025/07/03 23:29:38 INFO dspy.evaluate.evaluate: Average Metric: 9 / 50 (18.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,reasoning,correct_answers,explanation,EvaluationMetric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]",The context provided includes information about two schools: Wirra...,"[Boys and Girls, but in separate schools]",Wirral Grammar School consists of two separate institutions: Wirra...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"The question asks for the birth date of Harry Harvey. However, the...","[December 14, 1846, June 4, 1873, January 10, 1901]",There are three different Harry Harveys mentioned in the documents...,✔️ [1]


18.0

### Optimizer

In [15]:
# MIPROv2 prompt optimizer driven by the same metric as the baseline evaluation.
# `auto="medium"` selects the preset budget reported in the log output below.
tp = dspy.MIPROv2(metric=evaluation_metric, auto="medium", num_threads=24)  # use fewer threads if your rate limit is small

# Compile the baseline program on the training split, allowing at most two
# bootstrapped and two labeled demonstrations per prompt.
# NOTE(review): `requires_permission_to_run=False` presumably skips the
# interactive confirmation before LM calls are made -- confirm for the installed
# dspy version.
optimized_answer = tp.compile(answer, trainset=trainset,
                           max_bootstrapped_demos=2, max_labeled_demos=2,
                           requires_permission_to_run=False)

2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: True
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 160

2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


  from .autonotebook import tqdm as notebook_tqdm
 35%|███▌      | 14/40 [00:00<00:00, 31.90it/s]


Bootstrapped 2 full traces after 14 examples for up to 1 rounds, amounting to 14 attempts.
Bootstrapping set 4/12


  5%|▌         | 2/40 [00:00<00:00, 585.88it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/12


 12%|█▎        | 5/40 [00:00<00:00, 474.69it/s]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 6/12


  2%|▎         | 1/40 [00:00<00:00, 559.24it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 7/12


  5%|▌         | 2/40 [00:00<00:00, 424.37it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 8/12


  2%|▎         | 1/40 [00:00<00:00, 404.31it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 9/12


 10%|█         | 4/40 [00:00<00:00, 556.51it/s]


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 10/12


  8%|▊         | 3/40 [00:00<00:00, 552.80it/s]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 11/12


  5%|▌         | 2/40 [00:00<00:00, 496.10it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 12/12


  2%|▎         | 1/40 [00:00<00:00, 411.33it/s]
2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Error getting source code: unhashable type: 'dict'.

Running without program aware proposer.


2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Answer a question based on the provided documents.

2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are a journalist on a tight deadline writing a critical article about a famous sports figure or artist. Your editor demands accuracy, and any incorrect information could lead to a retraction and damage your reputation. Answer the following question based on the provided documents, ensuring that you carefully consider all the information and potential disambiguations.

2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a knowledgeable researcher. Answer the question based on the provided documents, ensuring that your response is accurate and well-reasoned.

2025/07/03 23:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Answer the question using the provided context, ensuring the answer

Average Metric: 25.00 / 160 (15.6%): 100%|██████████| 160/160 [00:00<00:00, 230.18it/s]

2025/07/03 23:29:40 INFO dspy.evaluate.evaluate: Average Metric: 25 / 160 (15.6%)





2025/07/03 23:29:40 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 15.62

2025/07/03 23:29:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 23 - Minibatch ==


Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:00<00:00, 449.59it/s]

2025/07/03 23:29:40 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/07/03 23:29:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/07/03 23:29:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29]
2025/07/03 23:29:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/07/03 23:29:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:00<00:00, 763.73it/s]

2025/07/03 23:29:41 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 2'].
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14]
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 23 - Minibatch ==



Average Metric: 8.00 / 35 (22.9%): 100%|██████████| 35/35 [00:00<00:00, 299.09it/s]

2025/07/03 23:29:41 INFO dspy.evaluate.evaluate: Average Metric: 8 / 35 (22.9%)
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86]
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:00<00:00, 410.42it/s]

2025/07/03 23:29:41 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0]
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 23 - Minibatch ==



Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:00<00:00, 303.46it/s] 

2025/07/03 23:29:42 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57]
2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62]
2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 23 - Full Evaluation =====
2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 22.86) from minibatch trials...



Average Metric: 20.00 / 160 (12.5%): 100%|██████████| 160/160 [00:00<00:00, 270.31it/s]

2025/07/03 23:29:42 INFO dspy.evaluate.evaluate: Average Metric: 20 / 160 (12.5%)
2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62





2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/03 23:29:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 23 - Minibatch ==


Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:00<00:00, 388.97it/s]

2025/07/03 23:29:43 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 6'].
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0]
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:00<00:00, 329.21it/s]

2025/07/03 23:29:43 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)





2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1'].
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14]
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 23 - Minibatch ==


Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:00<00:00, 358.72it/s]

2025/07/03 23:29:43 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57]
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:00<00:00, 407.55it/s]

2025/07/03 23:29:44 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9'].
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14]
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 23 - Minibatch ==



Average Metric: 6.00 / 35 (17.1%): 100%|██████████| 35/35 [00:00<00:00, 667.20it/s]

2025/07/03 23:29:44 INFO dspy.evaluate.evaluate: Average Metric: 6 / 35 (17.1%)
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 17.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14]
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5]
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 23 - Full Evaluation =====
2025/07/03 23:29:44 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 20.0) from minibatch trials...



Average Metric: 24.00 / 160 (15.0%): 100%|██████████| 160/160 [00:00<00:00, 260.46it/s]

2025/07/03 23:29:45 INFO dspy.evaluate.evaluate: Average Metric: 24 / 160 (15.0%)
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 23 - Minibatch ==



Average Metric: 11.00 / 35 (31.4%): 100%|██████████| 35/35 [00:00<00:00, 364.62it/s]

2025/07/03 23:29:45 INFO dspy.evaluate.evaluate: Average Metric: 11 / 35 (31.4%)
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 31.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43]
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 23 - Minibatch ==



Average Metric: 7.00 / 35 (20.0%): 100%|██████████| 35/35 [00:00<00:00, 433.87it/s]

2025/07/03 23:29:45 INFO dspy.evaluate.evaluate: Average Metric: 7 / 35 (20.0%)
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 5'].
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0]
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 23 - Minibatch ==



Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:00<00:00, 722.85it/s]

2025/07/03 23:29:45 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29]
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 23 - Minibatch ==



Average Metric: 5.00 / 35 (14.3%): 100%|██████████| 35/35 [00:00<00:00, 1069.38it/s]

2025/07/03 23:29:45 INFO dspy.evaluate.evaluate: Average Metric: 5 / 35 (14.3%)





2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 14.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29]
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 23 - Minibatch ==


Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:00<00:00, 345.92it/s] 

2025/07/03 23:29:46 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 8'].
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57]
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0]
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 23 - Full Evaluation =====
2025/07/03 23:29:46 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 22.86) from minibatch trials...



Average Metric: 21.00 / 160 (13.1%): 100%|██████████| 160/160 [00:00<00:00, 265.22it/s]

2025/07/03 23:29:47 INFO dspy.evaluate.evaluate: Average Metric: 21 / 160 (13.1%)





2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 23 - Minibatch ==


Average Metric: 1.00 / 35 (2.9%): 100%|██████████| 35/35 [00:00<00:00, 190.57it/s]

2025/07/03 23:29:47 INFO dspy.evaluate.evaluate: Average Metric: 1 / 35 (2.9%)





2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 2.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 7'].
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57, 2.86]
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 23 - Minibatch ==


Average Metric: 3.00 / 35 (8.6%): 100%|██████████| 35/35 [00:00<00:00, 367.48it/s]

2025/07/03 23:29:47 INFO dspy.evaluate.evaluate: Average Metric: 3 / 35 (8.6%)
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57, 2.86, 8.57]
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 23 - Minibatch ==



Average Metric: 4.00 / 35 (11.4%): 100%|██████████| 35/35 [00:00<00:00, 558.01it/s]

2025/07/03 23:29:48 INFO dspy.evaluate.evaluate: Average Metric: 4 / 35 (11.4%)
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 11.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 11'].
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [14.29, 17.14, 22.86, 20.0, 8.57, 20.0, 17.14, 8.57, 17.14, 17.14, 31.43, 20.0, 14.29, 14.29, 8.57, 2.86, 8.57, 11.43]
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12]
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62


2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 23 - Full Evaluation =====
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 20.0) from minibatch trials...



Average Metric: 22.00 / 160 (13.8%): 100%|██████████| 160/160 [00:00<00:00, 272.63it/s]

2025/07/03 23:29:48 INFO dspy.evaluate.evaluate: Average Metric: 22 / 160 (13.8%)
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [15.62, 12.5, 15.0, 13.12, 13.75]
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 15.62
2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/03 23:29:48 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 15.62!





In [16]:
# Score the optimized program on the same dev set for a like-for-like comparison
# with the baseline run above.
evaluate(optimized_answer)

Average Metric: 9.00 / 50 (18.0%): 100%|██████████| 50/50 [00:00<00:00, 1162.99it/s]

2025/07/03 23:29:49 INFO dspy.evaluate.evaluate: Average Metric: 9 / 50 (18.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,reasoning,correct_answers,explanation,EvaluationMetric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]",The context provided includes information about two schools: Wirra...,"[Boys and Girls, but in separate schools]",Wirral Grammar School consists of two separate institutions: Wirra...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"The question asks for the birth date of Harry Harvey. However, the...","[December 14, 1846, June 4, 1873, January 10, 1901]",There are three different Harry Harveys mentioned in the documents...,✔️ [1]


18.0

In [18]:
# Persist the optimized program so it can be reloaded with `dspy.load`.
# NOTE(review): the target directory is named "simba_cot", but the program saved
# here was compiled with MIPROv2 in the optimizer cell -- confirm the name is
# intentional before relying on it.
optimized_answer.save("/home/darth/Documents/code/ramdocs/RAMDocs/src/dspy/saved_states/simba_cot", save_program=True)

In [19]:
# Print the most recent LM interaction (system and user messages) to check the
# exact prompt the program sends.
lm.inspect_history(n=1)





[34m[2025-07-03T23:29:49.037073][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `context` (str):
Your output fields are:
1. `reasoning` (str): 
2. `correct_answers` (list[str]): 
3. `explanation` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## context ## ]]
{context}

[[ ## reasoning ## ]]
{reasoning}

[[ ## correct_answers ## ]]
{correct_answers}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## explanation ## ]]
{explanation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Answer a question based on the provided documents.


[31mUser message:[0m

[[ ## question ## ]]
Who was Chilperic's father?

[[ ## context ## ]]
Document 1: Chilperic I Chilperic I (c. 539 – September 584) was the king of Neustria (or Soissons) from 561 to his death. He was one

In [21]:
# Reload the program from the directory written by the save cell, rebinding
# `optimized_answer` to the restored copy.
optimized_answer = dspy.load("/home/darth/Documents/code/ramdocs/RAMDocs/src/dspy/saved_states/simba_cot")