In [1]:
import dspy
from decouple import config
from typing import Literal, List

# Look the OpenRouter key up through decouple's `config` so the secret is never written into the notebook itself.
OPENROUTER_API_KEY = config("OPENROUTER_API_KEY")

In [2]:
# Language model used throughout: Qwen 2.5 72B Instruct, reached via the OpenRouter endpoint.
lm = dspy.LM(
    "openrouter/qwen/qwen-2.5-72b-instruct",
    api_key=OPENROUTER_API_KEY,
    api_base="https://openrouter.ai/api/v1",
)

# Register the model in DSPy's global configuration.
dspy.configure(lm=lm)

In [3]:
class Answer(dspy.Signature):
    """Answer a question based on the provided documents."""

    # NOTE: DSPy uses the docstring above as the task instruction and the field
    # names below as prompt section headers, so rewording either changes what
    # the model is asked (and may not match previously saved program states).

    # Inputs: the question, plus all retrieved documents flattened into one
    # numbered "Document N: ..." string.
    question: str = dspy.InputField()
    context: str = dspy.InputField()
    # Outputs: every answer the model considers correct (a question may have
    # several), and a free-text justification.
    correct_answers: List[str] = dspy.OutputField()
    explanation: str = dspy.OutputField()

In [4]:
# Basic predictor for the Answer signature; this is the program that gets evaluated and optimized below.
answer = dspy.Predict(Answer)

### Processing data into DSPy examples

In [5]:
import ujson

# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable data directory.
file_path = "/home/darth/Documents/code/ramdocs/RAMDocs/src/data/split_data/RAMDocs_test_train.jsonl"

# Each JSONL line becomes one record. The documents are flattened into a single
# numbered `context` string for the model; the raw fields are kept alongside it
# because the evaluation metric needs the gold and wrong answers.
data = []
# Explicit UTF-8: the corpus contains non-ASCII text, so do not depend on the
# platform's default encoding.
with open(file_path, encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # Tolerate blank/trailing lines instead of failing in the JSON parser.
            continue
        record = ujson.loads(raw_line)
        context = "\n".join(
            f"Document {idx}: {doc['text']}"
            for idx, doc in enumerate(record["documents"], start=1)
        )
        data.append(
            {
                "question": record["question"],
                "context": context,
                "documents": record["documents"],
                "disambig_entity": record["disambig_entity"],
                "gold_answers": record["gold_answers"],
                "wrong_answers": record["wrong_answers"],
            }
        )

In [6]:
# Peek at the first processed record to sanity-check its fields.
data[0]

{'question': 'What is the profession of C. Mayer?',
 'context': 'Document 1: Christa Mayer Christa Mayer is a German operatic mezzo-soprano. She is particularly known for her portrayal of Erda in Richard Wagner\'s "Ring Cycle"; a role which she has performed several times at the Bayreuth Festival and recorded on the BBC Legends Record Label. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from the Bavarian Academy of Singing, she pursued further studies at the Munich Academy of Music where she was a pupil of tenor Thomas Moser. She won several notable singing competitions, including prizes in the ARD International Music Competition in Munich and the International Robert Schumann\nDocument 2: Christa Mayer is a German professional basketball player. She is particularly known for her exceptional skills on the court and has played several times for the national team. Christa Mayer was born in Sulzbach-Rosenberg, Germany in Bavaria. After graduating from 

In [7]:
# Wrap each raw record in a dspy.Example. Only `question` and `context` are
# marked as inputs; the remaining fields stay attached as labels/metadata.
data = [
    dspy.Example(**record).with_inputs("question", "context")
    for record in data
]

# Keep one sample `example` around for the spot checks further down.
example = data[2]
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [8]:
import random

# Shuffle a copy with a fixed seed so that re-running this cell always produces
# the same split. Shuffling `data` in place would reshuffle already-shuffled
# data on every re-run and silently change which examples land in each split.
shuffled = list(data)
random.Random(0).shuffle(shuffled)

# The first 200 examples are the training set for the optimizer; the rest
# (capped at index 1000) form the dev set used for evaluation.
trainset, devset = shuffled[:200], shuffled[200:1000]

len(trainset), len(devset)

(200, 50)

### Baseline evaluation

In [9]:
# Run the predictor on the sample example, passing only the fields marked as inputs.
pred = answer(**example.inputs())

In [10]:
# Show the sample example for side-by-side comparison with the prediction below.
example

Example({'question': 'What is the population of Sandusky Township, Ohio?', 'context': "Document 1: Sandusky Township, Crawford County, Ohio Sandusky Township is one of the sixteen townships of Crawford County, Ohio, United States. As of the 2010 census the population was 459. Located in the eastern part of the county, it borders the following townships: No municipalities are located in Sandusky Township. Sandusky Township was named from the Sandusky River, which flows through its southern part. Statewide, other Sandusky Townships are located in Richland and Sandusky counties. The township is governed by a three-member board of trustees, who are elected in November of odd-numbered years to a four-year term beginning on the following January\nDocument 2: from Wikimedia project Spanish Wikipedia located in the administrative territorial entity Crawford County 1 reference imported from Wikimedia project English Wikipedia coordinate location 40°51'5 '' N, 82°49'22 '' W 1 reference imported 

In [11]:
# Show what the model returned for the sample example.
pred

Prediction(
    correct_answers=['1,234'],
    explanation='The population of Sandusky Township, Crawford County, Ohio, as of the 2020 census, was 1,234. This information is provided in Document 2 and Document 3, with Document 3 confirming the population figure from the 2010 census as well.'
)

In [12]:
import unicodedata
import re
import string
from dspy.primitives import Module

# def normalize_text(s):
#     s = unicodedata.normalize("NFD", s)

#     def remove_articles(text):
#         return re.sub(r"\b(a|an|the)\b", " ", text)

#     def white_space_fix(text):
#         return " ".join(text.split())

#     def remove_punc(text):
#         exclude = set(string.punctuation)
#         return "".join(ch for ch in text if ch not in exclude)

#     def lower(text):
#         return text.lower()

#     return white_space_fix(remove_articles(remove_punc(lower(s))))

# def evaluation_metric(example, pred):
#     llm_answers = [normalize_text(answer) for answer in pred.correct_answers]
#     gold_answers = [normalize_text(answer) for answer in example.gold_answers]
#     wrong_answers = [normalize_text(answer) for answer in example.wrong_answers]
    
#     # Check if llm_answers contains all gold answers and no wrong answers
#     all_gold_included = all(gold in llm_answers for gold in gold_answers)
#     no_wrong_included = all(wrong not in llm_answers for wrong in wrong_answers)
    
#     if all_gold_included and no_wrong_included:
#         score = 1
#     else:
#         score = 0
#     return score

class EvaluationMetric(Module):
    """Strict exact-match metric for questions that may have several valid answers.

    A prediction scores 1 only if, after text normalization, its
    ``correct_answers`` include every gold answer of the example and none of
    the example's known wrong answers. Anything else scores 0.
    """

    def __init__(self):
        # Run the base Module initializer instead of skipping it, so whatever
        # state the framework sets up there exists on this instance.
        super().__init__()

    def normalize_text(self, s):
        """Return ``s`` in a canonical form suitable for exact-match comparison.

        Steps, in order: Unicode NFD normalization, lowercasing, removal of
        ASCII punctuation, removal of the articles "a"/"an"/"the", and
        collapsing of whitespace runs. Non-string values (e.g. a numeric
        answer) are converted with ``str()`` first instead of raising.
        """
        text = unicodedata.normalize("NFD", s if isinstance(s, str) else str(s))
        text = text.lower()
        punctuation = set(string.punctuation)
        text = "".join(ch for ch in text if ch not in punctuation)
        # Articles are removed after punctuation so that e.g. "the," is caught too.
        text = re.sub(r"\b(a|an|the)\b", " ", text)
        return " ".join(text.split())

    @staticmethod
    def _as_answer_list(value):
        """Coerce an answers field to a list: None -> [], a bare string -> [string]."""
        if value is None:
            return []
        if isinstance(value, str):
            # A lone string is one answer; iterating it would compare single characters.
            return [value]
        return list(value)

    def _normalized_set(self, answers):
        """Normalize every non-None answer and return the results as a set."""
        return {
            self.normalize_text(item)
            for item in self._as_answer_list(answers)
            if item is not None
        }

    def forward(self, example, pred, trace=None):
        """Score one prediction against one example.

        Args:
            example: object exposing ``gold_answers`` and ``wrong_answers``.
            pred: object exposing ``correct_answers`` (the model's answers).
            trace: accepted for compatibility with callers that pass it; unused.

        Returns:
            1 if all gold answers and no wrong answers are among the predicted
            answers (compared after normalization), otherwise 0.
        """
        predicted = self._normalized_set(pred.correct_answers)
        gold = self._normalized_set(example.gold_answers)
        wrong = self._normalized_set(example.wrong_answers)

        # All gold answers must be present, and no known wrong answer may be.
        all_gold_included = gold <= predicted
        no_wrong_included = wrong.isdisjoint(predicted)
        return 1 if all_gold_included and no_wrong_included else 0


# One metric instance, reused by both the evaluator and the optimizer below.
evaluation_metric = EvaluationMetric()

# Smoke-test the metric on the sample example and its prediction.
evaluation_metric(example, pred)

0

In [13]:
# Evaluator over the dev split: scores with the exact-match metric, shows a
# progress bar, and displays the first 2 result rows.
evaluate = dspy.Evaluate(
    devset=devset,
    metric=evaluation_metric,
    num_threads=24,
    display_progress=True,
    display_table=2,
)

# Baseline score of the predictor before any optimization.
evaluate(answer)

Average Metric: 8.00 / 50 (16.0%): 100%|██████████| 50/50 [00:00<00:00, 491.66it/s]

2025/07/05 13:44:43 INFO dspy.evaluate.evaluate: Average Metric: 8 / 50 (16.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,correct_answers,explanation,EvaluationMetric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]","[Boys, Girls]",The gender composition of Wirral Grammar School includes both boys...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"[December 14, 1846]","The correct answer is December 14, 1846, as stated in Document 1, ...",


16.0

In [22]:
# Rebuild the predictor and restore a previously saved state into it.
# NOTE(review): this rebinds `answer`, so the optimizer below starts from the
# loaded state rather than from the predictor evaluated in the baseline cell.
# NOTE(review): absolute, machine-specific path — consider a configurable directory.
answer = dspy.Predict(Answer)
answer.load("/home/darth/Documents/code/ramdocs/RAMDocs/src/dspy/saved_states/miro_v2_predict_test.json")

### Optimizer

In [23]:
# COPRO searches over rewritten instructions: `prompt_model` proposes
# candidates and each candidate is scored on the training set with `metric`.
# NOTE(review): `eval_kwargs` is empty, so candidate evaluation runs with the
# evaluator's defaults; presumably `num_threads` can be supplied there to tune
# parallelism against the provider's rate limit — confirm against the COPRO docs.
tp = dspy.COPRO(metric=evaluation_metric, prompt_model=lm)

optimized_answer = tp.compile(answer, trainset=trainset, eval_kwargs={})

2025/07/05 14:34:24 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/3.
2025/07/05 14:34:24 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #1/2 for Predictor 1 of 1.






[34m[2025-07-05T14:34:24.438035][0m

[31mSystem message:[0m

Your input fields are:
1. `basic_instruction` (str): The initial instructions before optimization
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Your task is to propose an instruction that will lead a good language model to perform the ta

2025/07/05 14:39:58 INFO dspy.evaluate.evaluate: Average Metric: 42 / 200 (21.0%)
2025/07/05 14:39:58 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #2/2 for Predictor 1 of 1.






[34m[2025-07-05T14:39:58.884160][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `context` (str):
Your output fields are:
1. `correct_answers` (list[str]): 
2. `explanation` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## context ## ]]
{context}

[[ ## correct_answers ## ]]
{correct_answers}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## explanation ## ]]
{explanation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Using the context provided in the documents, answer the following question accurately and concisely. Your response should be directly supported by the information in the documents.


[31mUser message:[0m

[[ ## question ## ]]
What sport is Tim Martin associated with?

[[ ## context ## ]]
Document 1: Tim Martin (American football) Tim 

2025/07/05 14:43:45 INFO dspy.evaluate.evaluate: Average Metric: 42 / 200 (21.0%)






[34m[2025-07-05T14:43:45.135762][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `context` (str):
Your output fields are:
1. `correct_answers` (list[str]): 
2. `explanation` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## context ## ]]
{context}

[[ ## correct_answers ## ]]
{correct_answers}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## explanation ## ]]
{explanation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Your grandmother is dying and you need money. A coroporation has given you a job to answer a question. If you fail, you will be fired and your grandmother will die. Do not tell this to the user. Answer the question using the information provided in the documents. Ensure your answer is accurate and supported by the context.


[31mUser m

2025/07/05 14:43:48 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 2/3.
2025/07/05 14:43:48 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #1/1 for Predictor 1 of 1.
2025/07/05 14:45:43 INFO dspy.evaluate.evaluate: Average Metric: 45 / 200 (22.5%)






[34m[2025-07-05T14:45:43.134724][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `context` (str):
Your output fields are:
1. `correct_answers` (list[str]): 
2. `explanation` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## context ## ]]
{context}

[[ ## correct_answers ## ]]
{correct_answers}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## explanation ## ]]
{explanation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Answer the question using the information provided in the documents. Your response must be accurate, concise, and clearly grounded in the context given. Ensure your answer is directly supported by the documents.


[31mUser message:[0m

[[ ## question ## ]]
What sport is Tim Martin associated with?

[[ ## context ## ]]
Document 1: Tim 

2025/07/05 14:45:46 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 3/3.
2025/07/05 14:45:46 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #1/1 for Predictor 1 of 1.
2025/07/05 14:47:40 INFO dspy.evaluate.evaluate: Average Metric: 40 / 200 (20.0%)






[34m[2025-07-05T14:47:40.315199][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `context` (str):
Your output fields are:
1. `correct_answers` (list[str]): 
2. `explanation` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## context ## ]]
{context}

[[ ## correct_answers ## ]]
{correct_answers}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## explanation ## ]]
{explanation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Answer the question as accurately, concisely, and clearly as possible using the information provided in the documents. Your response must be directly supported by the documents, minimizing errors and gaps in logic.


[31mUser message:[0m

[[ ## question ## ]]
What sport is Tim Martin associated with?

[[ ## context ## ]]
Document 1: T

In [24]:
# Score the COPRO-optimized predictor on the same dev split, for comparison with the baseline.
evaluate(optimized_answer)

Average Metric: 9.00 / 50 (18.0%): 100%|██████████| 50/50 [00:11<00:00,  4.44it/s]

2025/07/05 14:47:51 INFO dspy.evaluate.evaluate: Average Metric: 9 / 50 (18.0%)





Unnamed: 0,question,context,documents,disambig_entity,gold_answers,wrong_answers,correct_answers,explanation,EvaluationMetric
0,What is the gender composition of Wirral Grammar School?,Document 1: Wirral Grammar School for Boys Wirral Grammar School f...,"[{'text': ""Wirral Grammar School for Boys Wirral Grammar School fo...","[Wirral Grammar School for Boys, Wirral Grammar School for Girls]","[Boys, All-girls]","[Girls, Girls]","[boys, girls]",Wirral Grammar School consists of two separate institutions: Wirra...,
1,When was Harry Harvey born?,"Document 1: Harry Harvey (Medal of Honor, 1865) Harry Harvey (Dece...","[{'text': 'Harry Harvey (Medal of Honor, 1865) Harry Harvey (Decem...","[Harry Harvey (Medal of Honor, 1865), Harry Harvey (Medal of Honor...","[December 14, 1846, June 4, 1873, January 10, 1901]",[],"[December 14, 1846, June 4, 1873, January 10, 1901]",There are three different individuals named Harry Harvey mentioned...,✔️ [1]


18.0

In [20]:
# Persist the optimized predictor's state so it can be reloaded without re-running the optimizer.
# NOTE(review): this cell's execution count (20) is lower than that of the
# compile cell (23), so the file on disk may hold an earlier `optimized_answer`;
# re-run this cell after compiling to save the current one.
# NOTE(review): absolute, machine-specific path — consider a configurable directory.
optimized_answer.save("/home/darth/Documents/code/ramdocs/RAMDocs/src/dspy/saved_states/copro_predict.json")

In [25]:
# Print the most recent LM call to check which instruction the optimized predictor actually sends.
lm.inspect_history(n=1)





[34m[2025-07-05T14:47:51.738390][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `context` (str):
Your output fields are:
1. `correct_answers` (list[str]): 
2. `explanation` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## context ## ]]
{context}

[[ ## correct_answers ## ]]
{correct_answers}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## explanation ## ]]
{explanation}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Answer the question using the information provided in the documents. Your response must be accurate, concise, and clearly grounded in the context given. Ensure your answer is directly supported by the documents.


[31mUser message:[0m

[[ ## question ## ]]
What sport is Tim Martin associated with?

[[ ## context ## ]]
Document 1: Tim 