In [16]:
import os
# Switch the kernel's working directory to the sibling `promptsmith` folder.
# Later cells resolve relative paths (e.g. the judge YAML) against this directory.
# NOTE(review): this cell's recorded execution count is out of order with the
# cells below — confirm the notebook still works with Restart Kernel -> Run All.
os.chdir('../promptsmith')

### Setting Up DSPy

In [1]:
from promptsmith.dspy_init import get_dspy
# get_dspy() returns a pair: the object used below as the `dspy` namespace and
# the language-model handle `lm`, whose `.history` is inspected in later cells.
# NOTE(review): presumably this also configures the default LM — confirm in
# promptsmith.dspy_init.
dspy, lm = get_dspy()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define a module (ChainOfThought) and assign it a signature (return an answer, given a question).
qa = dspy.ChainOfThought('question -> answer')

# Call the module, passing the signature's input field as a keyword argument.
response = qa(question="How many floors are in the castle David Gregory inherited?")
# `answer` is the output field named in the signature string above.
print(response.answer)

The number of floors in the castle David Gregory inherited is not specified in the information provided.


### Access the last call to the LLM, with all metadata

In [None]:
# Number of entries currently held in the LM's call history.
len(lm.history)  # e.g., 3 calls to the LM

In [4]:
# Field names available on the most recent history entry.
lm.history[-1].keys()

dict_keys(['prompt', 'messages', 'kwargs', 'response', 'outputs', 'usage', 'cost', 'timestamp', 'uuid', 'model', 'response_model', 'model_type'])

In [5]:
import pprint
# indent=2 keeps the nested history record readable when pretty-printed.
pp = pprint.PrettyPrinter(indent=2)
# Dump the full record of the most recent LM call (the last history entry).
pp.pprint(lm.history[-1])

{ 'cost': 8.235e-05,
  'kwargs': {},
  'messages': [ { 'content': 'Your input fields are:\n'
                             '1. `question` (str)\n'
                             'Your output fields are:\n'
                             '1. `reasoning` (str)\n'
                             '2. `answer` (str)\n'
                             'All interactions will be structured in the '
                             'following way, with the appropriate values '
                             'filled in.\n'
                             '\n'
                             '[[ ## question ## ]]\n'
                             '{question}\n'
                             '\n'
                             '[[ ## reasoning ## ]]\n'
                             '{reasoning}\n'
                             '\n'
                             '[[ ## answer ## ]]\n'
                             '{answer}\n'
                             '\n'
                             '[[ ## completed ## ]]\n'
               

### Request Multiple Variations

In [6]:
question = "What's something great about the ColBERT retrieval model?"

# n=5 asks for five completions of the same question in a single call.
answer_a_question = dspy.ChainOfThought('question -> answer', n=5)

# This cell only assigns `response`; the individual completions are read in
# the following cells.
response = answer_a_question(question=question)

['One great aspect of the ColBERT retrieval model is its ability to efficiently combine dense and sparse retrieval techniques, which enhances retrieval accuracy while significantly speeding up the search process through its late interaction mechanism.',
 'One great thing about the ColBERT retrieval model is its ability to combine the strengths of dense and sparse retrieval techniques while maintaining high efficiency through late interaction, allowing for effective large-scale information retrieval.',
 'One great aspect of the ColBERT retrieval model is its efficiency in handling large document collections through late interaction, allowing for fast and scalable retrieval without sacrificing accuracy.',
 'One great aspect of the ColBERT retrieval model is its efficiency in combining dense and sparse retrieval methods through late interaction, allowing for fast and high-quality document retrieval in large-scale applications.',
 "One great aspect of the ColBERT retrieval model is its abi

In [None]:
# The `answer` field across all returned completions
# (presumably one entry per requested completion — confirm).
response.completions.answer

In [7]:
# Top-level `reasoning` / `answer` attributes of the response.
# NOTE(review): these appear to mirror the first completion — confirm.
print(f"Reasoning: {response.reasoning}")
print(f"Answer: {response.answer}")

Reasoning: The ColBERT retrieval model is notable for its efficiency and effectiveness in handling large-scale information retrieval tasks. It utilizes a two-stage process that combines the benefits of dense and sparse representations, allowing it to efficiently search through vast datasets while maintaining high retrieval accuracy. The model employs late interaction mechanisms, which enable it to compute similarities between query and document embeddings without the need to compute a full dot product for every document, thus significantly speeding up the retrieval process. This makes ColBERT particularly well-suited for applications that require real-time search capabilities, such as search engines.
Answer: One great aspect of the ColBERT retrieval model is its ability to efficiently combine dense and sparse retrieval techniques, which enhances retrieval accuracy while significantly speeding up the search process through its late interaction mechanism.


### Check LLM Usage

In [8]:
# Token-usage record attached to this response (keyed by model name in the
# recorded output).
response.get_lm_usage()


{'openai/gpt-4o-mini': {'completion_tokens': 778,
  'prompt_tokens': 173,
  'total_tokens': 951,
  'completion_tokens_details': {'accepted_prediction_tokens': 0,
   'audio_tokens': 0,
   'reasoning_tokens': 0,
   'rejected_prediction_tokens': 0,
   'text_tokens': None},
  'prompt_tokens_details': {'audio_tokens': 0,
   'cached_tokens': 0,
   'text_tokens': None,
   'image_tokens': None}}}

### Cool Example

In [9]:
# Inline signature: two inputs (`sentence`, `situation`) mapped to two outputs
# annotated as `str`. The long output-field name doubles as the instruction
# for what that field should contain.
feeling_analyzer = dspy.Predict('sentence, situation -> the_actual_feeling_of_the_person_in_the_sentence: str, reasoning: str')

sentence="i went outside after a long time being in a dark room"
situation="it's raining outside"
# Inputs are passed by keyword, using the field names from the signature.
response = feeling_analyzer(sentence=sentence, situation=situation)

# Each output field of the signature is read back as an attribute of the response.
print(response.the_actual_feeling_of_the_person_in_the_sentence)
print(response.reasoning)

a mix of relief and disappointment
The person likely feels relief from finally being outside after being in a dark room for a long time, indicating a desire for fresh air and light. However, the disappointment comes from the fact that it is raining outside, which may dampen their experience and prevent them from fully enjoying being outdoors.


### Using a Judge

In [10]:
from promptsmith.judges.judge_meaning import JudgeMeaning

# Wrap the project's JudgeMeaning signature in a plain Predict module.
judge_meaning = dspy.Predict(JudgeMeaning)

# Original passage the judge treats as the reference.
input_text = (
    "Yesterday, I went to the grocery store to buy ingredients for dinner. "
    "I ended up buying fruits, vegetables, and pasta. When I got home, I realized I forgot the cheese."
)

# Condensed rewrite whose meaning preservation is being judged.
output_text = (
    "I went shopping yesterday to get food. I bought some fruits, vegetables, and pasta, "
    "but forgot to buy cheese."
)

# The judge takes both texts by keyword and exposes `reasoning` and `score` on its result.
result = judge_meaning(input_text=input_text, output_text=output_text)

print("Reasoning:", result.reasoning)
print("Score:", result.score)

Reasoning: The restructured text preserves the essential meaning of the original input. It maintains the key ideas of going shopping, the items purchased (fruits, vegetables, and pasta), and the fact that cheese was forgotten. The phrase "to get food" is a slight generalization but does not distort the overall meaning. The sequence of events is also preserved, with the mention of shopping yesterday and the realization of forgetting cheese. Overall, the changes are acceptable, and the meaning is well-preserved.
Score: 1.0


### Testing Ensemble Judge Evaluation

In [11]:
def display_verdict(verdict):
    """Print a human-readable report for an ensemble-judge verdict.

    The verdict's ``_store`` mapping is scanned for keys that follow the
    naming convention ``<judge>_score``, ``<judge>_reasoning`` and
    ``<judge>_weight``, plus an optional ``combined_score``.

    Args:
        verdict: Object exposing a dict-like ``_store`` attribute.

    Returns:
        None. The report is written to stdout.
    """
    score_suffix = '_score'
    weight_suffix = '_weight'

    print("\n📊 Evaluation Results:")
    print("----------------------")

    store = verdict._store

    # Per-judge score keys; the aggregate is reported separately below.
    score_fields = [k for k in store if k.endswith(score_suffix) and k != 'combined_score']
    # Strip only the trailing suffix so judge names that themselves contain
    # "_weight" / "_score" are not mangled.
    weights = {k[:-len(weight_suffix)]: store[k] for k in store if k.endswith(weight_suffix)}

    # Display overall score
    overall = store.get('combined_score')
    if overall is None and score_fields:
        # Fallback: unweighted average of all per-judge scores
        overall = sum(store[k] for k in score_fields) / len(score_fields)
    if overall is None:
        # Nothing to aggregate: report that instead of failing on format(None).
        print("\n🌟 Overall Score: N/A\n")
    else:
        print(f"\n🌟 Overall Score: {overall:.3f}\n")

    # For each judge, display name, score, weight, and reasoning.
    # Sorted for a consistent order between runs.
    for field in sorted(score_fields):
        judge_key = field[:-len(score_suffix)]
        judge_name = judge_key.replace('_', ' ').title()
        score = store[field]
        reasoning = store.get(f"{judge_key}_reasoning", "")
        weight = weights.get(judge_key)
        if weight is not None:
            print(f"### {judge_name} Analysis (score={score:.2f}, weight={weight})")
        else:
            print(f"### {judge_name} Analysis (score={score:.2f})")
        print(reasoning)
        print()  # Blank line between judges

### The Original Text and the Restructured Version

In [12]:
from promptsmith.tasks.restructure_text import RestructureText

# Sample narrative used as the input for the restructuring task.
text_to_restructure = (
    "I was trying to fix the kitchen sink. At first, I thought it was a clog, but it turned out to be a broken pipe. "
    "Water was everywhere, and I had no tools. I called my friend who had some plumbing experience, and he came over. "
    "Together we shut off the water and replaced the pipe, which took us the entire afternoon."
)

# Wrap the task signature in ChainOfThought; the result exposes `reasoning`
# alongside the task's `output_text`.
restructure = dspy.ChainOfThought(RestructureText)
# NOTE: despite its name, `restructured_text` is the whole prediction object;
# the rewritten text itself is `restructured_text.output_text`.
restructured_text = restructure(input_text=text_to_restructure)


print("\n📝 Original Text:")
print("----------------------")
print(text_to_restructure)
print("----------------------")

print("\n📘 Restructured Text:")
print("----------------------")
print(restructured_text.output_text)
print("----------------------")

print("\n🧠 Reasoning:")
print(restructured_text.reasoning)

print("\n🤖 DSPy History:")
# inspect_history() writes the transcript to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
dspy.inspect_history(n=1)


📝 Original Text:
----------------------
I was trying to fix the kitchen sink. At first, I thought it was a clog, but it turned out to be a broken pipe. Water was everywhere, and I had no tools. I called my friend who had some plumbing experience, and he came over. Together we shut off the water and replaced the pipe, which took us the entire afternoon.
----------------------

📘 Restructured Text:
----------------------
### Fixing the Kitchen Sink: A Plumbing Adventure

Recently, I faced a challenge while trying to fix my kitchen sink. Initially, I suspected that a clog was the issue, but I soon discovered that a broken pipe was the real problem.

Water was leaking everywhere, and unfortunately, I didn't have any tools on hand to address the situation. In need of assistance, I called a friend who had some plumbing experience. 

He quickly came over to help. Together, we managed to shut off the water supply and replace the broken pipe. The entire process took us the whole afternoon, but

### Evaluating the Restructured Text

In [13]:
from promptsmith.judges.ensemble_judge import EnsembleJudge
import os

# The relative path is resolved against the current working directory, so this
# depends on the os.chdir() performed at the top of the notebook.
judge_path = os.path.abspath("../promptsmith/judges/judge_restructure_text.yaml")

# Build the ensemble from the YAML file at `judge_path`, then score the
# restructured text against the original input.
judge = EnsembleJudge(judge_path)
verdict = judge(input_text=text_to_restructure, output_text=restructured_text.output_text)

In [14]:
# Render the ensemble verdict with the display_verdict helper defined earlier in this notebook.
display_verdict(verdict)


📊 Evaluation Results:
----------------------

🌟 Overall Score: 0.990

### Focus Relevance Analysis (score=1.00, weight=0.25)
The restructured text remains focused on the original message about fixing the kitchen sink. It accurately captures the sequence of events, from suspecting a clog to discovering a broken pipe, and the subsequent actions taken to resolve the issue. The added title and slight rephrasing do not detract from the main ideas, and all content is relevant to the plumbing situation described. There are no off-topic sentences or unnecessary filler, maintaining a clear narrative throughout.

### Meaning Analysis (score=1.00, weight=0.25)
The restructured text maintains the essential meaning of the original input. Key ideas such as the initial assumption of a clog, the discovery of a broken pipe, the water leak, the lack of tools, and the involvement of a friend with plumbing experience are all preserved. The sequence of events is also retained, including the shutting off o

#### Showing a Few "Meaning" Golden Set Examples

In [17]:
from promptsmith.golden.load import load_golden_set

# Load the "meaning" golden set; the loader's result is unpacked as a (dev, test) pair.
dev_set, test_set = load_golden_set(judge_name="meaning")

# Quick sanity check of the split sizes.
print(f"Loaded {len(dev_set)} dev examples and {len(test_set)} test examples")
def print_examples(examples, n=1):
    """Print the leading golden-set examples, one labelled field per line.

    Args:
        examples: Sized, indexable collection of records exposing the
            attributes ``input_text``, ``output_text``, ``gold_score`` and
            ``gold_reasoning``.
        n: Maximum number of examples to show; capped at ``len(examples)``.
    """
    # (label shown to the reader, attribute read from the record), in print order.
    labelled_fields = (
        ("Input", "input_text"),
        ("Output", "output_text"),
        ("Gold score", "gold_score"),
        ("Gold reasoning", "gold_reasoning"),
    )
    shown = min(n, len(examples))
    for number in range(1, shown + 1):
        print(f"\nExample {number}:")
        record = examples[number - 1]
        for label, attribute in labelled_fields:
            print(f"{label}: {getattr(record, attribute)}")

# Show up to three examples from the dev split.
print_examples(dev_set, n=3)


Current working directory: /Users/yanivgal/dev/ai21/promptsmith/promptsmith
Loaded 48 dev examples and 12 test examples

Example 1:
Input: Linda borrowed the book from Mark yesterday.
Output: Mark borrowed the book from Linda yesterday.
Gold score: 0.6
Gold reasoning: Roles reversed; rest identical.

Example 2:
Input: The novel was written by George Orwell in 1949.
Output: The novel was written by J. K. Rowling in 1949.
Gold score: 0.2
Gold reasoning: Author misattributed; timeframe correct but misleading.

Example 3:
Input: He earned a salary of $75,000 last year.
Output: He earned a salary of $750,000 last year.
Gold score: 0.2
Gold reasoning: Order‑of‑magnitude hallucination.


#### Calibrating the Judges

In [18]:
import importlib

# Reload the judge and metric modules so edits made on disk are picked up
# without restarting the kernel.
import promptsmith.judges.judge_meaning
importlib.reload(promptsmith.judges.judge_meaning)

import promptsmith.judges.metrics
importlib.reload(promptsmith.judges.metrics)

# Re-bind names *after* the reloads: a `from ... import` executed in an earlier
# cell still points at the pre-reload class object, so calibrating with it
# would silently ignore the reloaded definition.
from promptsmith.judges.judge_meaning import JudgeMeaning
from promptsmith.judges.metrics import calibrate_judge, evaluate_judge

from promptsmith.golden.load import load_golden_set


# 1. Load and split the golden set
devset, testset = load_golden_set("meaning")

# 2. Optimise the prompt.
# NOTE(review): no `auto` argument is passed here, so calibrate_judge runs with
# its own defaults; check promptsmith.judges.metrics for how to request a
# heavier search.
best_judge = calibrate_judge(devset=devset, judge_class=JudgeMeaning)

# 3. Evaluate on held‑out examples
print("\nHeld‑out accuracy:", evaluate_judge(best_judge, testset))

# 4. Inspect the optimiser’s final prompt
print("\n— Tuned prompt text —\n")
print(best_judge.signature.instructions)


Current working directory: /Users/yanivgal/dev/ai21/promptsmith/promptsmith


2025/05/04 13:36:07 INFO dspy.evaluate.evaluate: Average Metric: 38.35000000000001 / 48 (79.9%)


Baseline dev acc: 79.900


2025/05/04 13:36:08 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/05/04 13:36:08 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/05/04 13:36:08 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=3 sets of demonstrations...


Bootstrapping set 1/3
Bootstrapping set 2/3
Bootstrapping set 3/3


  0%|          | 0/10 [00:00<?, ?it/s]2025/05/04 13:36:11 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'input_text': 'Linda borrowed the book from Mark yesterday.', 'output_text': 'Mark borrowed the book from Linda yesterday.', 'gold_score': 0.6, 'gold_reasoning': 'Roles reversed; rest identical.'}) (input_keys={'input_text', 'output_text'}) with <function score_agreement_metric at 0x1257e4c10> due to getattr(): attribute name must be string.
 10%|█         | 1/10 [00:02<00:20,  2.26s/it]2025/05/04 13:36:13 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'input_text': 'The novel was written by George\xa0Orwell in 1949.', 'output_text': 'The novel was written by J.\u202fK.\xa0Rowling in 1949.', 'gold_score': 0.2, 'gold_reasoning': 'Author misattributed; timeframe correct but misleading.'}) (input_keys={'input_text', 'output_text'}) with <function score_agreement_metric at 0x1257e4c10> due to getattr(): attribute name must 

Average Metric: 31.25 / 38 (82.2%): 100%|██████████| 38/38 [00:13<00:00,  2.78it/s]

2025/05/04 13:37:16 INFO dspy.evaluate.evaluate: Average Metric: 31.25 / 38 (82.2%)
2025/05/04 13:37:16 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 82.24

2025/05/04 13:37:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 7 - Minibatch ==



Average Metric: 27.90 / 35 (79.7%): 100%|██████████| 35/35 [00:12<00:00,  2.88it/s]

2025/05/04 13:37:29 INFO dspy.evaluate.evaluate: Average Metric: 27.900000000000002 / 35 (79.7%)
2025/05/04 13:37:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 79.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/05/04 13:37:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [79.71]
2025/05/04 13:37:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [82.24]
2025/05/04 13:37:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 82.24


2025/05/04 13:37:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 7 - Minibatch ==



Average Metric: 28.95 / 35 (82.7%): 100%|██████████| 35/35 [00:11<00:00,  3.17it/s]

2025/05/04 13:37:40 INFO dspy.evaluate.evaluate: Average Metric: 28.95 / 35 (82.7%)
2025/05/04 13:37:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0'].
2025/05/04 13:37:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [79.71, 82.71]
2025/05/04 13:37:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [82.24]
2025/05/04 13:37:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 82.24


2025/05/04 13:37:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 7 - Minibatch ==



Average Metric: 28.80 / 35 (82.3%): 100%|██████████| 35/35 [00:11<00:00,  3.06it/s]

2025/05/04 13:37:51 INFO dspy.evaluate.evaluate: Average Metric: 28.8 / 35 (82.3%)
2025/05/04 13:37:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1'].
2025/05/04 13:37:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [79.71, 82.71, 82.29]
2025/05/04 13:37:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [82.24]
2025/05/04 13:37:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 82.24


2025/05/04 13:37:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 7 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:11<00:00,  3.08it/s]

2025/05/04 13:38:02 INFO dspy.evaluate.evaluate: Average Metric: 28.000000000000004 / 35 (80.0%)
2025/05/04 13:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2'].
2025/05/04 13:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [79.71, 82.71, 82.29, 80.0]
2025/05/04 13:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [82.24]
2025/05/04 13:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 82.24


2025/05/04 13:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 7 - Minibatch ==



Average Metric: 28.25 / 35 (80.7%): 100%|██████████| 35/35 [00:12<00:00,  2.73it/s]

2025/05/04 13:38:15 INFO dspy.evaluate.evaluate: Average Metric: 28.250000000000007 / 35 (80.7%)
2025/05/04 13:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0'].
2025/05/04 13:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [79.71, 82.71, 82.29, 80.0, 80.71]
2025/05/04 13:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [82.24]
2025/05/04 13:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 82.24


2025/05/04 13:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 - Full Evaluation =====
2025/05/04 13:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 81.71) from minibatch trials...



Average Metric: 31.30 / 38 (82.4%): 100%|██████████| 38/38 [00:13<00:00,  2.83it/s]

2025/05/04 13:38:29 INFO dspy.evaluate.evaluate: Average Metric: 31.3 / 38 (82.4%)
2025/05/04 13:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 82.37
2025/05/04 13:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [82.24, 82.37]
2025/05/04 13:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 82.37
2025/05/04 13:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/05/04 13:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 82.37!





2025/05/04 13:38:44 INFO dspy.evaluate.evaluate: Average Metric: 38.25000000000001 / 48 (79.7%)
2025/05/04 13:38:44 ERROR dspy.utils.parallelizer: Error for Example({'input_text': '"Don\'t forget the keys," Maria reminded John.', 'output_text': 'Maria reminded John that he had already lost the keys.', 'gold_score': 0.4, 'gold_reasoning': 'Adds new information that changes meaning.'}) (input_keys={'input_text', 'output_text'}): 'tuple' object is not callable. Set `provide_traceback=True` for traceback.
2025/05/04 13:38:44 ERROR dspy.utils.parallelizer: Error for Example({'input_text': 'E = mc² describes the equivalence of mass and energy.', 'output_text': 'E = mc² proves mass and time are equivalent.', 'gold_score': 0.2, 'gold_reasoning': 'Core concept distorted.'}) (input_keys={'input_text', 'output_text'}): 'tuple' object is not callable. Set `provide_traceback=True` for traceback.
2025/05/04 13:38:44 ERROR dspy.utils.parallelizer: Error for Example({'input_text': 'The library opens a

Tuned dev acc: 79.690


Exception: Execution cancelled due to errors or interruption.

In [26]:
# NOTE(review): `dspy` was already bound by get_dspy() in the setup cell;
# this import rebinds the name — confirm both refer to the same module.
import dspy
# Use the same package-qualified path as the rest of the notebook. Importing
# via the bare `judges.` path only resolves because of the working directory
# and loads a second, distinct copy of the module (and of JudgeMeaning).
from promptsmith.judges.judge_meaning import JudgeMeaning

# original (baseline) predictor
baseline = dspy.Predict(JudgeMeaning)

print("— Original prompt —")
print("=" * 30)
print()
print(baseline.signature.instructions)

# `best_judge` is the tuned predictor produced by the calibration cell.
print("\n— Tuned prompt —")
print("=" * 30)
print()
print(best_judge.signature.instructions)


— Original prompt —

You are an expert in evaluating meaning preservation.

Your task is to determine whether the restructured text preserves the essential meaning of the original input text.

Please consider:
1. Are all key ideas and important details still present?
2. Were any important parts of the original meaning lost or changed significantly?
3. Are minor rephrasings or small generalizations acceptable if they don't distort the overall meaning?

Assign a score between 0 (poor preservation) and 1 (perfect preservation).

Explain your reasoning clearly, mentioning what was preserved well and any important losses or changes.

— Tuned prompt —

You are a language evaluation expert tasked with assessing the quality of sentence transformations. Your job is to determine if the transformed text maintains the essential meaning of the original input sentence. 

When evaluating, please consider the following:
1. Are all key ideas and important details from the original sentence still presen

In [27]:
# Persist the tuned judge. The file name is relative, so it is presumably
# written to the kernel's current working directory — confirm.
best_judge.save("best_meaning_judge.json")