In [1]:
import pandas as pd
import dspy
from sklearn.model_selection import train_test_split
from dspy.evaluate import SemanticF1
import os

In [2]:
from utils.utils import load_data
from eval.evaluation import run_evaluation

## configure DSPy

In [3]:
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [4]:
lm = dspy.LM('openai/gpt-4o')
dspy.configure(lm=lm)

## Load some podcast data

In [5]:
df = load_data(truncate_transcripts=True)

In [6]:
len(df)

80

In [7]:
df.head(2)

Unnamed: 0,post_url,post_title,series_number,blog_date,blog_title,file_name,has_transcript,transcript,question,human_answer
0,https://www.acquired.fm/episodes/costco,Costco,"Season 13, Episode 2","August 20, 2023",The Complete History & Strategy of Costco,costco,True,Transcript: (disclaimer: may contain uninten...,How many hot dogs does Costco currently sell p...,130 million
1,https://www.acquired.fm/episodes/costco,Costco,"Season 13, Episode 2","August 20, 2023",The Complete History & Strategy of Costco,costco,True,Transcript: (disclaimer: may contain uninten...,"What store was created as ""the price club of h...",Home Depot


In [8]:
data = []
for idx, row in df.iterrows():
    example = dspy.Example(question=row['question'], 
                           response=row['human_answer'], 
                           context=row['transcript']).with_inputs('question', 'context')
    data.append(example)

In [9]:
data[0].keys()

['question', 'response', 'context']

In [10]:
train, val_ = train_test_split(data, test_size = 0.8, random_state=42)
val, test = train_test_split(data, test_size = 0.5, random_state=42)

In [11]:
len(train), len(val), len(test)

(16, 40, 40)

## Basic question answering without RAG 

In [12]:
cot = dspy.ChainOfThought('question -> response')

In [13]:
print(val[14]['question']) # a question that's hard to answer without source

What was Peloton original price


In [14]:
pred = cot(**val[14].inputs())
print(pred)

Prediction(
    reasoning='To determine the original price of Peloton, we need to consider the context in which "original price" is being asked. This could refer to the initial public offering (IPO) price of Peloton\'s stock when it first became publicly traded, or it could refer to the original price of Peloton\'s flagship product, the Peloton Bike, when it was first released to the market. The IPO price is a specific financial event, while the product price is related to consumer sales.',
    response="Peloton's initial public offering (IPO) price was $29 per share when it went public on September 26, 2019. If you are referring to the original price of the Peloton Bike, it was initially priced at around $2,245 when it was first introduced to the market."
)


### evaluation using semantic F1

In [15]:
metric = SemanticF1(decompositional=True)

In [16]:
run_evaluation(val, metric, cot)

Average Metric: 9.65 / 40 (24.1%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 452.80it/s]

2025/03/26 21:23:31 INFO dspy.evaluate.evaluate: Average Metric: 9.650937950937951 / 40 (24.1%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,"During high school year, which online security community Jan Koum ...",w00w00,"Transcript: (disclaimer: may contain unintentionally confusing, in...","Jan Koum, the co-founder of WhatsApp, was involved in the online s...","During his high school years, Jan Koum was part of the online secu...",✔️ [1.000]
1,What is the difference between recurrent neural networks and convo...,The state of the art had evolved and RNN is sequential and has ver...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Recurrent Neural Networks (RNNs) and Convolutional Neural Networks...,Recurrent Neural Networks (RNNs) and Convolutional Neural Networks...,✔️ [0.364]
2,What was the patent Heddy developed? Was it publicly available?,"Hedy teams up with her new Hollywood neighbor, a music composer na...","Transcript: (disclaimer: may contain unintentionally confusing, in...","Hedy Lamarr, an actress and inventor, co-developed a patent for a ...",Hedy Lamarr developed a patent for a frequency-hopping spread spec...,✔️ [0.500]
3,Which was one of the first free email services?,Juno was one of the first free email services.,"Transcript: (disclaimer: may contain unintentionally confusing, in...","In the mid-1990s, the internet was becoming more accessible to the...","One of the first free email services was Hotmail, which was launch...",
4,Visa is the global sponsor of what?,Visa is the global sponsor of The Olympics,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Visa is a major global payments technology company that often spon...,Visa is the global sponsor of the Olympic Games and the FIFA World...,✔️ [0.667]


24.13

## RAG using the transcript

In [17]:
rag = dspy.ChainOfThought('context, question -> response')

In [18]:
pred = rag(**val[14].inputs())
print(pred)

Prediction(
    reasoning="Peloton initially priced their bike at $1200 when they first tried to start selling it. However, they realized that at this price point, customers perceived the bike as lower quality. To address this perception and position the bike as a premium product, they raised the price to $2245. This price increase helped signal the bike's value and quality, aligning with the luxury and aspirational brand image they wanted to create.",
    response="Peloton's original price for their bike was $1200."
)


In [19]:
run_evaluation(val, metric, rag)

Average Metric: 21.15 / 40 (52.9%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 257.86it/s]

2025/03/26 21:23:51 INFO dspy.evaluate.evaluate: Average Metric: 21.14838991417939 / 40 (52.9%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,"During high school year, which online security community Jan Koum ...",w00w00,"Transcript: (disclaimer: may contain unintentionally confusing, in...","In the transcript, it is mentioned that during his high school yea...","During his high school years, Jan Koum was part of an online hacke...",✔️ [1.000]
1,What is the difference between recurrent neural networks and convo...,The state of the art had evolved and RNN is sequential and has ver...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Recurrent Neural Networks (RNNs) and Convolutional Neural Networks...,Recurrent Neural Networks (RNNs) are designed to handle sequential...,✔️ [0.286]
2,What was the patent Heddy developed? Was it publicly available?,"Hedy teams up with her new Hollywood neighbor, a music composer na...","Transcript: (disclaimer: may contain unintentionally confusing, in...","Hedy Lamarr, along with George Antheil, developed a patent for a f...",Hedy Lamarr developed a patent for a frequency hopping spread spec...,✔️ [0.667]
3,Which was one of the first free email services?,Juno was one of the first free email services.,"Transcript: (disclaimer: may contain unintentionally confusing, in...","In the context provided, it is mentioned that D. E. Shaw, the firm...",Juno was one of the first free email services.,✔️ [1.000]
4,Visa is the global sponsor of what?,Visa is the global sponsor of The Olympics,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Visa is a major global company with significant involvement in int...,"Visa is the global sponsor of the Olympic Games and FIFA, includin...",✔️ [0.667]


52.87

## Optimize the RAG using mipro v2

In [20]:
class RAG(dspy.Module):
    def __init__(self):
        self.rag = dspy.ChainOfThought('context, question -> response')

    def forward(self, question, context):
        return self.rag(question=question, context=context)

In [21]:
rag_pipeline = RAG()

In [22]:
rag_pipeline(val[14]['question'], val[14]['context'])

Prediction(
    reasoning="Peloton initially priced their bike at $1200 when they first tried to start selling it. However, they realized that at this price point, customers perceived the bike as lower quality. To address this perception and position the bike as a premium product, they raised the price to $2245. This price increase helped signal the bike's value and quality, aligning with the luxury and aspirational brand image they wanted to create.",
    response="Peloton's original price for their bike was $1200."
)

In [23]:
optimizer = dspy.MIPROv2(metric=metric, auto="light", num_threads=12)  # use fewer threads if your rate limit is small

optimized_rag_pipeline = optimizer.compile(RAG(), trainset=train,
                           max_bootstrapped_demos=4, max_labeled_demos=4,
                           requires_permission_to_run=False)

2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 12

2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.10it/s]


Bootstrapped 4 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/5


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 254.81it/s]


Bootstrapped 4 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/5


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 383.95it/s]
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `context`, `question`, produce the fields `response`.

2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are provided with a context and a question. Use the context to derive a detailed rea

Bootstrapped 4 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 7.01 / 12 (58.4%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 4316.61it/s]

2025/03/26 21:24:24 INFO dspy.evaluate.evaluate: Average Metric: 7.007142857142857 / 12 (58.4%)
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 58.39

2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====



Average Metric: 9.13 / 12 (76.1%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 477.25it/s]

2025/03/26 21:24:24 INFO dspy.evaluate.evaluate: Average Metric: 9.133333333333333 / 12 (76.1%)
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 76.11
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.11 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [58.39, 76.11]
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 76.11


2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====



Average Metric: 8.68 / 12 (72.4%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 486.97it/s]

2025/03/26 21:24:24 INFO dspy.evaluate.evaluate: Average Metric: 8.683549783549784 / 12 (72.4%)
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.36 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [58.39, 76.11, 72.36]
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 76.11


2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====



Average Metric: 8.33 / 12 (69.4%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 470.46it/s]

2025/03/26 21:24:24 INFO dspy.evaluate.evaluate: Average Metric: 8.328205128205129 / 12 (69.4%)
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.4 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [58.39, 76.11, 72.36, 69.4]
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 76.11


2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====



Average Metric: 8.68 / 12 (72.4%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 4175.17it/s]

2025/03/26 21:24:24 INFO dspy.evaluate.evaluate: Average Metric: 8.683549783549784 / 12 (72.4%)
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.36 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].





2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [58.39, 76.11, 72.36, 69.4, 72.36]
2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 76.11


2025/03/26 21:24:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====


Average Metric: 6.51 / 12 (54.3%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 376.87it/s]

2025/03/26 21:24:25 INFO dspy.evaluate.evaluate: Average Metric: 6.512121212121212 / 12 (54.3%)
2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.27 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [58.39, 76.11, 72.36, 69.4, 72.36, 54.27]
2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 76.11


2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====



Average Metric: 8.98 / 12 (74.9%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 487.45it/s]

2025/03/26 21:24:25 INFO dspy.evaluate.evaluate: Average Metric: 8.983549783549783 / 12 (74.9%)
2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.86 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [58.39, 76.11, 72.36, 69.4, 72.36, 54.27, 74.86]
2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 76.11


2025/03/26 21:24:25 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 76.11!





In [24]:
optimized_rag_pipeline

rag.predict = Predict(StringSignature(context, question -> reasoning, response
    instructions='You are provided with a context and a question. Use the context to derive a detailed reasoning process and generate a coherent response to the question. Consider the specific details and relationships within the context to ensure an accurate and comprehensive answer.'
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    response = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Response:', 'desc': '${response}'})
))

In [25]:
optimized_rag_pipeline.save('optimized_pipeline.json')

In [26]:
run_evaluation(val, metric, optimized_rag_pipeline)

Average Metric: 25.80 / 40 (64.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 262.88it/s]

2025/03/26 21:24:51 INFO dspy.evaluate.evaluate: Average Metric: 25.7952380952381 / 40 (64.5%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,"During high school year, which online security community Jan Koum ...",w00w00,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Not supplied for this particular example.,w00w00,✔️ [1.000]
1,What is the difference between recurrent neural networks and convo...,The state of the art had evolved and RNN is sequential and has ver...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Recurrent Neural Networks (RNNs) and Convolutional Neural Networks...,Recurrent Neural Networks (RNNs) are designed for sequential data ...,✔️ [0.250]
2,What was the patent Heddy developed? Was it publicly available?,"Hedy teams up with her new Hollywood neighbor, a music composer na...","Transcript: (disclaimer: may contain unintentionally confusing, in...",Not supplied for this particular example.,The patent Hedy Lamarr developed was for a frequency hopping techn...,✔️ [0.571]
3,Which was one of the first free email services?,Juno was one of the first free email services.,"Transcript: (disclaimer: may contain unintentionally confusing, in...","The context mentions that within D. E. Shaw, Jeff Bezos and David ...",Juno,✔️ [1.000]
4,Visa is the global sponsor of what?,Visa is the global sponsor of The Olympics,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Not supplied for this particular example.,The Olympics,✔️ [1.000]


64.49

## Can we summarize the transcript to avoid token limit?

In [50]:
df = load_data(truncate_transcripts=False)

In [51]:
class Summarize(dspy.Signature):
    """Summarize the long-form text input."""
    text: str = dspy.InputField()
    summary: str = dspy.OutputField(desc="Summary of the long-form text with all key points")

In [52]:
class SummaryRAG(dspy.Module):
    def __init__(self):
        self.summarize = dspy.ChainOfThought(Summarize)
        self.rag = dspy.ChainOfThought('context, question -> response')

    def forward(self, question, context):
        summary = self.summarize(text=context)
        return self.rag(question=question, context=summary)

In [53]:
summary_rag_pipeline = SummaryRAG()

In [54]:
summary_rag_pipeline(val[14]['question'], val[14]['context'])

Prediction(
    reasoning="The context provided is a summary of a podcast episode discussing Peloton's business strategy and leadership changes. However, it does not specifically mention the original price of Peloton's products or services. To answer the question about Peloton's original price, additional information or research would be needed, as the context does not provide this detail.",
    response="The context does not specify Peloton's original price. Typically, Peloton's original bike was priced around $2,245 when it first launched, but for precise historical pricing, further research or specific sources would be required."
)

In [55]:
run_evaluation(val, metric, summary_rag_pipeline)

Average Metric: 11.43 / 40 (28.6%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [01:24<00:00,  2.12s/it]

2025/03/26 21:56:18 INFO dspy.evaluate.evaluate: Average Metric: 11.434920634920635 / 40 (28.6%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,"During high school year, which online security community Jan Koum ...",w00w00,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The context provided does not explicitly mention the specific onli...,"Jan Koum was part of the online security community ""w00w00"" during...",✔️ [1.000]
1,What is the difference between recurrent neural networks and convo...,The state of the art had evolved and RNN is sequential and has ver...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Recurrent Neural Networks (RNNs) and Convolutional Neural Networks...,Recurrent Neural Networks (RNNs) and Convolutional Neural Networks...,✔️ [0.286]
2,What was the patent Heddy developed? Was it publicly available?,"Hedy teams up with her new Hollywood neighbor, a music composer na...","Transcript: (disclaimer: may contain unintentionally confusing, in...","Hedy Lamarr, along with composer George Antheil, developed a paten...",Hedy Lamarr developed a patent for frequency-hopping spread spectr...,✔️ [0.667]
3,Which was one of the first free email services?,Juno was one of the first free email services.,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The context provided does not directly address the question about ...,"One of the first free email services was Hotmail, which was launch...",
4,Visa is the global sponsor of what?,Visa is the global sponsor of The Olympics,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The context provided does not directly mention Visa's current spon...,Visa is the global sponsor of major events such as the Olympic Gam...,✔️ [0.667]


28.59

### Optimize this pipeline 

In [56]:
optimized_summary_rag_pipeline = optimizer.compile(SummaryRAG(), trainset=train,
                           max_bootstrapped_demos=4, max_labeled_demos=4,
                           requires_permission_to_run=False)

2025/03/26 21:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 3
valset size: 12

2025/03/26 21:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/03/26 21:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/03/26 21:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=3 sets of demonstrations...


Bootstrapping set 1/3
Bootstrapping set 2/3
Bootstrapping set 3/3


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:19<00:00,  4.79s/it]
2025/03/26 21:56:38 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/03/26 21:56:38 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/03/26 21:56:38 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...



Bootstrapped 0 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.


2025/03/26 21:57:30 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/03/26 21:57:30 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Summarize the long-form text input.

2025/03/26 21:57:30 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Imagine you are a journalist tasked with providing a concise and accurate summary of a complex news article for the evening bulletin. Your goal is to distill the long-form text provided into a brief yet comprehensive summary that captures all the critical points and insights, ensuring that the audience receives a clear understanding of the topic. Approach this task with meticulous attention to detail and logical reasoning, as your summary will be the key source of information for viewers who rely on your expertise to convey the essence of the story.

2025/03/26 21:57:30 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are an expert summarizer tasked with transforming long-form texts into concise summaries. Your goal is to ext

Average Metric: 5.13 / 12 (42.8%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 3737.13it/s]

2025/03/26 21:57:30 INFO dspy.evaluate.evaluate: Average Metric: 5.133333333333334 / 12 (42.8%)
2025/03/26 21:57:30 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 42.78

2025/03/26 21:57:30 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====



Average Metric: 5.57 / 12 (46.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:13<00:00,  1.14s/it]

2025/03/26 21:57:43 INFO dspy.evaluate.evaluate: Average Metric: 5.566666666666666 / 12 (46.4%)
2025/03/26 21:57:43 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 46.39
2025/03/26 21:57:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 46.39 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 2'].
2025/03/26 21:57:43 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [42.78, 46.39]
2025/03/26 21:57:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 46.39


2025/03/26 21:57:43 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====



Average Metric: 7.05 / 12 (58.7%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:15<00:00,  1.28s/it]

2025/03/26 21:57:59 INFO dspy.evaluate.evaluate: Average Metric: 7.0499061913696055 / 12 (58.7%)
2025/03/26 21:57:59 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 58.75
2025/03/26 21:57:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.75 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 21:57:59 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [42.78, 46.39, 58.75]
2025/03/26 21:57:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.75


2025/03/26 21:57:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====



Average Metric: 6.90 / 12 (57.5%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:18<00:00,  1.52s/it]

2025/03/26 21:58:17 INFO dspy.evaluate.evaluate: Average Metric: 6.9 / 12 (57.5%)
2025/03/26 21:58:17 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.5 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/03/26 21:58:17 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [42.78, 46.39, 58.75, 57.5]
2025/03/26 21:58:17 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.75


2025/03/26 21:58:17 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====



Average Metric: 7.07 / 12 (58.9%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:10<00:00,  1.10it/s]

2025/03/26 21:58:28 INFO dspy.evaluate.evaluate: Average Metric: 7.071428571428571 / 12 (58.9%)
2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 58.93
2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.93 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [42.78, 46.39, 58.75, 57.5, 58.93]
2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.93


2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====



Average Metric: 7.07 / 12 (58.9%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 4678.53it/s]

2025/03/26 21:58:28 INFO dspy.evaluate.evaluate: Average Metric: 7.071428571428571 / 12 (58.9%)
2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.93 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [42.78, 46.39, 58.75, 57.5, 58.93, 58.93]
2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 58.93


2025/03/26 21:58:28 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====



Average Metric: 7.11 / 12 (59.2%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:07<00:00,  1.52it/s]

2025/03/26 21:58:36 INFO dspy.evaluate.evaluate: Average Metric: 7.108093237294917 / 12 (59.2%)
2025/03/26 21:58:36 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 59.23
2025/03/26 21:58:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.23 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 1'].
2025/03/26 21:58:36 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [42.78, 46.39, 58.75, 57.5, 58.93, 58.93, 59.23]
2025/03/26 21:58:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 59.23


2025/03/26 21:58:36 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 59.23!





In [57]:
run_evaluation(val, metric, optimized_summary_rag_pipeline)

Average Metric: 19.54 / 40 (48.8%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [01:09<00:00,  1.73s/it]

2025/03/26 21:59:45 INFO dspy.evaluate.evaluate: Average Metric: 19.53649758148874 / 40 (48.8%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,"During high school year, which online security community Jan Koum ...",w00w00,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The context provided is a summary of a podcast episode discussing ...,w00w00,✔️ [1.000]
1,What is the difference between recurrent neural networks and convo...,The state of the art had evolved and RNN is sequential and has ver...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Recurrent Neural Networks (RNNs) and Convolutional Neural Networks...,Recurrent Neural Networks (RNNs) are designed for sequential data ...,✔️ [0.286]
2,What was the patent Heddy developed? Was it publicly available?,"Hedy teams up with her new Hollywood neighbor, a music composer na...","Transcript: (disclaimer: may contain unintentionally confusing, in...","The patent developed by Hedy Lamarr, along with George Antheil, wa...",The patent developed by Hedy Lamarr was for frequency-hopping spre...,✔️ [0.444]
3,Which was one of the first free email services?,Juno was one of the first free email services.,"Transcript: (disclaimer: may contain unintentionally confusing, in...","The context mentions that within D. E. Shaw, Jeff Bezos and David ...",Juno,✔️ [1.000]
4,Visa is the global sponsor of what?,Visa is the global sponsor of The Olympics,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The context provided is a summary of a podcast episode discussing ...,Olympic Games and FIFA World Cup,✔️ [0.667]


48.84

## eval on test

In [27]:
run_evaluation(test, metric, rag)

Average Metric: 16.71 / 40 (41.8%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 291.75it/s]

2025/03/26 21:25:08 INFO dspy.evaluate.evaluate: Average Metric: 16.710980063451217 / 40 (41.8%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,What was a key strength of Hiroshi Yamauchi? This is something tha...,Hiroshi Yamauchi was very good at spotting games that consumers wo...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Hiroshi Yamauchi demonstrated a remarkable ability to identify and...,A key strength of Hiroshi Yamauchi was his exceptional ability to ...,✔️ [0.400]
1,How many hot dogs does Costco currently sell per year?,130 million,"Transcript: (disclaimer: may contain unintentionally confusing, in...","In the provided context, it is mentioned that Costco sells 130 mil...",Costco currently sells 130 million hot dogs per year.,✔️ [1.000]
2,What historic event took investigative pressure off of Enron befor...,September 11th,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Enron's financial practices and potential fraudulent activities we...,"The September 11, 2001 terrorist attacks took investigative pressu...",✔️ [0.500]
3,How is Jim Weber's strategic focus influence the revitalization of...,His leadership is pivotal in transforming Brooks Running from a st...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Jim Weber's strategic focus played a crucial role in revitalizing ...,Jim Weber's strategic focus on performance running was instrumenta...,✔️ [0.545]
4,Why were patents developed by Heddy had 88 frequency hops ?,There were 88 frequency hops in their technical description of the...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The patents developed by Hedy Lamarr and George Antheil for freque...,The patents developed by Hedy Lamarr and George Antheil had 88 fre...,✔️ [0.667]


41.78

In [28]:
run_evaluation(test, metric, optimized_rag_pipeline)

Average Metric: 19.79 / 40 (49.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 216.66it/s]

2025/03/26 21:25:20 INFO dspy.evaluate.evaluate: Average Metric: 19.786147186147186 / 40 (49.5%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,What was a key strength of Hiroshi Yamauchi? This is something tha...,Hiroshi Yamauchi was very good at spotting games that consumers wo...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Not supplied for this particular example.,Hiroshi Yamauchi had an incredible eye for identifying fun and suc...,✔️ [0.500]
1,How many hot dogs does Costco currently sell per year?,130 million,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The context provides information about Costco's operations and som...,130 million,✔️ [1.000]
2,What historic event took investigative pressure off of Enron befor...,September 11th,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Not supplied for this particular example.,The September 11 attacks,✔️ [1.000]
3,How is Jim Weber's strategic focus influence the revitalization of...,His leadership is pivotal in transforming Brooks Running from a st...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Jim Weber's strategic focus on revitalizing Brooks Running involve...,Jim Weber's strategic focus on serving active runners exclusively ...,✔️ [0.364]
4,Why were patents developed by Heddy had 88 frequency hops ?,There were 88 frequency hops in their technical description of the...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The patents developed by Hedy Lamarr and George Antheil for freque...,The patents had 88 frequency hops because they were based on the 8...,✔️ [0.333]


49.47

In [395]:
evaluate(optimized_rag_pipeline)

Average Metric: 19.79 / 40 (49.5%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [01:01<00:00,  1.53s/it]

2025/03/25 21:06:11 INFO dspy.evaluate.evaluate: Average Metric: 19.786147186147186 / 40 (49.5%)





Unnamed: 0,question,example_response,context,reasoning,pred_response,SemanticF1
0,What was a key strength of Hiroshi Yamauchi? This is something tha...,Hiroshi Yamauchi was very good at spotting games that consumers wo...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Not supplied for this particular example.,Hiroshi Yamauchi had an incredible eye for identifying fun and suc...,✔️ [0.500]
1,How many hot dogs does Costco currently sell per year?,130 million,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The context provides information about Costco's operations and som...,130 million,✔️ [1.000]
2,What historic event took investigative pressure off of Enron befor...,September 11th,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Not supplied for this particular example.,The September 11 attacks,✔️ [1.000]
3,How is Jim Weber's strategic focus influence the revitalization of...,His leadership is pivotal in transforming Brooks Running from a st...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",Jim Weber's strategic focus on revitalizing Brooks Running involve...,Jim Weber's strategic focus on serving active runners exclusively ...,✔️ [0.364]
4,Why were patents developed by Heddy had 88 frequency hops ?,There were 88 frequency hops in their technical description of the...,"Transcript: (disclaimer: may contain unintentionally confusing, in...",The patents developed by Hedy Lamarr and George Antheil for freque...,The patents had 88 frequency hops because they were based on the 8...,✔️ [0.333]


49.47

## 