In [1]:
import sqlite3
import pandas as pd
import dspy
import dotenv, os
from pydantic import BaseModel, Field
import openai
import pm4py
import ast
from numpy import random
from dspy.evaluate import Evaluate
from collections import defaultdict
import tqdm as notebook_tqdm
import copy
import re
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
import phoenix as px
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from chroma_retriever import Chroma
from chromadb.utils import embedding_functions

In [3]:
dotenv.load_dotenv()
lm = dspy.LM('openai/gpt-4o', temperature=0.3, max_tokens=4096, stop=None, cache=False)
dspy.settings.configure(lm = lm, trace = [])

In [4]:
phoenix_session = px.launch_app()
endpoint = "http://127.0.0.1:6006/v1/traces"
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))

trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


# Creating the Judge class

In [5]:
class scoring(BaseModel):
    score: str
class Assess(dspy.Signature):
    """Assess how closely the Answer matches the prediction in relation to the question. Differences in formating are less important than the actual content. Plausible methods to arrive at the answer are not considered. Only consider the Answer and the Prediction + the Question, nothing else is relevant to you."""
    question = dspy.InputField()
    solution = dspy.InputField()
    prediction = dspy.InputField()
    reasoning = dspy.OutputField(desc="Reasoning behind the score")
    answer = dspy.OutputField(desc="0 means its absolutely wrong, 1 means that the prediction answers parts of the question but not all of it, 2 means its an exact match in terms of content")

class LM_EVAL(dspy.Module):
    def __init__(self, gpt4T):
        super().__init__()
        self.gpt4T = gpt4T
        self.reasoning = defaultdict(list)
        self.scorer = dspy.Predict(Assess)
        self.hist = []
    def forward(self, question,example, prediction, trace= None):
        question = question
        example = example
        pred = prediction
        with dspy.context(lm=self.gpt4T):        
            pred = self.scorer(question=question, solution=example, prediction=pred)
        self.reasoning[question].append(pred.reasoning)                
        try:
            numbers = re.findall(r'\d+', pred.answer)
        except:
            pass
        # Check if we found any numbers and take the last one
        if numbers:
            last_number = numbers[-1]    
            # Convert the last found number to an integer and check if it's in the valid range
            last_number_int = int(last_number)
            if last_number_int in {0, 1, 2}:
                pred.answer = str(last_number_int)# this is used for the compiling of the judge
                #print(f"From LM_EVAL: {pred.score}, type: {type(pred.score)}, after assigning value to it")
                if trace is None:
                    self.hist.append(pred.answer)
                    return pred 
                else:
                    print("trace is being used")
                    boolean = pred.answer == 2
                    pred.answer = boolean
                    self.hist.append(pred.answer)
                    return pred
            else:
                if trace is None:
                    return 0
                else:
                    return False
        else:
            if trace is None:
                self.hist.append(0)
                return 0
            else:
                self.hist.append(False)
                return False
            #print(f"From LM_EVAL: {pred.answer},type: {type(pred.answer)} did not take a single number as output")
    def get_reasoning(self):
        return self.reasoning
    def get_history(self):
        return self.hist

# Loading Data

In [6]:
data = pd.read_csv("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/benchmark/Judge_labels_big.csv")
print(data.head())


                                            question         example_answer  \
0                    How many events are in the log?                 561470   
1                When is the start of the event log?  1/1/2000 / 2000-01-01   
2                  When is the end of the event log?  6/18/2013/ 2013-06-18   
3  How many cases have sent an appeal to the Pref...                   4141   
4                    How many event types are there?                     11   

                                         pred_answer  score  
0                                             561470      2  
1  The start of the event log is January 1, 2000,...      2  
2  The end of the event log is on 2013-06-18 00:0...      2  
3                                               4188      0  
4                                                 11      2  


In [7]:
len(data)

103

In [8]:
judge_dataset = []

for question, example, prediction, answer in data.values:
    
    
    judge_dataset.append(dspy.Example(question = question, example = example, prediction = prediction, answer = str(answer)).with_inputs("question", "example", "prediction"))

print(judge_dataset[26:32])


[Example({'question': 'What is the precentage of cases that where credit collected?', 'example': '0.3924 or 39.24%', 'prediction': 'The percentage of cases where credit was collected is approximately 39.25%.', 'answer': '2'}) (input_keys={'question', 'prediction', 'example'}), Example({'question': 'What is the percentage of cases in which a penalty has been added?', 'example': '0.531 or 53.1%', 'prediction': 'The percentage of cases in which a penalty has been added is approximately 53.11%.', 'answer': '2'}) (input_keys={'question', 'prediction', 'example'}), Example({'question': 'How many cases have been added a penalty?', 'example': '79860', 'prediction': '79860', 'answer': '2'}) (input_keys={'question', 'prediction', 'example'}), Example({'question': 'How many cases have been added no penalties?', 'example': '70510', 'prediction': '70510', 'answer': '2'}) (input_keys={'question', 'prediction', 'example'}), Example({'question': 'Which case has the highest maxtotalPaymentAmount?', 'ex

In [9]:
# split into train and test with 80% train and 20% test
random.shuffle(judge_dataset)
train_data = judge_dataset[:int(len(judge_dataset)*0.8)]
test_data = judge_dataset[int(len(judge_dataset)*0.8):]

In [10]:
gpt = dspy.LM(model='openai/gpt-4o', temperature=0.3, max_tokens=3000, stop=None, cache=False)
judge = LM_EVAL(gpt)

In [11]:
score_accuracy = dspy.evaluate.metrics.answer_exact_match
evaluate = Evaluate(devset=test_data, metric=score_accuracy, num_threads=1, display_progress=True, display_table=len(test_data), return_outputs=True)

In [12]:
score, outputs, scores = evaluate(program = judge, return_all_scores= True)

I0000 00:00:1734686120.937047 9984122 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


Average Metric: 20.00 / 21 (95.2%): 100%|██████████| 21/21 [00:33<00:00,  1.58s/it]

2024/12/20 10:15:54 INFO dspy.evaluate.evaluate: Average Metric: 20 / 21 (95.2%)





Unnamed: 0,question,example,prediction,example_answer,reasoning,pred_answer,answer_exact_match
0,How many times do each of the vehicleClasses occur?,A 146712\nC 2456\nM 1198\nR 4,None: 411100\nA: 146712\nC: 2456\nM: 1198\nR: 4,2,"The prediction includes an additional category ""None: 411100"" whic...",2,✔️ [True]
1,Which 10 cases have the highest amount last?,The top 9 cases with the highest amount last are: C18395: 800000 C...,The 10 cases with the highest amount last are: 1. C15814 - 800000 ...,2,"The prediction lists the top 10 cases with the highest amounts, an...",1,
2,What is the percentage of cases which have paid nothing?,0.5363 or 53.63%,The percentage of cases which have paid nothing is approximately 5...,2,The prediction states that the percentage of cases which have paid...,2,✔️ [True]
3,What is the percentage of cases which have appealed to the judge?,0.369 or 36.9%,The percentage of cases which have appealed to the judge is approx...,2,The solution states that the percentage of cases which have appeal...,2,✔️ [True]
4,How many cases have been dismissed by the prefecture?,1980,1980,2,The prediction matches the solution exactly in terms of content. B...,2,✔️ [True]
5,How many dismissal types are there?,26,26,2,The prediction matches the solution exactly in terms of content. B...,2,✔️ [True]
6,How many cases have been added a penalty?,79860,79860,2,The prediction matches the solution exactly in terms of numerical ...,2,✔️ [True]
7,What is the lowest amount last found in the event log?,0,The lowest amount last found in the event log is 0.,2,"The prediction states ""The lowest amount last found in the event l...",2,✔️ [True]
8,What is the average maxtotalPaymentAmount per case?,2396.577176,The average maxtotalPaymentAmount per case is approximately 2396.58.,2,The prediction states that the average maxtotalPaymentAmount per c...,2,✔️ [True]
9,How many cases have credit_collected == True AND time_timestamp_en...,9383,9383,2,The prediction matches the solution exactly in terms of content. B...,2,✔️ [True]


# Testing MIPROv2

In [13]:
from dspy.teleprompt import MIPROv2

In [18]:
teleprompter = MIPROv2(
    metric=score_accuracy,
    auto="medium", # Can choose between light, medium, and heavy optimization runs
)

In [19]:
zeroshot_optimized_program = teleprompter.compile(
    judge.deepcopy(),
    trainset=train_data,
    max_bootstrapped_demos=8, # ZERO FEW-SHOT EXAMPLES
    max_labeled_demos=0, # ZERO FEW-SHOT EXAMPLES
    requires_permission_to_run=False,
)

2024/12/20 10:25:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: True
num_candidates: 19
valset size: 65

2024/12/20 10:25:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2024/12/20 10:25:43 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2024/12/20 10:25:43 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...


Bootstrapping set 1/19
Bootstrapping set 2/19


 18%|█▊        | 3/17 [00:02<00:12,  1.08it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 3/19


 53%|█████▎    | 9/17 [00:10<00:09,  1.14s/it]


Bootstrapped 8 full traces after 9 examples for up to 1 rounds, amounting to 9 attempts.
Bootstrapping set 4/19


 41%|████      | 7/17 [00:07<00:11,  1.14s/it]


Bootstrapped 6 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 5/19


  6%|▌         | 1/17 [00:01<00:25,  1.60s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/19


 47%|████▋     | 8/17 [00:10<00:11,  1.31s/it]


Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 7/19


 18%|█▊        | 3/17 [00:02<00:13,  1.07it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 8/19


  6%|▌         | 1/17 [00:00<00:14,  1.10it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 9/19


 47%|████▋     | 8/17 [00:09<00:10,  1.14s/it]


Bootstrapped 7 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 10/19


  6%|▌         | 1/17 [00:00<00:14,  1.11it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 11/19


  6%|▌         | 1/17 [00:00<00:14,  1.10it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 12/19


 59%|█████▉    | 10/17 [00:11<00:08,  1.15s/it]


Bootstrapped 8 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.
Bootstrapping set 13/19


 24%|██▎       | 4/17 [00:06<00:20,  1.61s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 14/19


 12%|█▏        | 2/17 [00:01<00:14,  1.03it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 15/19


 24%|██▎       | 4/17 [00:05<00:18,  1.43s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 16/19


  6%|▌         | 1/17 [00:01<00:16,  1.01s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 17/19


  6%|▌         | 1/17 [00:01<00:19,  1.24s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 18/19


 29%|██▉       | 5/17 [00:06<00:16,  1.37s/it]


Bootstrapped 5 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 19/19


 35%|███▌      | 6/17 [00:10<00:18,  1.72s/it]
2024/12/20 10:27:16 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2024/12/20 10:27:16 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 5 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.


2024/12/20 10:27:24 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2024/12/20 10:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2024/12/20 10:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Assess how closely the Answer matches the prediction in relation to the question. Differences in formating are less important than the actual content. Plausible methods to arrive at the answer are not considered. Only consider the Answer and the Prediction + the Question, nothing else is relevant to you.

2024/12/20 10:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Evaluate the degree of accuracy between the provided prediction and the correct solution for a given question. Focus solely on the content of the prediction and the solution, disregarding any formatting differences or methodologies used to derive them. Your task is to determine if the prediction is completely incorrect (score 0), partially correct (score 1), or an exact ma

Average Metric: 63.00 / 65 (96.9%): 100%|██████████| 65/65 [00:15<00:00,  4.31it/s] 

2024/12/20 10:29:54 INFO dspy.evaluate.evaluate: Average Metric: 63 / 65 (96.9%)
2024/12/20 10:29:54 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 96.92

2024/12/20 10:29:54 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2024/12/20 10:29:54 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2024/12/20 10:29:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:06<00:00,  4.07it/s] 

2024/12/20 10:30:01 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].
2024/12/20 10:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0]
2024/12/20 10:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:01 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:04<00:00,  5.01it/s] 

2024/12/20 10:30:06 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:30:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].
2024/12/20 10:30:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0]
2024/12/20 10:30:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==



Average Metric: 25.00 / 25 (100.0%): 100%|██████████| 25/25 [00:06<00:00,  3.60it/s]

2024/12/20 10:30:13 INFO dspy.evaluate.evaluate: Average Metric: 25 / 25 (100.0%)
2024/12/20 10:30:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].
2024/12/20 10:30:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0]
2024/12/20 10:30:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:05<00:00,  4.36it/s] 

2024/12/20 10:30:18 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:30:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].
2024/12/20 10:30:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0]
2024/12/20 10:30:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==



Average Metric: 22.00 / 25 (88.0%): 100%|██████████| 25/25 [00:06<00:00,  3.75it/s]

2024/12/20 10:30:25 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)
2024/12/20 10:30:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].
2024/12/20 10:30:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0]
2024/12/20 10:30:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==



Average Metric: 25.00 / 25 (100.0%): 100%|██████████| 25/25 [00:05<00:00,  4.70it/s]

2024/12/20 10:30:31 INFO dspy.evaluate.evaluate: Average Metric: 25 / 25 (100.0%)
2024/12/20 10:30:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].
2024/12/20 10:30:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0]
2024/12/20 10:30:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:05<00:00,  4.35it/s] 

2024/12/20 10:30:36 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:30:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].
2024/12/20 10:30:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0]
2024/12/20 10:30:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:07<00:00,  3.57it/s] 

2024/12/20 10:30:43 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:30:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].
2024/12/20 10:30:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0]
2024/12/20 10:30:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|██████████| 25/25 [00:06<00:00,  4.00it/s] 

2024/12/20 10:30:50 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2024/12/20 10:30:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].
2024/12/20 10:30:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0]
2024/12/20 10:30:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:05<00:00,  4.26it/s] 

2024/12/20 10:30:56 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:30:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 1'].
2024/12/20 10:30:56 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0]
2024/12/20 10:30:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92]
2024/12/20 10:30:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:30:56 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====
2024/12/20 10:30:56 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 100.0) from minibatch trials...



Average Metric: 60.00 / 65 (92.3%): 100%|██████████| 65/65 [00:15<00:00,  4.16it/s] 

2024/12/20 10:31:11 INFO dspy.evaluate.evaluate: Average Metric: 60 / 65 (92.3%)
2024/12/20 10:31:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:31:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92
2024/12/20 10:31:11 INFO dspy.teleprompt.mipro_optimizer_v2: 

2024/12/20 10:31:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|██████████| 25/25 [00:09<00:00,  2.61it/s] 

2024/12/20 10:31:21 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2024/12/20 10:31:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].
2024/12/20 10:31:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0]
2024/12/20 10:31:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:31:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:31:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:05<00:00,  4.26it/s] 

2024/12/20 10:31:27 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:31:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 14'].
2024/12/20 10:31:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0]
2024/12/20 10:31:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:31:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:31:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|██████████| 25/25 [00:06<00:00,  4.06it/s]

2024/12/20 10:31:33 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2024/12/20 10:31:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].
2024/12/20 10:31:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0]
2024/12/20 10:31:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:31:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:31:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==



Average Metric: 25.00 / 25 (100.0%): 100%|██████████| 25/25 [00:07<00:00,  3.13it/s]

2024/12/20 10:31:41 INFO dspy.evaluate.evaluate: Average Metric: 25 / 25 (100.0%)
2024/12/20 10:31:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 10'].
2024/12/20 10:31:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0]
2024/12/20 10:31:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:31:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:31:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|██████████| 25/25 [00:06<00:00,  3.96it/s] 

2024/12/20 10:31:48 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2024/12/20 10:31:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2024/12/20 10:31:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0]
2024/12/20 10:31:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:31:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:31:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:06<00:00,  4.13it/s] 

2024/12/20 10:31:54 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 3'].
2024/12/20 10:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0]
2024/12/20 10:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:06<00:00,  3.92it/s] 

2024/12/20 10:32:00 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:32:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].
2024/12/20 10:32:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0]
2024/12/20 10:32:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:32:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==



Average Metric: 25.00 / 25 (100.0%): 100%|██████████| 25/25 [00:04<00:00,  5.20it/s]

2024/12/20 10:32:05 INFO dspy.evaluate.evaluate: Average Metric: 25 / 25 (100.0%)
2024/12/20 10:32:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9'].
2024/12/20 10:32:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0]
2024/12/20 10:32:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:32:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:05<00:00,  4.58it/s]

2024/12/20 10:32:11 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2024/12/20 10:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0, 96.0]
2024/12/20 10:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:06<00:00,  4.09it/s]

2024/12/20 10:32:17 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:32:17 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 8'].
2024/12/20 10:32:17 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0, 96.0, 96.0]
2024/12/20 10:32:17 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31]
2024/12/20 10:32:17 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:17 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====
2024/12/20 10:32:17 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 100.0) from minibatch trials...



Average Metric: 62.00 / 65 (95.4%): 100%|██████████| 65/65 [00:15<00:00,  4.30it/s] 

2024/12/20 10:32:32 INFO dspy.evaluate.evaluate: Average Metric: 62 / 65 (95.4%)
2024/12/20 10:32:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31, 95.38]
2024/12/20 10:32:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92
2024/12/20 10:32:32 INFO dspy.teleprompt.mipro_optimizer_v2: 

2024/12/20 10:32:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:06<00:00,  4.04it/s] 

2024/12/20 10:32:38 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 2'].
2024/12/20 10:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0, 96.0, 96.0, 96.0]
2024/12/20 10:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31, 95.38]
2024/12/20 10:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==



Average Metric: 24.00 / 25 (96.0%): 100%|██████████| 25/25 [00:05<00:00,  4.79it/s] 

2024/12/20 10:32:44 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)
2024/12/20 10:32:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 17'].
2024/12/20 10:32:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0, 96.0, 96.0, 96.0, 96.0]
2024/12/20 10:32:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31, 95.38]
2024/12/20 10:32:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==



Average Metric: 22.00 / 25 (88.0%): 100%|██████████| 25/25 [00:08<00:00,  3.04it/s]

2024/12/20 10:32:52 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)
2024/12/20 10:32:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 10'].
2024/12/20 10:32:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0, 96.0, 96.0, 96.0, 96.0, 88.0]
2024/12/20 10:32:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31, 95.38]
2024/12/20 10:32:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|██████████| 25/25 [00:05<00:00,  4.99it/s]

2024/12/20 10:32:57 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2024/12/20 10:32:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 10'].
2024/12/20 10:32:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0, 96.0, 96.0, 96.0, 96.0, 88.0, 92.0]
2024/12/20 10:32:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31, 95.38]
2024/12/20 10:32:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:32:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|██████████| 25/25 [00:09<00:00,  2.70it/s] 

2024/12/20 10:33:06 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2024/12/20 10:33:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 11'].
2024/12/20 10:33:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 96.0, 100.0, 96.0, 88.0, 100.0, 96.0, 96.0, 92.0, 96.0, 92.0, 96.0, 92.0, 100.0, 92.0, 96.0, 96.0, 100.0, 96.0, 96.0, 96.0, 96.0, 88.0, 92.0, 92.0]
2024/12/20 10:33:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31, 95.38]
2024/12/20 10:33:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92


2024/12/20 10:33:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====
2024/12/20 10:33:06 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 100.0) from minibatch trials...



Average Metric: 61.00 / 65 (93.8%): 100%|██████████| 65/65 [00:13<00:00,  4.94it/s]

2024/12/20 10:33:20 INFO dspy.evaluate.evaluate: Average Metric: 61 / 65 (93.8%)
2024/12/20 10:33:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [96.92, 92.31, 95.38, 93.85]
2024/12/20 10:33:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 96.92
2024/12/20 10:33:20 INFO dspy.teleprompt.mipro_optimizer_v2: 

2024/12/20 10:33:20 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 96.92!





In [22]:
zeroshot_optimized_program.save("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/judge_miprov2_final_meta.json", save_field_meta=True)

TypeError: BaseModule.save() got an unexpected keyword argument 'save_field_meta'

In [26]:
judge.load("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/judge_miprov2_2.json")

In [67]:
judge2.load("/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/judge_miprov2_2.json")

In [27]:
score, outputs, scores = evaluate(program = zeroshot_optimized_program, return_all_scores= True)

Average Metric: 19.00 / 21 (90.5%): 100%|██████████| 21/21 [00:32<00:00,  1.53s/it]

2024/12/20 10:45:51 INFO dspy.evaluate.evaluate: Average Metric: 19 / 21 (90.5%)





Unnamed: 0,question,example,prediction,example_answer,reasoning,pred_answer,answer_exact_match
0,How many times do each of the vehicleClasses occur?,A 146712\nC 2456\nM 1198\nR 4,None: 411100\nA: 146712\nC: 2456\nM: 1198\nR: 4,2,The question asks for the count of occurrences for each of the veh...,1,
1,Which 10 cases have the highest amount last?,The top 9 cases with the highest amount last are: C18395: 800000 C...,The 10 cases with the highest amount last are: 1. C15814 - 800000 ...,2,"The prediction lists 10 cases with their respective amounts, and i...",1,
2,What is the percentage of cases which have paid nothing?,0.5363 or 53.63%,The percentage of cases which have paid nothing is approximately 5...,2,The prediction states that the percentage of cases which have paid...,2,✔️ [True]
3,What is the percentage of cases which have appealed to the judge?,0.369 or 36.9%,The percentage of cases which have appealed to the judge is approx...,2,The prediction states that the percentage of cases which have appe...,2,✔️ [True]
4,How many cases have been dismissed by the prefecture?,1980,1980,2,"The prediction matches the solution exactly, both stating that 198...",2,✔️ [True]
5,How many dismissal types are there?,26,26,2,The prediction matches the solution exactly in terms of content. B...,2,✔️ [True]
6,How many cases have been added a penalty?,79860,79860,2,The prediction matches the solution exactly in terms of content. B...,2,✔️ [True]
7,What is the lowest amount last found in the event log?,0,The lowest amount last found in the event log is 0.,2,The prediction states that the lowest amount last found in the eve...,2,✔️ [True]
8,What is the average maxtotalPaymentAmount per case?,2396.577176,The average maxtotalPaymentAmount per case is approximately 2396.58.,2,The prediction states that the average maxtotalPaymentAmount per c...,2,✔️ [True]
9,How many cases have credit_collected == True AND time_timestamp_en...,9383,9383,2,The prediction matches the solution exactly in terms of the numeri...,2,✔️ [True]


## Normal bootstrapped viewshot optimization

In [69]:
from dspy.teleprompt import BootstrapFewShot

fewshot_optimizer = BootstrapFewShot(metric=score_accuracy, max_bootstrapped_demos=8, max_labeled_demos=2, max_rounds=2, max_errors=5)

your_dspy_program_compiled = fewshot_optimizer.compile(student = judge, trainset=train_data)

 29%|██▊       | 8/28 [00:17<00:44,  2.22s/it]

Bootstrapped 8 full traces after 8 examples for up to 2 rounds, amounting to 8 attempts.





In [70]:
your_dspy_program_compiled.save(f"/Users/sulzair/Documents/Bachelor Thesis/dspy_v2/Optimized_prompts/judge_bootstrap.json")