# Optimization with DSPY

In [10]:
import dspy
from dspy.evaluate import Evaluate
from typing import Literal
import pandas as pd
import numpy as np
import time
import bert_score
import warnings
from transformers import logging
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

### Settings

In [None]:
wait_time = 0 # impostare se ci sono rate limits, altrimenti lasciare 0
debug_mode = False

In [3]:
data_folder = './data/'
result_folder = './results/'
prompt_folder = './prompts/'
optimizers_folder = './optimizers/'

### LLM

In [4]:
model_name = 'mistral'
lm = dspy.LM('mistral/mistral-large-latest', api_key = os.getenv("MISTRAL_API_KEY")) # Default temperature = 0.0
dspy.configure(lm = lm)

In [5]:
# TEST
# model_name = 'chatgpt'
# dspy.LM('openai/gpt-4o-mini', api_key = os.getenv("OPENAI_API_KEY"))
# dspy.configure(lm = lm)

In [6]:
# TEST
#model_name = 'llama'
#lm = dspy.LM('ollama_chat/llama3.2', api_base='http://localhost:11434', api_key='')
#dspy.configure(lm = lm)

In [7]:
# TEST
# model_name = 'qwen'
# lm = dspy.LM(
#     'openai/qwen2.5-72b-instruct-large-latest', 
#     api_key = os.getenv('DASHSCOPE_API_KEY'),
#     api_base = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1'
# )
# dspy.configure(lm = lm)

In [8]:
# TEST
# model_name = 'deepseek'
# lm = dspy.LM(
#     'deepseek/deepseek-chat', 
#     api_key = os.getenv('DEEPSEEK_API_KEY'),
#     api_base = 'https://api.deepseek.com'
# )
# dspy.configure(lm = lm)

In [9]:
# TEST
print(lm("What is 2+2?"))

['The sum of 2 + 2 is 4. Here it is:\n\n2\n+2\n____\n4']


### Prompt ToT

In [11]:
with open(prompt_folder + 'prompt_tot.txt') as f:
    tree_of_thoughts = f.read()
print(tree_of_thoughts)

Follow this procedure:

Imagine three different experts are answering this question.
They will brainstorm the answer step by step reasoning carefully and taking all facts into consideration.
All experts will write down 1 step of their thinking, then share it with the group.
They will each critique their response, and the all the responses of others.
They will check their answer on based on the nature of the language and intent.
Then all experts will go on to the next step and write down this step of their thinking.
They will keep going through steps until they reach their conclusion taking into account the thoughts of the other experts.
If at any time they realise that there uncertainty in their logic they will backtrack to where that uncertainty occurred. 
If any expert realises they're wrong at any point then they acknowledges this and start another train of thought.
Each expert will assign a likelihood of their current assertion being correct.
Continue until the experts agree on the

# Stage 1: High Level Categorization

In [12]:
def optimized_classify_stage_1(optimizer, df_hate, result_file, initialize = False):
    
    if initialize:
        # Regenerate output file
        df = pd.DataFrame(columns = ['post_id', 'class','confidence','explanation'])
        df.to_csv(result_file, encoding = "utf-8", index = False)
    else:
        df_elab = pd.read_csv(result_file, encoding = "utf-8")
        df_hate = df_hate[~df_hate['post_id'].isin(df_elab['post_id'])]

    for idx, row in df_hate.iterrows():
    
        try:            
            time.sleep(wait_time)
            classification = optimizer(post = row['post'])
        
            # Classification stage 1
            new_row = {
                'post_id': row['post_id'], 
                'class': classification.hate_class, 
                'confidence': classification.confidence,
                'explanation': classification.explanation
            }
            df = pd.DataFrame([new_row])
            df.to_csv(result_file, encoding = "utf-8", mode='a', index=False, header=False)
                      
        except:
          print("An exception occurred " +  str(row['post_id']) ) 

### Signature

In [13]:
with open(prompt_folder + 'prompt_stg1.txt') as f:
    prompt_stg1 = f.read()

In [14]:
prompt_stg1 = prompt_stg1.replace("{tree_of_thoughts}", tree_of_thoughts)

In [15]:
print(prompt_stg1)

Your task is to decide whether the following text can be classified an implicit hate speech or not implicit hate speech.

Consider the following definition of hate speech:

Hate speech is content that targets individuals or groups with abuse based on their perceived membership in protected categories, including but not limited to race, ethnicity, national origin, caste, sexual orientation, gender, gender identity, religious affiliation, age, disability, or serious disease. 
Specifically, hate speech can contain:

- Hateful References: Content referencing forms of violence or violent events where a protected category was the primary target, intended to harass (e.g., genocides like the Holocaust, lynchings).
- Incitement: Content inciting fear, spreading fearful stereotypes, or encouraging others to harass or discriminate against members of protected categories (e.g., asserting that members of a religious group are terrorists, urging others to harass individuals wearing religious symbols

In [16]:
class ClassifyHate(dspy.Signature):
    prompt_stg1

    post: str = dspy.InputField()
    hate_class: Literal["implicit_hate", "not_hate"]  = dspy.OutputField()
    confidence: float = dspy.OutputField()
    explanation : str = dspy.OutputField()

classify_hate = dspy.Predict(ClassifyHate)

### Metric

In [17]:
def classify_hate_metric(example, prediction, trace=None):
    time.sleep(wait_time)
    return prediction.hate_class == example.hate_class

### Trainset

In [18]:
df_train = pd.read_csv(data_folder + "implicit_hate_train_stg1.csv", encoding="utf-8")

In [19]:
trainset = []
for index, row in df_train.sample(100).iterrows():
    example = dspy.Example(post = row['post'], hate_class = row['class']).with_inputs("post")
    trainset.append(example)

### Evaluator

In [43]:
evaluator = Evaluate(devset = trainset, num_threads = 1, display_progress = True, display_table = 5)
evaluator(classify_hate, metric = classify_hate_metric)

Average Metric: 56.00 / 100 (56.0%): 100%|███████████████████████████████████████████| 100/100 [01:40<00:00,  1.01s/it]

2025/01/08 12:03:10 INFO dspy.evaluate.evaluate: Average Metric: 56 / 100 (56.0%)





Unnamed: 0,post,example_hate_class,pred_hate_class,confidence,explanation,classify_hate_metric
0,lol ! you guys talk so good . words use to make the fancy statemen...,not_hate,implicit_hate,0.75,"The post contains sarcastic language with phrases like ""lol"" and ""...",
1,a petition stop christian court martial policy-check out this arti...,not_hate,not_hate,0.95,The post is sharing an article and expressing a political viewpoin...,✔️ [True]
2,uk telegraph : racial conflicts will dominate the 21st century via,not_hate,implicit_hate,0.75,The post suggests that racial conflicts will be a significant issu...,
3,what have black people contributed to science ?,implicit_hate,implicit_hate,0.85,The post implies a stereotype or prejudice against black people by...,✔️ [True]
4,white supremacists are glad boehners leaving via,not_hate,implicit_hate,0.85,"The post mentions ""white supremacists"" in a context that suggests ...",


56.0

In [16]:
optimized_classify_hate.save(optimizers_folder + model_name + '_' + 'optimizer_stg1_v1.json')

### Optimization with MIPRO (prompt and examples)

In [45]:
classify_hate_optimizer = dspy.MIPROv2(
    metric = classify_hate_metric, 
    auto = "heavy"
)

optimized_classify_hate = classify_hate_optimizer.compile(
    classify_hate, 
    trainset = trainset, 
    max_labeled_demos = 8, 
    max_bootstrapped_demos = 8
)

2025/01/08 12:06:33 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING HEAVY AUTO RUN SETTINGS:
num_trials: 50
minibatch: True
num_candidates: 38
valset size: 80



[93m[1mProjected Language Model (LM) Calls[0m

Based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Prompt Generation: [94m[1m10[0m[93m data summarizer calls + [94m[1m38[0m[93m * [94m[1m1[0m[93m lm calls in program + ([94m[1m2[0m[93m) lm calls in program-aware proposer = [94m[1m50[0m[93m prompt model calls[0m
[93m- Program Evaluation: [94m[1m25[0m[93m examples in minibatch * [94m[1m50[0m[93m batches + [94m[1m80[0m[93m examples in val set * [94m[1m6[0m[93m full evals = [94m[1m1730[0m[93m LM Program calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of program calls * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Mode

2025/01/08 12:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/08 12:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/08 12:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=38 sets of demonstrations...


Bootstrapping set 1/38
Bootstrapping set 2/38
Bootstrapping set 3/38


 45%|█████████████████████████████████████▎                                             | 9/20 [00:27<00:34,  3.10s/it]


Bootstrapped 8 full traces after 9 examples for up to 1 rounds, amounting to 9 attempts.
Bootstrapping set 4/38


 30%|████████████████████████▉                                                          | 6/20 [00:18<00:42,  3.07s/it]


Bootstrapped 6 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 5/38


 25%|████████████████████▊                                                              | 5/20 [00:27<01:21,  5.46s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 6/38


 10%|████████▎                                                                          | 2/20 [00:06<00:55,  3.07s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 7/38


 20%|████████████████▌                                                                  | 4/20 [00:12<00:49,  3.09s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 8/38


 15%|████████████▍                                                                      | 3/20 [00:08<00:50,  3.00s/it]


Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 9/38


 15%|████████████▍                                                                      | 3/20 [00:08<00:48,  2.86s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 10/38


 45%|█████████████████████████████████████▎                                             | 9/20 [00:28<00:34,  3.13s/it]


Bootstrapped 7 full traces after 9 examples for up to 1 rounds, amounting to 9 attempts.
Bootstrapping set 11/38


  5%|████▏                                                                              | 1/20 [00:03<00:59,  3.12s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 12/38


 35%|█████████████████████████████                                                      | 7/20 [00:20<00:38,  2.93s/it]


Bootstrapped 7 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 13/38


 30%|████████████████████████▉                                                          | 6/20 [00:18<00:43,  3.10s/it]


Bootstrapped 6 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 14/38


  5%|████▏                                                                              | 1/20 [00:02<00:56,  2.97s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 15/38


 30%|████████████████████████▉                                                          | 6/20 [00:17<00:40,  2.88s/it]


Bootstrapped 5 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 16/38


  5%|████▏                                                                              | 1/20 [00:03<00:58,  3.10s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 17/38


 45%|█████████████████████████████████████▎                                             | 9/20 [00:27<00:33,  3.07s/it]


Bootstrapped 8 full traces after 9 examples for up to 1 rounds, amounting to 9 attempts.
Bootstrapping set 18/38


 15%|████████████▍                                                                      | 3/20 [00:20<01:55,  6.78s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 19/38


 40%|█████████████████████████████████▏                                                 | 8/20 [00:24<00:37,  3.11s/it]


Bootstrapped 5 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 20/38


 35%|█████████████████████████████                                                      | 7/20 [00:22<00:41,  3.18s/it]


Bootstrapped 6 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 21/38


 10%|████████▎                                                                          | 2/20 [00:05<00:53,  2.99s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 22/38


 10%|████████▎                                                                          | 2/20 [00:05<00:49,  2.76s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 23/38


 10%|████████▎                                                                          | 2/20 [00:06<00:55,  3.10s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 24/38


 40%|█████████████████████████████████▏                                                 | 8/20 [00:27<00:40,  3.40s/it]


Bootstrapped 6 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 25/38


 10%|████████▎                                                                          | 2/20 [00:11<01:39,  5.54s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 26/38


 10%|████████▎                                                                          | 2/20 [00:06<00:55,  3.09s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 27/38


  5%|████▏                                                                              | 1/20 [00:02<00:56,  2.99s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 28/38


 20%|████████████████▌                                                                  | 4/20 [00:11<00:46,  2.91s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 29/38


 25%|████████████████████▊                                                              | 5/20 [00:24<01:13,  4.89s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 30/38


 35%|█████████████████████████████                                                      | 7/20 [00:20<00:38,  2.98s/it]


Bootstrapped 5 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 31/38


 10%|████████▎                                                                          | 2/20 [00:07<01:09,  3.86s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 32/38


 20%|████████████████▌                                                                  | 4/20 [00:14<00:56,  3.50s/it]


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 33/38


 50%|█████████████████████████████████████████                                         | 10/20 [00:33<00:33,  3.37s/it]


Bootstrapped 8 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.
Bootstrapping set 34/38


 15%|████████████▍                                                                      | 3/20 [00:09<00:52,  3.10s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 35/38


 50%|█████████████████████████████████████████                                         | 10/20 [00:32<00:32,  3.21s/it]


Bootstrapped 8 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.
Bootstrapping set 36/38


 30%|████████████████████████▉                                                          | 6/20 [00:21<00:49,  3.57s/it]


Bootstrapped 5 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 37/38


 40%|█████████████████████████████████▏                                                 | 8/20 [00:26<00:39,  3.28s/it]


Bootstrapped 5 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 38/38


 25%|████████████████████▊                                                              | 5/20 [00:14<00:44,  2.97s/it]
2025/01/08 12:16:17 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/08 12:16:17 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.


2025/01/08 12:16:32 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/01/08 12:25:31 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/01/08 12:25:31 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `post`, produce the fields `hate_class`, `confidence`, `explanation`.

2025/01/08 12:25:31 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are a content moderator for a large social media platform, and you are tasked with identifying and classifying hate speech in user posts. Given a `post`, your job is to determine the `hate_class` ('not_hate', 'implicit_hate', or 'explicit_hate'), assign a `confidence` score to your classification, and provide a brief `explanation` for your decision. Remember that misclassifications can lead to severe real-world consequences, including the proliferation of harmful content or the unfair silencing of voices. Accuracy and nuanced understanding are crucial in this high-stakes scenario.

2

Average Metric: 44.00 / 80 (55.0%): 100%|██████████████████████████████████████████████| 80/80 [00:14<00:00,  5.71it/s]

2025/01/08 12:25:45 INFO dspy.evaluate.evaluate: Average Metric: 44 / 80 (55.0%)
2025/01/08 12:25:45 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 55.0

2025/01/08 12:25:45 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2025/01/08 12:25:45 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2025/01/08 12:25:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 50 ==



Average Metric: 19.00 / 25 (76.0%): 100%|██████████████████████████████████████████████| 25/25 [00:19<00:00,  1.25it/s]

2025/01/08 12:26:05 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)
2025/01/08 12:26:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 26', 'Predictor 0: Few-Shot Set 10'].
2025/01/08 12:26:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0]
2025/01/08 12:26:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:26:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:26:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 50 ==



Average Metric: 20.00 / 25 (80.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.21it/s]

2025/01/08 12:26:26 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)
2025/01/08 12:26:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 15'].
2025/01/08 12:26:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0]
2025/01/08 12:26:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:26:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:26:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.20it/s]

2025/01/08 12:26:47 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:26:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 20'].
2025/01/08 12:26:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0]
2025/01/08 12:26:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:26:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:26:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.19it/s]

2025/01/08 12:27:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/08 12:27:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 11'].
2025/01/08 12:27:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0]
2025/01/08 12:27:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:27:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:27:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 50 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████████| 25/25 [00:26<00:00,  1.07s/it]

2025/01/08 12:27:34 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/08 12:27:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 20'].
2025/01/08 12:27:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0]
2025/01/08 12:27:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:27:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:27:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 50 ==



Average Metric: 15.00 / 25 (60.0%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.19it/s]

2025/01/08 12:27:55 INFO dspy.evaluate.evaluate: Average Metric: 15 / 25 (60.0%)
2025/01/08 12:27:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 21', 'Predictor 0: Few-Shot Set 0'].
2025/01/08 12:27:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0]
2025/01/08 12:27:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:27:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:27:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 50 ==



Average Metric: 20.00 / 25 (80.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.21it/s]

2025/01/08 12:28:16 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)
2025/01/08 12:28:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 34', 'Predictor 0: Few-Shot Set 9'].
2025/01/08 12:28:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0]
2025/01/08 12:28:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:28:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:28:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 50 ==



Average Metric: 20.00 / 25 (80.0%): 100%|██████████████████████████████████████████████| 25/25 [00:19<00:00,  1.28it/s]

2025/01/08 12:28:36 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)
2025/01/08 12:28:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 32', 'Predictor 0: Few-Shot Set 3'].
2025/01/08 12:28:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0]
2025/01/08 12:28:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:28:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:28:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 50 ==



Average Metric: 19.00 / 25 (76.0%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.19it/s]

2025/01/08 12:28:57 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)
2025/01/08 12:28:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 21', 'Predictor 0: Few-Shot Set 26'].
2025/01/08 12:28:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0]
2025/01/08 12:28:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:28:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:28:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.01it/s]

2025/01/08 12:29:22 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/08 12:29:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 24'].
2025/01/08 12:29:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0]
2025/01/08 12:29:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0]
2025/01/08 12:29:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/01/08 12:29:22 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====
2025/01/08 12:29:22 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...



Average Metric: 56.00 / 80 (70.0%): 100%|██████████████████████████████████████████████| 80/80 [00:45<00:00,  1.77it/s]

2025/01/08 12:30:07 INFO dspy.evaluate.evaluate: Average Metric: 56 / 80 (70.0%)
2025/01/08 12:30:07 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 70.0
2025/01/08 12:30:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:30:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0
2025/01/08 12:30:07 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/08 12:30:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.23it/s]

2025/01/08 12:30:27 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:30:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 16'].
2025/01/08 12:30:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0]
2025/01/08 12:30:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:30:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:30:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.15it/s]

2025/01/08 12:30:49 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/08 12:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 9'].
2025/01/08 12:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0]
2025/01/08 12:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.07it/s]

2025/01/08 12:31:12 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/08 12:31:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 24', 'Predictor 0: Few-Shot Set 15'].
2025/01/08 12:31:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0]
2025/01/08 12:31:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:31:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:31:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:19<00:00,  1.29it/s]

2025/01/08 12:31:32 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:31:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 27', 'Predictor 0: Few-Shot Set 9'].
2025/01/08 12:31:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0]
2025/01/08 12:31:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:31:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:31:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 50 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.11it/s]

2025/01/08 12:31:54 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/08 12:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 34', 'Predictor 0: Few-Shot Set 30'].
2025/01/08 12:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0]
2025/01/08 12:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:31:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 50 ==



Average Metric: 15.00 / 25 (60.0%): 100%|██████████████████████████████████████████████| 25/25 [00:18<00:00,  1.36it/s]

2025/01/08 12:32:13 INFO dspy.evaluate.evaluate: Average Metric: 15 / 25 (60.0%)
2025/01/08 12:32:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 15'].
2025/01/08 12:32:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0]
2025/01/08 12:32:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:32:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:32:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 50 ==



Average Metric: 20.00 / 25 (80.0%): 100%|██████████████████████████████████████████████| 25/25 [00:26<00:00,  1.05s/it]

2025/01/08 12:32:39 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)
2025/01/08 12:32:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 36'].
2025/01/08 12:32:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0]
2025/01/08 12:32:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:32:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:32:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 50 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████████| 25/25 [00:19<00:00,  1.27it/s]

2025/01/08 12:32:59 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/08 12:32:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 34', 'Predictor 0: Few-Shot Set 7'].
2025/01/08 12:32:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0]
2025/01/08 12:32:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:32:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:32:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 50 ==



Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.15it/s]

2025/01/08 12:33:21 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)
2025/01/08 12:33:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 8'].
2025/01/08 12:33:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0]
2025/01/08 12:33:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:33:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:33:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:19<00:00,  1.29it/s]

2025/01/08 12:33:40 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:33:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 24', 'Predictor 0: Few-Shot Set 8'].
2025/01/08 12:33:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0]
2025/01/08 12:33:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0]
2025/01/08 12:33:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:33:40 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====
2025/01/08 12:33:40 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 84.0) from minibatch trials...



Average Metric: 21.00 / 34 (61.8%):  42%|███████████████████▌                          | 34/80 [00:18<00:26,  1.76it/s]

2025/01/08 12:34:00 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'what ? that is white ? i said #whitelivesmatter too', 'hate_class': 'implicit_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 43.00 / 69 (62.3%):  88%|████████████████████████████████████████▎     | 70/80 [00:42<00:07,  1.34it/s]

2025/01/08 12:34:23 ERROR dspy.utils.parallelizer: Error processing item Example({'post': "do i think mlk was wrong for calling out the founding fathers for being white supremacists in his book ... and saying too many whites are not willing to go the extra mile to end white supremacy ... no ... i don't", 'hate_class': 'not_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 51.00 / 78 (65.4%): 100%|██████████████████████████████████████████████| 80/80 [00:48<00:00,  1.65it/s]

2025/01/08 12:34:29 INFO dspy.evaluate.evaluate: Average Metric: 51.0 / 80 (63.8%)
2025/01/08 12:34:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:34:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0
2025/01/08 12:34:29 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/08 12:34:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:05<00:00,  4.98it/s]

2025/01/08 12:34:34 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 8'].
2025/01/08 12:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0]
2025/01/08 12:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 50 ==



Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████| 25/25 [00:14<00:00,  1.67it/s]

2025/01/08 12:34:49 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)
2025/01/08 12:34:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 34', 'Predictor 0: Few-Shot Set 9'].
2025/01/08 12:34:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0]
2025/01/08 12:34:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:34:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:34:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.04it/s]

2025/01/08 12:35:13 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:35:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 32'].
2025/01/08 12:35:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0]
2025/01/08 12:35:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:35:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:35:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 50 ==



Average Metric: 8.00 / 10 (80.0%):  40%|██████████████████▊                            | 10/25 [00:10<00:12,  1.22it/s]

2025/01/08 12:35:24 ERROR dspy.utils.parallelizer: Error processing item Example({'post': ' that\'s why i hate to see infighting amongst members of the  " alt-lite " "  alt right  far right  etc . they "', 'hate_class': 'not_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 19.00 / 24 (79.2%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.10it/s]

2025/01/08 12:35:36 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 25 (76.0%)
2025/01/08 12:35:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2025/01/08 12:35:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0]
2025/01/08 12:35:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:35:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:35:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.09it/s]

2025/01/08 12:35:59 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 37', 'Predictor 0: Few-Shot Set 27'].
2025/01/08 12:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0]
2025/01/08 12:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 26 / 50 ==



Average Metric: 8.00 / 12 (66.7%):  44%|████████████████████▋                          | 11/25 [00:09<00:13,  1.04it/s]

2025/01/08 12:36:09 ERROR dspy.utils.parallelizer: Error processing item Example({'post': "if the alt right hadn't alienated the oath keepers who had to step up and be the adults ...", 'hate_class': 'not_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 15.00 / 24 (62.5%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.09it/s]

2025/01/08 12:36:21 INFO dspy.evaluate.evaluate: Average Metric: 15.0 / 25 (60.0%)
2025/01/08 12:36:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 29', 'Predictor 0: Few-Shot Set 22'].
2025/01/08 12:36:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0]
2025/01/08 12:36:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:36:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:36:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 27 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:04<00:00,  5.02it/s]

2025/01/08 12:36:27 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/08 12:36:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 15'].
2025/01/08 12:36:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0]
2025/01/08 12:36:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:36:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:36:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 28 / 50 ==



Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.04it/s]

2025/01/08 12:36:51 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)
2025/01/08 12:36:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 30', 'Predictor 0: Few-Shot Set 23'].
2025/01/08 12:36:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0]
2025/01/08 12:36:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:36:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:36:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 29 / 50 ==



Average Metric: 8.00 / 10 (80.0%):  40%|██████████████████▊                            | 10/25 [00:09<00:14,  1.05it/s]

2025/01/08 12:37:02 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'mr . those who fell into the hands of islamic invaders had only two choices - either convert or get killed .', 'hate_class': 'implicit_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 17.00 / 24 (70.8%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.11it/s]

2025/01/08 12:37:13 INFO dspy.evaluate.evaluate: Average Metric: 17.0 / 25 (68.0%)
2025/01/08 12:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 30', 'Predictor 0: Few-Shot Set 12'].
2025/01/08 12:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0]
2025/01/08 12:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 30 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.08it/s]

2025/01/08 12:37:36 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:37:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 31', 'Predictor 0: Few-Shot Set 14'].
2025/01/08 12:37:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0]
2025/01/08 12:37:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75]
2025/01/08 12:37:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/01/08 12:37:36 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====
2025/01/08 12:37:36 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 84.0) from minibatch trials...



Average Metric: 59.00 / 80 (73.8%): 100%|██████████████████████████████████████████████| 80/80 [00:48<00:00,  1.65it/s]

2025/01/08 12:38:25 INFO dspy.evaluate.evaluate: Average Metric: 59 / 80 (73.8%)
2025/01/08 12:38:25 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 73.75
2025/01/08 12:38:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:38:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75
2025/01/08 12:38:25 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/08 12:38:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 31 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.06it/s]

2025/01/08 12:38:49 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:38:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 20', 'Predictor 0: Few-Shot Set 18'].
2025/01/08 12:38:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0]
2025/01/08 12:38:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:38:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:38:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 32 / 50 ==



Average Metric: 13.00 / 25 (52.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.13it/s]

2025/01/08 12:39:11 INFO dspy.evaluate.evaluate: Average Metric: 13 / 25 (52.0%)
2025/01/08 12:39:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 52.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 21'].
2025/01/08 12:39:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0]
2025/01/08 12:39:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:39:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:39:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 33 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.06it/s]

2025/01/08 12:39:35 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 33', 'Predictor 0: Few-Shot Set 23'].
2025/01/08 12:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0]
2025/01/08 12:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 34 / 50 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.24it/s]

2025/01/08 12:39:55 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/08 12:39:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 30', 'Predictor 0: Few-Shot Set 8'].
2025/01/08 12:39:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0]
2025/01/08 12:39:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:39:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:39:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 35 / 50 ==



Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.11it/s]

2025/01/08 12:40:18 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)
2025/01/08 12:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 25'].
2025/01/08 12:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0]
2025/01/08 12:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:40:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 36 / 50 ==



Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.02s/it]

2025/01/08 12:40:43 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)
2025/01/08 12:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 17'].
2025/01/08 12:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0]
2025/01/08 12:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 37 / 50 ==



Average Metric: 19.00 / 25 (76.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.10it/s]

2025/01/08 12:41:06 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)
2025/01/08 12:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 25'].
2025/01/08 12:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0]
2025/01/08 12:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 38 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:19<00:00,  1.26it/s]

2025/01/08 12:41:26 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 20'].
2025/01/08 12:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0]
2025/01/08 12:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 39 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.16it/s]

2025/01/08 12:41:48 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:41:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 23', 'Predictor 0: Few-Shot Set 28'].
2025/01/08 12:41:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0]
2025/01/08 12:41:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:41:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:41:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 40 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.21it/s]

2025/01/08 12:42:09 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/08 12:42:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 35', 'Predictor 0: Few-Shot Set 31'].
2025/01/08 12:42:09 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0]
2025/01/08 12:42:09 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75]
2025/01/08 12:42:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:42:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 4 =====
2025/01/08 12:42:09 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 84.0


Average Metric: 57.00 / 80 (71.2%): 100%|██████████████████████████████████████████████| 80/80 [00:49<00:00,  1.63it/s]

2025/01/08 12:42:58 INFO dspy.evaluate.evaluate: Average Metric: 57 / 80 (71.2%)
2025/01/08 12:42:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:42:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75
2025/01/08 12:42:58 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/08 12:42:58 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 41 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.03it/s]

2025/01/08 12:43:22 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/08 12:43:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 28', 'Predictor 0: Few-Shot Set 5'].
2025/01/08 12:43:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0]
2025/01/08 12:43:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:43:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:43:22 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 42 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:20<00:00,  1.21it/s]

2025/01/08 12:43:43 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/08 12:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 22', 'Predictor 0: Few-Shot Set 33'].
2025/01/08 12:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0]
2025/01/08 12:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 43 / 50 ==



Average Metric: 20.00 / 25 (80.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.01s/it]

2025/01/08 12:44:08 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)
2025/01/08 12:44:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 6'].
2025/01/08 12:44:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0]
2025/01/08 12:44:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:44:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:44:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 44 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:15<00:00,  1.66it/s]

2025/01/08 12:44:23 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/08 12:44:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 17'].
2025/01/08 12:44:23 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0, 56.0]
2025/01/08 12:44:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:44:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:44:23 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 45 / 50 ==



Average Metric: 9.00 / 17 (52.9%):  68%|███████████████████████████████▉               | 17/25 [00:12<00:04,  1.92it/s]

2025/01/08 12:44:36 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'bbc fires award-winning white male comedian for being white and male', 'hate_class': 'implicit_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 15.00 / 24 (62.5%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.19it/s]

2025/01/08 12:44:44 INFO dspy.evaluate.evaluate: Average Metric: 15.0 / 25 (60.0%)
2025/01/08 12:44:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 19'].
2025/01/08 12:44:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0, 56.0, 60.0]
2025/01/08 12:44:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:44:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:44:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 46 / 50 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████████| 25/25 [00:05<00:00,  4.99it/s]

2025/01/08 12:44:49 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/08 12:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 25'].
2025/01/08 12:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0, 56.0, 60.0, 72.0]
2025/01/08 12:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 47 / 50 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████████| 25/25 [00:05<00:00,  4.98it/s]

2025/01/08 12:44:54 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/08 12:44:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 30', 'Predictor 0: Few-Shot Set 23'].
2025/01/08 12:44:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0, 56.0, 60.0, 72.0, 68.0]
2025/01/08 12:44:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:44:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:44:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 48 / 50 ==



Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████| 25/25 [00:19<00:00,  1.26it/s]

2025/01/08 12:45:14 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)
2025/01/08 12:45:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 36', 'Predictor 0: Few-Shot Set 3'].
2025/01/08 12:45:14 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0, 56.0, 60.0, 72.0, 68.0, 84.0]
2025/01/08 12:45:14 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:45:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:45:14 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 49 / 50 ==



Average Metric: 11.00 / 14 (78.6%):  56%|█████████████████████████▊                    | 14/25 [00:13<00:10,  1.03it/s]

2025/01/08 12:45:28 ERROR dspy.utils.parallelizer: Error processing item Example({'post': "racist and supremacist when it's convenient for you.fuck you otherwise  can't help", 'hate_class': 'not_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 16.00 / 24 (66.7%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.13it/s]

2025/01/08 12:45:37 INFO dspy.evaluate.evaluate: Average Metric: 16.0 / 25 (64.0%)
2025/01/08 12:45:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 34'].
2025/01/08 12:45:37 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0, 56.0, 60.0, 72.0, 68.0, 84.0, 64.0]
2025/01/08 12:45:37 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:45:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:45:37 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 50 / 50 ==



Average Metric: 10.00 / 18 (55.6%):  72%|█████████████████████████████████             | 18/25 [00:15<00:05,  1.19it/s]

2025/01/08 12:45:52 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'didnt blue lives matter just a second ago ? oh thats right  cops kill white people for no reason too sometimes . youd think you just found this out ? or are you all just bigots ?', 'hate_class': 'implicit_hate'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 15.00 / 24 (62.5%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.14it/s]

2025/01/08 12:45:59 INFO dspy.evaluate.evaluate: Average Metric: 15.0 / 25 (60.0%)
2025/01/08 12:45:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 37'].
2025/01/08 12:45:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [76.0, 80.0, 68.0, 56.0, 72.0, 60.0, 80.0, 80.0, 76.0, 64.0, 68.0, 64.0, 48.0, 68.0, 72.0, 60.0, 80.0, 72.0, 84.0, 68.0, 68.0, 84.0, 68.0, 76.0, 68.0, 60.0, 56.0, 84.0, 68.0, 68.0, 68.0, 52.0, 68.0, 72.0, 84.0, 84.0, 76.0, 68.0, 68.0, 64.0, 64.0, 64.0, 80.0, 56.0, 60.0, 72.0, 68.0, 84.0, 64.0, 60.0]
2025/01/08 12:45:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25]
2025/01/08 12:45:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75


2025/01/08 12:45:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 5 =====
2025/01/08 12:45:59 INFO dspy.teleprompt.mipro_optimiz


Average Metric: 57.00 / 80 (71.2%): 100%|██████████████████████████████████████████████| 80/80 [00:51<00:00,  1.57it/s]

2025/01/08 12:46:50 INFO dspy.evaluate.evaluate: Average Metric: 57 / 80 (71.2%)
2025/01/08 12:46:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.0, 70.0, 63.75, 73.75, 71.25, 71.25]
2025/01/08 12:46:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.75
2025/01/08 12:46:50 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/08 12:46:50 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 73.75!





In [18]:
optimized_classify_hate.save(optimizers_folder + model_name + '_' + 'optimizer_stg1.json')

### Run optimized classifier

In [21]:
optimizer_file_name = optimizers_folder + model_name + '_' + 'optimizer_stg1.json'
result_file = result_folder + model_name + "_" + "result_optimizer_stg1.csv"
df_hate = pd.read_csv(data_folder + "implicit_hate_test_stg1.csv", encoding="utf-8")

In [22]:
if debug_mode:
    df_hate = df_hate.sample(2)

In [23]:
optimized_classify_hate = classify_hate
optimized_classify_hate.load(optimizer_file_name)
optimized_classify_stage_1(optimized_classify_hate, df_hate, result_file, True)

# Stage 2: Fine-Grained Implicit Hate

In [24]:
def optimized_classify_stage_2(optimizer, df_hate, result_file, initialize = False):
    
    if initialize:
        # Regenerate output file
        df = pd.DataFrame(columns = ['post_id', 'implicit_class','confidence','explanation'])
        df.to_csv(result_file, encoding = "utf-8", index = False)
    else:
        df_elab = pd.read_csv(result_file, encoding = "utf-8")
        df_hate = df_hate[~df_hate['post_id'].isin(df_elab['post_id'])]

    for idx, row in df_hate.iterrows():
    
        try:            
            time.sleep(wait_time)
            classification = optimizer(post = row['post'])
        
            # Classification stage 2
            new_row = {
                'post_id': row['post_id'], 
                'implicit_class': classification.implicit_class, 
                'confidence': classification.confidence,
                'explanation': classification.explanation
            }
            df = pd.DataFrame([new_row])
            df.to_csv(result_file, encoding = "utf-8", mode='a', index=False, header=False)
                      
        except:
          print("An exception occurred " +  str(row['post_id']) ) 

### Signature

In [25]:
with open(prompt_folder + 'prompt_stg2.txt') as f:
    prompt_stg2 = f.read()

In [26]:
prompt_tot_stg2 = prompt_stg2.replace("{tree_of_thoughts}", tree_of_thoughts)

In [27]:
print(prompt_tot_stg2)

The following text has been categorized as "implicit hate speech", a more complex and subtle attack on a person or a group of people based on their protected characteristics without relying on explicit keywords.

Evaluate the text against the following taxonomy to determine the most likely class.
Class taxonomy of implicit hate speech:

1. White Grievance: includes frustration over a minority group's perceived privilege and casting majority groups as the real victims of racism. This language is linked to extremist behavior and support for violence. An example is 'Black lives matter and white lives don't? Sounds racist.'
2. Incitement to Violence: includes flaunting ingroup unity and power or elevating known hate groups and ideologies. Phrases like 'white brotherhood' operate in the former manner, while statements like 'Hitler was Germany â€“ Germans shall rise again!' operate in the latter, elevating nationalism and Nazism. Article 20 of the UN International Covenant on Civil and Polit

In [28]:
class ClassifyImplicitHate(dspy.Signature):
    prompt_tot_stg2

    post: str = dspy.InputField()
    implicit_class: Literal['white_grievance', 'irony', 'stereotypical', 'incitement', 'other', 'threatening', 'inferiority', 'other'] = dspy.OutputField()
    confidence: float = dspy.OutputField()
    explanation : str = dspy.OutputField()

classify_implicit_hate = dspy.Predict(ClassifyImplicitHate)

### Metric

In [29]:
def classify_implicit_hate_metric(example, prediction, trace=None):
    time.sleep(wait_time)
    return prediction.implicit_class == example.implicit_class

### Trainset

In [30]:
df_train_stg2 = pd.read_csv(data_folder + "implicit_hate_train_stg2.csv", encoding="utf-8")

In [31]:
len(df_train_stg2)

1000

In [32]:
df_train_stg2['implicit_class'].unique()

array(['inferiority', 'incitement', 'irony', 'white_grievance',
       'threatening', 'stereotypical', 'other'], dtype=object)

In [33]:
# Select n samples for each class
n = 50
df_train = pd.DataFrame()
for implicit_class in df_train_stg2['implicit_class'].unique():
    df_class = df_train_stg2[df_train_stg2['implicit_class'] == implicit_class]
    df_class = df_class.sample(np.min((n,len(df_class))))
    df_train = pd.concat([df_train, df_class])

In [34]:
trainset = []
for index, row in df_train.iterrows():
    example = dspy.Example(post = row['post'], implicit_class = row['implicit_class']).with_inputs("post")
    trainset.append(example)

### Evaluator

In [41]:
evaluator = Evaluate(devset = trainset, num_threads = 1, display_progress = True, display_table = 5)
evaluator(classify_implicit_hate, metric = classify_implicit_hate_metric)

Average Metric: 105.00 / 313 (33.5%): 100%|██████████████████████████████████████████| 313/313 [05:14<00:00,  1.00s/it]

2025/01/08 17:33:25 INFO dspy.evaluate.evaluate: Average Metric: 105 / 313 (33.5%)





Unnamed: 0,post,example_implicit_class,pred_implicit_class,confidence,explanation,classify_implicit_hate_metric
0,manages to maintain it's culture. guess it's well out the way of t...,inferiority,stereotypical,0.75,The post suggests that a certain culture is maintained because it ...,
1,libertarian solution : more third world immigration,inferiority,irony,0.85,The post appears to be using irony to highlight a contrast between...,
2,why do colored people eagerly run to white countries to be oppress...,inferiority,white_grievance,0.95,"The post suggests a grievance from a white perspective, implying t...",
3,white power black parasite. blacks have contributed nothing to hum...,inferiority,incitement,0.95,The post contains explicit racist language and derogatory remarks ...,
4,i have seen this woman on youtube she is fantastic and so knowledg...,inferiority,stereotypical,0.85,The post stereotypes Muslims by referring to their presence in the...,


33.55

### Optimization with MIPRO (prompt and examples)

In [15]:
print('Samples in trainset: {}'.format(len(trainset)))

classify_implicit_hate_optimizer = dspy.MIPROv2(
    metric = classify_implicit_hate_metric, 
    auto = "heavy",
    num_threads = 2
)

optimized_classify_implicit_hate = classify_implicit_hate_optimizer.compile(
    classify_implicit_hate, 
    trainset = trainset, 
    max_labeled_demos = 20, 
    max_bootstrapped_demos = 20
)

2025/01/10 16:14:50 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING HEAVY AUTO RUN SETTINGS:
num_trials: 50
minibatch: True
num_candidates: 38
valset size: 250



[93m[1mProjected Language Model (LM) Calls[0m

Based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Prompt Generation: [94m[1m10[0m[93m data summarizer calls + [94m[1m38[0m[93m * [94m[1m1[0m[93m lm calls in program + ([94m[1m2[0m[93m) lm calls in program-aware proposer = [94m[1m50[0m[93m prompt model calls[0m
[93m- Program Evaluation: [94m[1m25[0m[93m examples in minibatch * [94m[1m50[0m[93m batches + [94m[1m250[0m[93m examples in val set * [94m[1m6[0m[93m full evals = [94m[1m2750[0m[93m LM Program calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of program calls * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Mod

2025/01/10 16:14:54 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/10 16:14:54 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/10 16:14:54 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=38 sets of demonstrations...


Bootstrapping set 1/38
Bootstrapping set 2/38
Bootstrapping set 3/38


 52%|██████████████████████████████████████████▉                                       | 33/63 [04:23<03:59,  7.97s/it]


Bootstrapped 20 full traces after 33 examples for up to 1 rounds, amounting to 33 attempts.
Bootstrapping set 4/38


 17%|██████████████▎                                                                   | 11/63 [01:24<06:38,  7.65s/it]


Bootstrapped 4 full traces after 11 examples for up to 1 rounds, amounting to 11 attempts.
Bootstrapping set 5/38


 44%|████████████████████████████████████▍                                             | 28/63 [03:40<04:35,  7.88s/it]


Bootstrapped 14 full traces after 28 examples for up to 1 rounds, amounting to 28 attempts.
Bootstrapping set 6/38


 11%|█████████▏                                                                         | 7/63 [01:26<11:32, 12.37s/it]


Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 7/38


 43%|███████████████████████████████████▏                                              | 27/63 [02:55<03:54,  6.50s/it]


Bootstrapped 15 full traces after 27 examples for up to 1 rounds, amounting to 27 attempts.
Bootstrapping set 8/38


 25%|████████████████████▊                                                             | 16/63 [02:18<06:45,  8.63s/it]


Bootstrapped 8 full traces after 16 examples for up to 1 rounds, amounting to 16 attempts.
Bootstrapping set 9/38


 37%|█████████████████████████████▉                                                    | 23/63 [03:29<06:04,  9.12s/it]


Bootstrapped 11 full traces after 23 examples for up to 1 rounds, amounting to 23 attempts.
Bootstrapping set 10/38


 48%|███████████████████████████████████████                                           | 30/63 [04:32<04:59,  9.09s/it]


Bootstrapped 14 full traces after 30 examples for up to 1 rounds, amounting to 30 attempts.
Bootstrapping set 11/38


 60%|█████████████████████████████████████████████████▍                                | 38/63 [03:44<02:27,  5.90s/it]


Bootstrapped 14 full traces after 38 examples for up to 1 rounds, amounting to 38 attempts.
Bootstrapping set 12/38


 29%|███████████████████████▍                                                          | 18/63 [01:17<03:14,  4.32s/it]


Bootstrapped 13 full traces after 18 examples for up to 1 rounds, amounting to 18 attempts.
Bootstrapping set 13/38


 43%|███████████████████████████████████▏                                              | 27/63 [03:04<04:06,  6.83s/it]


Bootstrapped 13 full traces after 27 examples for up to 1 rounds, amounting to 27 attempts.
Bootstrapping set 14/38


 16%|█████████████                                                                     | 10/63 [01:38<08:42,  9.86s/it]


Bootstrapped 5 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.
Bootstrapping set 15/38


 14%|███████████▊                                                                       | 9/63 [01:30<09:04, 10.09s/it]


Bootstrapped 4 full traces after 9 examples for up to 1 rounds, amounting to 9 attempts.
Bootstrapping set 16/38


 40%|████████████████████████████████▌                                                 | 25/63 [04:23<06:40, 10.53s/it]


Bootstrapped 11 full traces after 25 examples for up to 1 rounds, amounting to 25 attempts.
Bootstrapping set 17/38


 33%|███████████████████████████▎                                                      | 21/63 [02:15<04:30,  6.45s/it]


Bootstrapped 13 full traces after 21 examples for up to 1 rounds, amounting to 21 attempts.
Bootstrapping set 18/38


 44%|████████████████████████████████████▍                                             | 28/63 [02:53<03:36,  6.18s/it]


Bootstrapped 18 full traces after 28 examples for up to 1 rounds, amounting to 28 attempts.
Bootstrapping set 19/38


 27%|██████████████████████▏                                                           | 17/63 [01:42<04:38,  6.05s/it]


Bootstrapped 6 full traces after 17 examples for up to 1 rounds, amounting to 17 attempts.
Bootstrapping set 20/38


  6%|█████▎                                                                             | 4/63 [01:01<15:00, 15.26s/it]


Bootstrapped 1 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 21/38


 17%|██████████████▎                                                                   | 11/63 [01:28<06:56,  8.01s/it]


Bootstrapped 5 full traces after 11 examples for up to 1 rounds, amounting to 11 attempts.
Bootstrapping set 22/38


 25%|████████████████████▊                                                             | 16/63 [01:53<05:32,  7.07s/it]


Bootstrapped 8 full traces after 16 examples for up to 1 rounds, amounting to 16 attempts.
Bootstrapping set 23/38


 54%|████████████████████████████████████████████▎                                     | 34/63 [05:41<04:51, 10.05s/it]


Bootstrapped 18 full traces after 34 examples for up to 1 rounds, amounting to 34 attempts.
Bootstrapping set 24/38


 40%|████████████████████████████████▌                                                 | 25/63 [02:55<04:26,  7.01s/it]


Bootstrapped 13 full traces after 25 examples for up to 1 rounds, amounting to 25 attempts.
Bootstrapping set 25/38


 19%|███████████████▌                                                                  | 12/63 [01:04<04:32,  5.34s/it]


Bootstrapped 7 full traces after 12 examples for up to 1 rounds, amounting to 12 attempts.
Bootstrapping set 26/38


 29%|███████████████████████▍                                                          | 18/63 [01:31<03:47,  5.07s/it]


Bootstrapped 13 full traces after 18 examples for up to 1 rounds, amounting to 18 attempts.
Bootstrapping set 27/38


 29%|███████████████████████▍                                                          | 18/63 [02:14<05:37,  7.49s/it]


Bootstrapped 10 full traces after 18 examples for up to 1 rounds, amounting to 18 attempts.
Bootstrapping set 28/38


 33%|███████████████████████████▎                                                      | 21/63 [02:02<04:05,  5.85s/it]


Bootstrapped 13 full traces after 21 examples for up to 1 rounds, amounting to 21 attempts.
Bootstrapping set 29/38


  2%|█▎                                                                                 | 1/63 [00:12<12:47, 12.37s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 30/38


 14%|███████████▊                                                                       | 9/63 [01:19<07:59,  8.87s/it]


Bootstrapped 5 full traces after 9 examples for up to 1 rounds, amounting to 9 attempts.
Bootstrapping set 31/38


 48%|███████████████████████████████████████                                           | 30/63 [04:29<04:56,  8.98s/it]


Bootstrapped 19 full traces after 30 examples for up to 1 rounds, amounting to 30 attempts.
Bootstrapping set 32/38


 21%|████████████████▉                                                                 | 13/63 [01:26<05:32,  6.66s/it]


Bootstrapped 7 full traces after 13 examples for up to 1 rounds, amounting to 13 attempts.
Bootstrapping set 33/38


 60%|█████████████████████████████████████████████████▍                                | 38/63 [05:59<03:56,  9.47s/it]


Bootstrapped 19 full traces after 38 examples for up to 1 rounds, amounting to 38 attempts.
Bootstrapping set 34/38


 37%|█████████████████████████████▉                                                    | 23/63 [03:17<05:44,  8.61s/it]


Bootstrapped 9 full traces after 23 examples for up to 1 rounds, amounting to 23 attempts.
Bootstrapping set 35/38


 49%|████████████████████████████████████████▎                                         | 31/63 [03:46<03:53,  7.29s/it]


Bootstrapped 13 full traces after 31 examples for up to 1 rounds, amounting to 31 attempts.
Bootstrapping set 36/38


 40%|████████████████████████████████▌                                                 | 25/63 [02:40<04:04,  6.43s/it]


Bootstrapped 19 full traces after 25 examples for up to 1 rounds, amounting to 25 attempts.
Bootstrapping set 37/38


 63%|████████████████████████████████████████████████████                              | 40/63 [05:08<02:57,  7.71s/it]


Bootstrapped 15 full traces after 40 examples for up to 1 rounds, amounting to 40 attempts.
Bootstrapping set 38/38


  8%|██████▌                                                                            | 5/63 [00:20<03:59,  4.12s/it]
2025/01/10 17:50:09 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/10 17:50:09 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.


2025/01/10 17:51:21 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/01/10 18:03:17 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/01/10 18:03:17 INFO dspy.teleprompt.mipro_optimizer_v2: 0: The following text has been categorized as "implicit hate speech", a more complex and subtle attack on a person or a group of people based on their protected characteristics without relying on explicit keywords.

Evaluate the text against the following taxonomy to determine the most likely class.
Class taxonomy of implicit hate speech:

1. White Grievance: includes frustration over a minority group's perceived privilege and casting majority groups as the real victims of racism. This language is linked to extremist behavior and support for violence. An example is 'Black lives matter and white lives don't? Sounds racist.'
2. Incitement to Violence: includes flaunting ingroup unity and power or elevating known hate groups and ideologies. P

Average Metric: 127.00 / 250 (50.8%): 100%|██████████████████████████████████████████| 250/250 [03:58<00:00,  1.05it/s]

2025/01/10 18:07:15 INFO dspy.evaluate.evaluate: Average Metric: 127 / 250 (50.8%)
2025/01/10 18:07:15 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 50.8

2025/01/10 18:07:15 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2025/01/10 18:07:15 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2025/01/10 18:07:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.02it/s]

2025/01/10 18:07:40 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:07:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 26', 'Predictor 0: Few-Shot Set 10'].
2025/01/10 18:07:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0]
2025/01/10 18:07:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:07:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:07:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 50 ==



Average Metric: 15.00 / 25 (60.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.07it/s]

2025/01/10 18:08:03 INFO dspy.evaluate.evaluate: Average Metric: 15 / 25 (60.0%)
2025/01/10 18:08:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 15'].
2025/01/10 18:08:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0]
2025/01/10 18:08:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:08:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:08:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 50 ==



Average Metric: 8.00 / 25 (32.0%): 100%|███████████████████████████████████████████████| 25/25 [00:25<00:00,  1.01s/it]

2025/01/10 18:08:29 INFO dspy.evaluate.evaluate: Average Metric: 8 / 25 (32.0%)
2025/01/10 18:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 32.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 20'].
2025/01/10 18:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0]
2025/01/10 18:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 50 ==



Average Metric: 9.00 / 25 (36.0%): 100%|███████████████████████████████████████████████| 25/25 [00:25<00:00,  1.01s/it]

2025/01/10 18:08:54 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)
2025/01/10 18:08:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 11'].
2025/01/10 18:08:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0]
2025/01/10 18:08:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:08:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:08:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.02s/it]

2025/01/10 18:09:20 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/10 18:09:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 20'].
2025/01/10 18:09:20 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0]
2025/01/10 18:09:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:09:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:09:20 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.08it/s]

2025/01/10 18:09:43 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/10 18:09:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 21', 'Predictor 0: Few-Shot Set 0'].
2025/01/10 18:09:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0]
2025/01/10 18:09:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:09:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:09:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.00it/s]

2025/01/10 18:10:08 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/10 18:10:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 34', 'Predictor 0: Few-Shot Set 9'].
2025/01/10 18:10:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0]
2025/01/10 18:10:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:10:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:10:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:26<00:00,  1.04s/it]

2025/01/10 18:10:34 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/10 18:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 32', 'Predictor 0: Few-Shot Set 3'].
2025/01/10 18:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0]
2025/01/10 18:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 50 ==



Average Metric: 7.00 / 25 (28.0%): 100%|███████████████████████████████████████████████| 25/25 [00:26<00:00,  1.06s/it]

2025/01/10 18:11:01 INFO dspy.evaluate.evaluate: Average Metric: 7 / 25 (28.0%)
2025/01/10 18:11:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 21', 'Predictor 0: Few-Shot Set 26'].
2025/01/10 18:11:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0]
2025/01/10 18:11:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:11:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:11:01 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 50 ==



Average Metric: 4.00 / 8 (50.0%):  32%|███████████████▋                                 | 8/25 [00:10<00:17,  1.02s/it]

2025/01/10 18:11:12 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'does not make you a racist . makes you a realist .', 'implicit_class': 'white_grievance'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 12.00 / 24 (50.0%): 100%|██████████████████████████████████████████████| 25/25 [00:26<00:00,  1.04s/it]

2025/01/10 18:11:27 INFO dspy.evaluate.evaluate: Average Metric: 12.0 / 25 (48.0%)
2025/01/10 18:11:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 24'].
2025/01/10 18:11:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0]
2025/01/10 18:11:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8]
2025/01/10 18:11:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:11:27 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====
2025/01/10 18:11:27 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 60.0) from minibatch trials...



Average Metric: 123.00 / 250 (49.2%): 100%|██████████████████████████████████████████| 250/250 [03:27<00:00,  1.20it/s]

2025/01/10 18:14:55 INFO dspy.evaluate.evaluate: Average Metric: 123 / 250 (49.2%)
2025/01/10 18:14:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:14:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8
2025/01/10 18:14:55 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/10 18:14:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 50 ==



Average Metric: 15.00 / 25 (60.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.03s/it]

2025/01/10 18:15:21 INFO dspy.evaluate.evaluate: Average Metric: 15 / 25 (60.0%)
2025/01/10 18:15:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 16'].
2025/01/10 18:15:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0]
2025/01/10 18:15:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:15:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:15:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 50 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.04it/s]

2025/01/10 18:15:45 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/10 18:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 17'].
2025/01/10 18:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0]
2025/01/10 18:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:26<00:00,  1.04s/it]

2025/01/10 18:16:11 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/10 18:16:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 24', 'Predictor 0: Few-Shot Set 15'].
2025/01/10 18:16:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0]
2025/01/10 18:16:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:16:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:16:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 50 ==



Average Metric: 9.00 / 25 (36.0%): 100%|███████████████████████████████████████████████| 25/25 [00:24<00:00,  1.03it/s]

2025/01/10 18:16:35 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)
2025/01/10 18:16:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 24', 'Predictor 0: Few-Shot Set 13'].
2025/01/10 18:16:35 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0]
2025/01/10 18:16:35 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:16:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:16:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.00s/it]

2025/01/10 18:17:00 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/10 18:17:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 17'].
2025/01/10 18:17:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0]
2025/01/10 18:17:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:17:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:17:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.04it/s]

2025/01/10 18:17:25 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/10 18:17:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 17'].
2025/01/10 18:17:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0]
2025/01/10 18:17:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:17:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:17:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 50 ==



Average Metric: 6.00 / 25 (24.0%): 100%|███████████████████████████████████████████████| 25/25 [00:28<00:00,  1.13s/it]

2025/01/10 18:17:53 INFO dspy.evaluate.evaluate: Average Metric: 6 / 25 (24.0%)
2025/01/10 18:17:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 24.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 15'].
2025/01/10 18:17:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0]
2025/01/10 18:17:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:17:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:17:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 50 ==



Average Metric: 10.00 / 25 (40.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.01it/s]

2025/01/10 18:18:18 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)
2025/01/10 18:18:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 33', 'Predictor 0: Few-Shot Set 2'].
2025/01/10 18:18:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0]
2025/01/10 18:18:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:18:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:18:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 50 ==



Average Metric: 15.00 / 25 (60.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.09it/s]

2025/01/10 18:18:40 INFO dspy.evaluate.evaluate: Average Metric: 15 / 25 (60.0%)
2025/01/10 18:18:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 24', 'Predictor 0: Few-Shot Set 15'].
2025/01/10 18:18:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0]
2025/01/10 18:18:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:18:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:18:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.08it/s]

2025/01/10 18:19:04 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/10 18:19:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 17'].
2025/01/10 18:19:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0]
2025/01/10 18:19:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2]
2025/01/10 18:19:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 50.8


2025/01/10 18:19:04 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====
2025/01/10 18:19:04 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 64.0) from minibatch trials...



Average Metric: 132.00 / 250 (52.8%): 100%|██████████████████████████████████████████| 250/250 [03:02<00:00,  1.37it/s]

2025/01/10 18:22:07 INFO dspy.evaluate.evaluate: Average Metric: 132 / 250 (52.8%)
2025/01/10 18:22:07 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 52.8
2025/01/10 18:22:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:22:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8
2025/01/10 18:22:07 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/10 18:22:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.03it/s]

2025/01/10 18:22:31 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/10 18:22:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 37'].
2025/01/10 18:22:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0]
2025/01/10 18:22:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:22:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:22:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.05it/s]

2025/01/10 18:22:55 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:22:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 15'].
2025/01/10 18:22:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0]
2025/01/10 18:22:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:22:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:22:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.03it/s]

2025/01/10 18:23:20 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/10 18:23:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 18'].
2025/01/10 18:23:20 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0]
2025/01/10 18:23:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:23:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:23:20 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.00s/it]

2025/01/10 18:23:45 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:23:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 18'].
2025/01/10 18:23:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0]
2025/01/10 18:23:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:23:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:23:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.02s/it]

2025/01/10 18:24:11 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:24:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 30'].
2025/01/10 18:24:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0]
2025/01/10 18:24:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:24:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:24:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 26 / 50 ==



Average Metric: 10.00 / 25 (40.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.06it/s]

2025/01/10 18:24:34 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)
2025/01/10 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 17', 'Predictor 0: Few-Shot Set 18'].
2025/01/10 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0]
2025/01/10 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 27 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.11it/s]

2025/01/10 18:24:57 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:24:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 34'].
2025/01/10 18:24:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0]
2025/01/10 18:24:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:24:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:24:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 28 / 50 ==



Average Metric: 9.00 / 25 (36.0%): 100%|███████████████████████████████████████████████| 25/25 [00:28<00:00,  1.15s/it]

2025/01/10 18:25:26 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)
2025/01/10 18:25:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 30', 'Predictor 0: Few-Shot Set 23'].
2025/01/10 18:25:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0]
2025/01/10 18:25:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:25:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:25:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 29 / 50 ==



Average Metric: 13.00 / 25 (52.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.05it/s]

2025/01/10 18:25:50 INFO dspy.evaluate.evaluate: Average Metric: 13 / 25 (52.0%)
2025/01/10 18:25:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 52.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 29'].
2025/01/10 18:25:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0]
2025/01/10 18:25:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:25:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:25:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 30 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.02it/s]

2025/01/10 18:26:14 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/10 18:26:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 37', 'Predictor 0: Few-Shot Set 27'].
2025/01/10 18:26:14 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0]
2025/01/10 18:26:14 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8]
2025/01/10 18:26:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:26:14 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====
2025/01/10 18:26:14 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 64.0) from minibatch trials...



Average Metric: 66.00 / 155 (42.6%):  62%|██████████████████████████▋                | 155/250 [02:06<01:26,  1.10it/s]

2025/01/10 18:28:21 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'Everyone is so politically correct these days... You can\'t even say "black paint!" You have to say "tyrone can you paint that wall please?"', 'implicit_class': 'irony'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 117.00 / 249 (47.0%): 100%|██████████████████████████████████████████| 250/250 [03:19<00:00,  1.25it/s]

2025/01/10 18:29:34 INFO dspy.evaluate.evaluate: Average Metric: 117.0 / 250 (46.8%)
2025/01/10 18:29:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:29:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8
2025/01/10 18:29:34 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/10 18:29:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 31 / 50 ==



Average Metric: 12.00 / 25 (48.0%): 100%|██████████████████████████████████████████████| 25/25 [00:27<00:00,  1.09s/it]

2025/01/10 18:30:02 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)
2025/01/10 18:30:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 48.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 14'].
2025/01/10 18:30:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0]
2025/01/10 18:30:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:30:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:30:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 32 / 50 ==



Average Metric: 8.00 / 25 (32.0%): 100%|███████████████████████████████████████████████| 25/25 [00:25<00:00,  1.03s/it]

2025/01/10 18:30:28 INFO dspy.evaluate.evaluate: Average Metric: 8 / 25 (32.0%)
2025/01/10 18:30:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 32.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 21'].
2025/01/10 18:30:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0]
2025/01/10 18:30:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:30:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:30:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 33 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:21<00:00,  1.15it/s]

2025/01/10 18:30:49 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 23', 'Predictor 0: Few-Shot Set 19'].
2025/01/10 18:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0]
2025/01/10 18:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:30:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 34 / 50 ==



Average Metric: 8.00 / 25 (32.0%): 100%|███████████████████████████████████████████████| 25/25 [00:23<00:00,  1.08it/s]

2025/01/10 18:31:13 INFO dspy.evaluate.evaluate: Average Metric: 8 / 25 (32.0%)
2025/01/10 18:31:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 32.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 20', 'Predictor 0: Few-Shot Set 22'].
2025/01/10 18:31:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0]
2025/01/10 18:31:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:31:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:31:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 35 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.01s/it]

2025/01/10 18:31:38 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/10 18:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 25', 'Predictor 0: Few-Shot Set 32'].
2025/01/10 18:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0]
2025/01/10 18:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 36 / 50 ==



Average Metric: 13.00 / 25 (52.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.01s/it]

2025/01/10 18:32:04 INFO dspy.evaluate.evaluate: Average Metric: 13 / 25 (52.0%)
2025/01/10 18:32:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 52.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 12'].
2025/01/10 18:32:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0]
2025/01/10 18:32:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:32:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:32:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 37 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:05<00:00,  4.56it/s]

2025/01/10 18:32:09 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:32:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 18'].
2025/01/10 18:32:09 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0]
2025/01/10 18:32:09 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:32:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:32:09 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 38 / 50 ==



Average Metric: 8.00 / 25 (32.0%): 100%|███████████████████████████████████████████████| 25/25 [00:26<00:00,  1.05s/it]

2025/01/10 18:32:36 INFO dspy.evaluate.evaluate: Average Metric: 8 / 25 (32.0%)
2025/01/10 18:32:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 32.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 8'].
2025/01/10 18:32:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0]
2025/01/10 18:32:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:32:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:32:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 39 / 50 ==



Average Metric: 10.00 / 25 (40.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.02it/s]

2025/01/10 18:33:00 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)
2025/01/10 18:33:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 36', 'Predictor 0: Few-Shot Set 25'].
2025/01/10 18:33:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0]
2025/01/10 18:33:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:33:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:33:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 40 / 50 ==



Average Metric: 13.00 / 25 (52.0%): 100%|██████████████████████████████████████████████| 25/25 [00:26<00:00,  1.04s/it]

2025/01/10 18:33:27 INFO dspy.evaluate.evaluate: Average Metric: 13 / 25 (52.0%)
2025/01/10 18:33:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 52.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 35', 'Predictor 0: Few-Shot Set 31'].
2025/01/10 18:33:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0]
2025/01/10 18:33:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8]
2025/01/10 18:33:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:33:27 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 4 =====
2025/01/10 18:33:27 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 62.0) f


Average Metric: 55.00 / 125 (44.0%):  50%|█████████████████████▌                     | 125/250 [01:39<01:51,  1.12it/s]

2025/01/10 18:35:07 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'the flag of the northwest american republic  the coming white homeland and ethnostate .', 'implicit_class': 'incitement'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 60.00 / 133 (45.1%):  54%|███████████████████████                    | 134/250 [01:49<02:00,  1.04s/it]

2025/01/10 18:35:17 ERROR dspy.utils.parallelizer: Error processing item Example({'post': 'i would think that pro white activity in any state is a boost to all states and especially to white activists .', 'implicit_class': 'incitement'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 65.00 / 140 (46.4%):  57%|████████████████████████▍                  | 142/250 [01:55<01:51,  1.03s/it]

2025/01/10 18:35:22 ERROR dspy.utils.parallelizer: Error processing item Example({'post': "the people hate the gop . don't worry  charlottesville was just the beginning . we're growing extremely fast .", 'implicit_class': 'incitement'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 121.00 / 247 (49.0%): 100%|██████████████████████████████████████████| 250/250 [03:21<00:00,  1.24it/s]

2025/01/10 18:36:48 INFO dspy.evaluate.evaluate: Average Metric: 121.0 / 250 (48.4%)
2025/01/10 18:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8
2025/01/10 18:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/10 18:36:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 41 / 50 ==



Average Metric: 7.00 / 25 (28.0%): 100%|███████████████████████████████████████████████| 25/25 [00:25<00:00,  1.01s/it]

2025/01/10 18:37:13 INFO dspy.evaluate.evaluate: Average Metric: 7 / 25 (28.0%)
2025/01/10 18:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 27', 'Predictor 0: Few-Shot Set 33'].
2025/01/10 18:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0]
2025/01/10 18:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:37:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 42 / 50 ==



Average Metric: 8.00 / 25 (32.0%): 100%|███████████████████████████████████████████████| 25/25 [00:05<00:00,  4.55it/s]

2025/01/10 18:37:19 INFO dspy.evaluate.evaluate: Average Metric: 8 / 25 (32.0%)
2025/01/10 18:37:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 32.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 15'].
2025/01/10 18:37:19 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0]
2025/01/10 18:37:19 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:37:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:37:19 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 43 / 50 ==



Average Metric: 9.00 / 25 (36.0%): 100%|███████████████████████████████████████████████| 25/25 [00:25<00:00,  1.03s/it]

2025/01/10 18:37:45 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)
2025/01/10 18:37:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 31', 'Predictor 0: Few-Shot Set 16'].
2025/01/10 18:37:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0]
2025/01/10 18:37:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:37:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:37:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 44 / 50 ==



Average Metric: 13.00 / 25 (52.0%): 100%|██████████████████████████████████████████████| 25/25 [00:24<00:00,  1.00it/s]

2025/01/10 18:38:10 INFO dspy.evaluate.evaluate: Average Metric: 13 / 25 (52.0%)
2025/01/10 18:38:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 52.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 19', 'Predictor 0: Few-Shot Set 36'].
2025/01/10 18:38:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0, 52.0]
2025/01/10 18:38:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:38:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:38:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 45 / 50 ==



Average Metric: 11.00 / 25 (44.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.08it/s]

2025/01/10 18:38:33 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)
2025/01/10 18:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 44.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 22', 'Predictor 0: Few-Shot Set 28'].
2025/01/10 18:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0, 52.0, 44.0]
2025/01/10 18:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 46 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:25<00:00,  1.02s/it]

2025/01/10 18:38:59 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/10 18:38:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 10'].
2025/01/10 18:38:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0, 52.0, 44.0, 64.0]
2025/01/10 18:38:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:38:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:38:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 47 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.05it/s]

2025/01/10 18:39:23 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/10 18:39:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 9'].
2025/01/10 18:39:23 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0, 52.0, 44.0, 64.0, 56.0]
2025/01/10 18:39:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:39:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:39:23 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 48 / 50 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████████| 25/25 [00:37<00:00,  1.52s/it]

2025/01/10 18:40:01 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/10 18:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 29', 'Predictor 0: Few-Shot Set 20'].
2025/01/10 18:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0, 52.0, 44.0, 64.0, 56.0, 64.0]
2025/01/10 18:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:40:01 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 49 / 50 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████████| 25/25 [00:23<00:00,  1.09it/s]

2025/01/10 18:40:24 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/10 18:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 29', 'Predictor 0: Few-Shot Set 31'].
2025/01/10 18:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0, 52.0, 44.0, 64.0, 56.0, 64.0, 56.0]
2025/01/10 18:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 50 / 50 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████████| 25/25 [00:22<00:00,  1.13it/s]

2025/01/10 18:40:46 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/10 18:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 29', 'Predictor 0: Few-Shot Set 19'].
2025/01/10 18:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [44.0, 60.0, 32.0, 36.0, 56.0, 48.0, 48.0, 48.0, 28.0, 48.0, 60.0, 72.0, 64.0, 36.0, 48.0, 56.0, 24.0, 40.0, 60.0, 56.0, 48.0, 44.0, 64.0, 44.0, 44.0, 40.0, 44.0, 36.0, 52.0, 48.0, 48.0, 32.0, 44.0, 32.0, 56.0, 52.0, 44.0, 32.0, 40.0, 52.0, 28.0, 32.0, 36.0, 52.0, 44.0, 64.0, 56.0, 64.0, 56.0, 72.0]
2025/01/10 18:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4]
2025/01/10 18:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8


2025/01/10 18:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 5 =====
2025/01/10 18:40:46 INFO dspy.teleprompt.mipro_optimizer_v2:


Average Metric: 90.00 / 187 (48.1%):  75%|████████████████████████████████▏          | 187/250 [02:33<01:22,  1.32s/it]

2025/01/10 18:43:21 ERROR dspy.utils.parallelizer: Error processing item Example({'post': "4396 white murderers  5375 black murderers . all the while blacks are 13 % of the country's population .", 'implicit_class': 'stereotypical'}) (input_keys={'post'}): litellm.RateLimitError: RateLimitError: MistralException - Error code: 429 - {'message': 'Requests rate limit exceeded'}. Set `provide_traceback=True` to see the stack trace.


Average Metric: 125.00 / 249 (50.2%): 100%|██████████████████████████████████████████| 250/250 [03:22<00:00,  1.24it/s]

2025/01/10 18:44:09 INFO dspy.evaluate.evaluate: Average Metric: 125.0 / 250 (50.0%)
2025/01/10 18:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [50.8, 49.2, 52.8, 46.8, 48.4, 50.0]
2025/01/10 18:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 52.8
2025/01/10 18:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/10 18:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 52.8!





In [20]:
optimized_classify_implicit_hate.save(optimizers_folder + model_name + '_' + 'optimizer_stg2_v2.json')

### Run optimized classifier

In [36]:
optimizer_file_name = optimizers_folder + model_name + '_' + 'optimizer_stg2.json'
result_file = result_folder + model_name + "_" + "result_optimizer_stg2.csv"
df_hate_stg2 = pd.read_csv(data_folder + "implicit_hate_test_stg2.csv", encoding="utf-8")

In [37]:
if debug_mode:
    df_hate_stg2 = df_hate_stg2.sample(2)

In [38]:
# Reload saved optimizer
optimized_classify_implicit_hate = classify_implicit_hate
optimized_classify_implicit_hate.load(optimizer_file_name)
optimized_classify_stage_2(optimized_classify_implicit_hate, df_hate_stg2, result_file, True)

## Optimization stage 3

In [39]:
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

In [40]:
def optimized_classify_stage_3(optimizer, df_hate, result_file, initialize = False):
    
    if initialize:
        # Regenerate output file
        df = pd.DataFrame(columns = ['post_id', 'target', 'implied_statement'])
        df.to_csv(result_file, encoding = "utf-8", index = False)
    else:
        df_elab = pd.read_csv(result_file, encoding = "utf-8")
        df_hate = df_hate[~df_hate['post_id'].isin(df_elab['post_id'])]

    for idx, row in df_hate.iterrows():
    
        try:            
            time.sleep(wait_time)
            classification = optimizer(post = row['post'])
        
            # Classification stage 3
            new_row = {
                'post_id': row['post_id'], 
                'target': classification.targeted_group, 
                'implied_statement': classification.implied_meaning
            }
            df = pd.DataFrame([new_row])
            df.to_csv(result_file, encoding = "utf-8", mode='a', index=False, header=False)
                      
        except:
          print("An exception occurred " +  str(row['post_id']) ) 

### Signature

In [41]:
with open(prompt_folder + 'prompt_stg3.txt') as f:
    prompt_stg3 = f.read()

In [42]:
prompt_tot_stg3 = prompt_stg3.replace("{tree_of_thoughts}", tree_of_thoughts)

In [43]:
print(prompt_tot_stg3)

The following text has been categorized as "implicit hate speech", a more complex and subtle attack on a person or a group of people based on their protected characteristics without relying on explicit keywords.

Your job is to infer both the targeted group (GROUP) and the stereotype, characteristic, or implication about that group expressed in the text. 
You have two tasks:

Task 1: Identify the targeted group (GROUP).
The group might be defined by characteristics such as ethnicity, religion, class, sexual orientation, immigration status, or similar traits. Examples of groups include:

-Black folks
-Asian folks
-Muslims
-Jews
-Latino/Latina folks
-Immigrants


Task 2: Determine what stereotype, characteristic, or action is implied about the identified group.
Using the group identified in Task 1, describe the implication using a simple phrase. Avoid copying text directly from the text. Examples of such phrases include:

-[GROUP] do/does [action]
-[GROUP] are [characteristic]
-[GROUP] k

In [44]:
class IdentifyImplicitMeaning(dspy.Signature):
    prompt_tot_stg3

    post: str = dspy.InputField()
    targeted_group: str = dspy.OutputField()
    implied_meaning : str = dspy.OutputField()

indentify_implicit_meaning = dspy.Predict(IdentifyImplicitMeaning)

### Metric

In [45]:
def implied_meaning_metric(example, prediction, trace=None):
    time.sleep(1)
    P, R, F1 = bert_score.score([example.implied_meaning], [prediction.implied_meaning], lang='en', verbose=False)
    bert_f1 = F1.mean().item()

    return bert_f1 > 0.90

### Training data

In [46]:
df_train = pd.read_csv(data_folder + "implicit_hate_train_stg3.csv", encoding="utf-8")

In [47]:
# Load the trainset of n samples
n = 100
trainset = []
for index, row in df_train.sample(n).iterrows():
    example = dspy.Example(post = row['post'], targeted_group = row['target'], implied_meaning = row['implied_statement']).with_inputs("post")
    trainset.append(example)

### Evaluator

In [None]:
evaluator = Evaluate(devset = trainset, num_threads = 1, display_progress = True, display_table = 5)
evaluator(indentify_implicit_meaning, metric = implied_meaning_metric)

### Optimization with MIPRO (prompt and examples)

In [None]:
print('Samples in trainset: {}'.format(len(trainset)))

identify_implicit_meaning_optimizer = dspy.MIPROv2(
    metric = implied_meaning_metric, 
    auto = "heavy"
)

optimized_identify_implicit_meaning = identify_implicit_meaning_optimizer.compile(
    indentify_implicit_meaning, 
    trainset = trainset, 
    max_labeled_demos = 20, 
    max_bootstrapped_demos = 20    
)

In [None]:
optimized_identify_implicit_meaning.save(optimizers_folder + model_name + '_' + 'optimizer_stg3.json')

### Run optimized classifier

In [49]:
optimizer_file_name = optimizers_folder + model_name + '_' + 'optimizer_stg3.json'
result_file = result_folder + model_name + '_' + 'result_optimizer_stg3.csv'
df_hate = pd.read_csv(data_folder + "implicit_hate_test_stg3.csv", encoding = "utf-8")
df_hate = df_hate[['post_id', 'post']].drop_duplicates()

In [50]:
if debug_mode:
    df_hate = df_hate.sample(2)

In [52]:
optimized_identify_implicit_meaning = indentify_implicit_meaning
optimized_identify_implicit_meaning.load(optimizer_file_name)
optimized_classify_stage_3(optimized_identify_implicit_meaning, df_hate, result_file, True)