### Setup AWS Credentials

In [None]:
import os
# The AWS access key and secret key must already be set as environment variables.
# This cell only verifies that they are present. The values are deliberately never
# evaluated as a bare expression: a notebook echoes the last expression of a cell,
# which would store the secret key in the saved cell output.
_missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(name)
]
if _missing_aws_vars:
    # Fail fast with an explicit message instead of a bare KeyError.
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_aws_vars)
    )

In [4]:
# Nova model identifier; passed as `model_id` when the Evaluator scores a prompt.
# NOTE(review): the "us." prefix looks like a cross-region inference profile id —
# confirm it is enabled for the account/region used by the inference adapter.
NOVA_MODEL_ID = "us.amazon.nova-pro-v1:0"

### Dataset Adapter

Initialize the Dataset Adapter that takes the input_columns and output_columns. We use the CSVDatasetAdapter to read a `.csv` file and adapt it to the standardized format. We also use the adapter to create train and test sets for our use case.

In [19]:
from amzn_nova_prompt_optimizer.core.input_adapters.dataset_adapter import CSVDatasetAdapter

# Column names are given as sets: one for the model inputs, one for the expected outputs.
input_columns, output_columns = {"input"}, {"answer"}

# Build the adapter, then load the CSV file into it.
dataset_adapter = CSVDatasetAdapter(
    input_columns,
    output_columns,
)
dataset_adapter.adapt("../data/FacilitySupportAnalyzer.csv")

# Split with a ratio of 0.5 into the train and test sets used by later cells.
train_set, test_set = dataset_adapter.split(0.5)

### Prompt Adapter

Initialize the Prompt Adapter for the Original Prompt. For this example, we use the FacilitySupportAnalyzer System and User Prompt in the `.txt` format. 

In [20]:
from amzn_nova_prompt_optimizer.core.input_adapters.prompt_adapter import TextPromptAdapter

# The prompt templates are given the same variable names as the dataset's input columns.
prompt_variables = input_columns

prompt_adapter = TextPromptAdapter()

# Register the system and user prompt templates from their text files.
prompt_adapter.set_system_prompt(
    file_path="original_prompt/system_prompt.txt",
    variables=prompt_variables,
)
prompt_adapter.set_user_prompt(
    file_path="original_prompt/user_prompt.txt",
    variables=prompt_variables,
)

# Adapt (kept as the cell's last expression so its return value is displayed).
prompt_adapter.adapt()

<amzn_nova_prompt_optimizer.core.input_adapters.prompt_adapter.TextPromptAdapter at 0x7fa169608560>

### Metric Adapter

Initialize the Metric Adapter used to evaluate this prompt; certain optimizers also need it. For this example, we build a custom metric for the FacilitySupportAnalyzer dataset. The metric adapter exposes two functions: `apply` (evaluates a single row) and `batch_apply` (evaluates the whole dataset together).

In [21]:
from amzn_nova_prompt_optimizer.core.input_adapters.metric_adapter import MetricAdapter
from typing import List, Any, Dict
import re
import json

class FacilitySupportAnalyzerMetric(MetricAdapter):
    """
    Custom metric for the FacilitySupportAnalyzer dataset.

    A prediction and its ground truth are both expected to be JSON objects (or strings
    containing one). They are compared on three fields -- "categories", "sentiment" and
    "urgency" -- and the three field scores are averaged into "total".
    """

    def parse_json(self, input_string: str):
        """
        Parse ``input_string`` as JSON, tolerating Markdown code fences.

        The string is first parsed directly. If that fails, the content of the first
        block formatted as:
            ```json
            ... JSON content ...
            ```
        is tried, and then the content of the first block delimited by plain triple
        backticks.

        Returns:
            The decoded JSON value (not necessarily a dict).

        Raises:
            json.JSONDecodeError: the error from the direct parse, when no candidate parses.
            TypeError: propagated from ``json.loads`` when ``input_string`` is not
                str/bytes/bytearray.
        """
        try:
            return json.loads(input_string)
        except json.JSONDecodeError as err:
            # Keep a reference: the `as` name is unbound once the except block ends.
            error = err

        patterns = [
            re.compile(r"```json\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE),
            re.compile(r"```(.*?)```", re.DOTALL)
        ]

        for pattern in patterns:
            match = pattern.search(input_string)
            if match:
                json_candidate = match.group(1).strip()
                try:
                    return json.loads(json_candidate)
                except json.JSONDecodeError:
                    continue

        # Nothing parsed: surface the original, most informative error.
        raise error

    def _calculate_metrics(self, y_pred: Any, y_true: Any) -> Dict:
        """
        Score one prediction against its ground truth.

        Args:
            y_pred: predicted payload, either a dict or a string to be parsed as JSON.
            y_true: expected payload, either a dict or a string to be parsed as JSON.

        Returns:
            A dict with keys "is_valid_json" (bool), "correct_categories" (fraction of
            ground-truth category keys whose value matches), "correct_sentiment" (bool),
            "correct_urgency" (bool) and "total" (mean of the three scores). When either
            payload cannot be parsed, or does not decode to a JSON object, every score
            keeps its zero/False default.
        """
        strict_json = False
        scored_keys = ("correct_categories", "correct_sentiment", "correct_urgency")
        result = {
            "is_valid_json": False,
            "correct_categories": 0.0,
            "correct_sentiment": False,
            "correct_urgency": False,
            "total": 0.0,
        }

        parse = json.loads if strict_json else self.parse_json
        try:
            if not isinstance(y_true, dict):
                y_true = parse(y_true)
            if not isinstance(y_pred, dict):
                y_pred = parse(y_pred)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers non-string inputs such as None or NaN.
            return result  # Return result with is_valid_json = False

        # Valid JSON that is not an object (string, list, number, null) has no
        # fields to compare, so it is scored like an unparsable payload.
        if not isinstance(y_pred, dict) or not isinstance(y_true, dict):
            return result

        result["is_valid_json"] = True

        categories_true = y_true.get("categories", {})
        categories_pred = y_pred.get("categories", {})

        if isinstance(categories_true, dict) and isinstance(categories_pred, dict):
            # A category missing from the prediction is treated as False.
            correct = sum(
                categories_true.get(k, False) == categories_pred.get(k, False)
                for k in categories_true
            )
            result["correct_categories"] = correct / len(categories_true) if categories_true else 0.0
        else:
            result["correct_categories"] = 0.0  # or raise an error if you prefer

        result["correct_sentiment"] = y_pred.get("sentiment", "") == y_true.get("sentiment", "")
        result["correct_urgency"] = y_pred.get("urgency", "") == y_true.get("urgency", "")

        # Compute overall metric score
        result["total"] = sum(float(result[k]) for k in scored_keys) / len(scored_keys)

        return result

    def apply(self, y_pred: Any, y_true: Any):
        """Evaluate a single row; returns the full metrics dict of `_calculate_metrics`."""
        return self._calculate_metrics(y_pred, y_true)

    def batch_apply(self, y_preds: List[Any], y_trues: List[Any]):
        """
        Evaluate a whole dataset and average every numeric metric across rows.

        Predictions and ground truths are paired positionally with ``zip``.

        Returns:
            A dict mapping each metric key to its mean over all rows (booleans count as
            0/1), or an empty dict when there are no rows to evaluate.
        """
        evals = [self.apply(y_pred, y_true) for y_pred, y_true in zip(y_preds, y_trues)]
        if not evals:
            # Nothing to average; avoid indexing evals[0] and dividing by zero.
            return {}
        float_keys = [k for k, v in evals[0].items() if isinstance(v, (int, float, bool))]
        return {k: sum(e[k] for e in evals) / len(evals) for k in float_keys}

metric_adapter = FacilitySupportAnalyzerMetric()

### Inference Adapter
Initialize the InferenceAdapter to choose the backend Inference. Currently, we only support BedrockInferenceAdapter.

In [22]:
from amzn_nova_prompt_optimizer.core.inference.adapter import BedrockInferenceAdapter

# Inference backend shared by the Evaluator and the optimizer in later cells.
# NOTE(review): the region is hard-coded; presumably the chosen Nova model must be
# available in it — confirm for your account.
inference_adapter = BedrockInferenceAdapter(region_name="us-east-1")

### Evaluator

The Evaluator can use the metric_adapter, prompt_adapter, and dataset_adapter to evaluate the prompt given the `model_id` to produce an evaluation score. The Evaluator internally uses the `InferenceRunner` to first generate inference results and then evaluate the output.

#### Base Model Evaluation

In [23]:
from amzn_nova_prompt_optimizer.core.evaluation import Evaluator

# Evaluate the original prompt on `test_set` with the dict-returning metric adapter.
evaluator = Evaluator(prompt_adapter, test_set, metric_adapter, inference_adapter)

In [24]:
# Baseline score of the unoptimized prompt with the configured Nova model.
# NOTE(review): the run log suggests this runs inference and then `batch_apply` — confirm.
original_prompt_score = evaluator.aggregate_score(model_id=NOVA_MODEL_ID)

print(f"Original Prompt Evaluation Score = {original_prompt_score}")

2025/07/02 20:46:37 INFO amzn_nova_prompt_optimizer.core.evaluation: Cache miss - Running new inference on Dataset
Running inference: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:48<00:00,  2.05it/s]
2025/07/02 20:47:25 INFO amzn_nova_prompt_optimizer.core.evaluation: Running Batch Evaluation on Dataset, using `batch_apply` metric
2025/07/02 20:47:25 INFO amzn_nova_prompt_optimizer.core.evaluation: Using cached inference results
2025/07/02 20:47:25 INFO amzn_nova_prompt_optimizer.core.evaluation: Running Evaluation on Dataset, using `apply` metric


Original Prompt Evaluation Score = {'is_valid_json': 0.31, 'correct_categories': 0.281, 'correct_sentiment': 0.2, 'correct_urgency': 0.26, 'total': 0.247}


### Optimization Adapter

We can now define the optimization functions. An optimization function takes a Prompt Adapter as input and, optionally, a Dataset Adapter, an Inference Adapter, and a Metric Adapter. It optimizes the prompt and returns a Prompt Adapter.

In [9]:
class FacilitySupportAnalyzerNovaMetric(FacilitySupportAnalyzerMetric):
    """
    Scalar variant of FacilitySupportAnalyzerMetric.

    Both methods return a single number (the "total" score) instead of the full
    metrics dict.
    """

    def apply(self, y_pred: Any, y_true: Any):
        """Return only the "total" score for one row."""
        # Requires to return a value and not a JSON payload
        return self._calculate_metrics(y_pred, y_true)["total"]

    def batch_apply(self, y_preds: List[Any], y_trues: List[Any]):
        """
        Return the mean of the per-row "total" scores.

        Predictions and ground truths are paired positionally with ``zip``; an empty
        batch scores 0.0. Previously this method was a no-op that returned None.
        """
        scores = [self.apply(y_pred, y_true) for y_pred, y_true in zip(y_preds, y_trues)]
        return sum(scores) / len(scores) if scores else 0.0

nova_metric_adapter = FacilitySupportAnalyzerNovaMetric()

#### NovaPromptOptimizer

NovaPromptOptimizer = Nova Meta Prompter + MIPROv2 with Nova Model Tips

In [10]:
from amzn_nova_prompt_optimizer.core.optimizers import NovaPromptOptimizer

# Wire the optimizer to the original prompt, the inference backend, the training
# split, and the scalar-returning metric adapter.
nova_prompt_optimizer = NovaPromptOptimizer(
    prompt_adapter=prompt_adapter,
    inference_adapter=inference_adapter,
    dataset_adapter=train_set,
    metric_adapter=nova_metric_adapter,
)

# Run the optimization in "pro" mode and keep the resulting prompt adapter.
optimized_prompt_adapter = nova_prompt_optimizer.optimize(mode="pro")

2025/07/02 20:18:53 INFO amzn_nova_prompt_optimizer.core.optimizers.nova_meta_prompter.nova_mp_optimizer: Optimizing prompt using Nova Meta Prompter
2025/07/02 20:18:59 INFO amzn_nova_prompt_optimizer.core.optimizers.miprov2.miprov2_optimizer: Using us.amazon.nova-pro-v1:0 for Evaluation
2025/07/02 20:18:59 INFO amzn_nova_prompt_optimizer.core.optimizers.miprov2.miprov2_optimizer: Using us.amazon.nova-premier-v1:0 for Prompting
2025/07/02 20:18:59 INFO amzn_nova_prompt_optimizer.core.optimizers.miprov2.custom_adapters.custom_chat_adapter: Initializing CustomChatAdapter with enable_json_fallback=False
2025/07/02 20:18:59 INFO amzn_nova_prompt_optimizer.core.optimizers.miprov2.miprov2_optimizer: Using Nova tips for MIPROv2 optimization
2025/07/02 20:18:59 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/07/02 20:18:59 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instruc

Bootstrapping set 1/20
Bootstrapping set 2/20
Bootstrapping set 3/20


  8%|███████▋                                                                                        | 4/50 [00:06<01:10,  1.54s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/20


  8%|███████▋                                                                                        | 4/50 [00:05<00:58,  1.28s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/20


  6%|█████▊                                                                                          | 3/50 [00:04<01:05,  1.40s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 6/20


  4%|███▊                                                                                            | 2/50 [00:03<01:18,  1.64s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 7/20


  2%|█▉                                                                                              | 1/50 [00:01<01:23,  1.71s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 8/20


  6%|█████▊                                                                                          | 3/50 [00:04<01:18,  1.66s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 9/20


  8%|███████▋                                                                                        | 4/50 [00:05<01:01,  1.34s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 10/20


  6%|█████▊                                                                                          | 3/50 [00:04<01:05,  1.40s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 11/20


  2%|█▉                                                                                              | 1/50 [00:01<01:00,  1.23s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 12/20


  6%|█████▊                                                                                          | 3/50 [00:04<01:04,  1.38s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 13/20


  6%|█████▊                                                                                          | 3/50 [00:04<01:03,  1.35s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 14/20


  4%|███▊                                                                                            | 2/50 [00:02<00:56,  1.17s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 15/20


  4%|███▊                                                                                            | 2/50 [00:02<01:06,  1.39s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 16/20


  8%|███████▋                                                                                        | 4/50 [00:05<01:03,  1.39s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 17/20


  8%|███████▋                                                                                        | 4/50 [00:05<01:04,  1.39s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 18/20


  6%|█████▊                                                                                          | 3/50 [00:04<01:11,  1.53s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 19/20


  2%|█▉                                                                                              | 1/50 [00:01<01:22,  1.69s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 20/20


  2%|█▉                                                                                              | 1/50 [00:01<01:07,  1.38s/it]
2025/07/02 20:20:07 INFO amzn_nova_prompt_optimizer.core.optimizers.miprov2.miprov2_optimizer: Entering patched_propose_instructions, patching GroundedProposer with NovaGroundedProposer
2025/07/02 20:20:07 INFO amzn_nova_prompt_optimizer.core.optimizers.miprov2.miprov2_optimizer: Patched GroundedProposer, current GroundedProposer class=<class 'amzn_nova_prompt_optimizer.core.optimizers.nova_prompt_optimizer.nova_grounded_proposer.NovaGroundedProposer'>
2025/07/02 20:20:07 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/07/02 20:20:07 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/07/02 20:20:07 INFO amzn_nova_prompt_optimizer.core.op

Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/07/02 20:20:27 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=20 instructions...



[Nova] Selected tip: persona
[Nova] Selected tip: persona
[Nova] Selected tip: multi_turn
[Nova] Selected tip: multi_turn
[Nova] Selected tip: multi_turn
[Nova] Selected tip: multi_turn
[Nova] Selected tip: simple
[Nova] Selected tip: high_stakes
[Nova] Selected tip: high_stakes
[Nova] Selected tip: format_control
[Nova] Selected tip: creative
[Nova] Selected tip: rules_based
[Nova] Selected tip: none
[Nova] Selected tip: simple
[Nova] Selected tip: multi_turn
[Nova] Selected tip: simple
[Nova] Selected tip: high_stakes
[Nova] Selected tip: structured_prompt
[Nova] Selected tip: simple
[Nova] Selected tip: multi_turn


2025/07/02 20:22:48 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/07/02 20:22:48 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Task: Extract and return a json with specified keys and values based on the input.

Context:
- The json must include "urgency", "sentiment", and "categories".
- "urgency" can be `high`, `medium`, or `low`.
- "sentiment" can be `negative`, `neutral`, or `positive`.
- "categories" is a dictionary with boolean values indicating if the category matches the input.

Instructions:
- The json MUST be valid and readable directly.
- DO NOT enclose the json in ```json...```.
- DO NOT include newlines or unnecessary whitespaces.
- Only include the keys mentioned: "urgency", "sentiment", and "categories".

Any other section from Original Prompt:
- The categories to consider are: `emergency_repair_services`, `routine_maintenance_requests`, `quality_and_safety_concerns`, `specialized_cleaning_services`, `general_inquiries`, `sustainabilit

2025/07/02 20:22:48 INFO dspy.teleprompt.mipro_optimizer_v2: 14: Given the input email, analyze its content to determine the urgency, sentiment, and relevant categories. Return a JSON object with "urgency" (high, medium, low), "sentiment" (negative, neutral, positive), and "categories" (boolean values for predefined types). Ensure the output is a valid, single-line JSON string without unnecessary whitespace. If the email's content is ambiguous, prioritize categories that best match explicit mentions of issues or requests.

2025/07/02 20:22:48 INFO dspy.teleprompt.mipro_optimizer_v2: 15: Analyze the input email to classify the request. Return a JSON object with "urgency" (high/medium/low), "sentiment" (positive/neutral/negative), and "categories" (boolean values for predefined types). Ensure no extra keys or formatting.

2025/07/02 20:22:48 INFO dspy.teleprompt.mipro_optimizer_v2: 16: Given the critical nature of customer service in maintaining client satisfaction and operational effici

Average Metric: 35.20 / 50 (70.4%): 100%|███████████████████████████████████████████████████████████| 50/50 [00:28<00:00,  1.75it/s]

2025/07/02 20:23:17 INFO dspy.evaluate.evaluate: Average Metric: 35.199999999999996 / 50 (70.4%)
2025/07/02 20:23:17 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 70.4

2025/07/02 20:23:17 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 37 - Minibatch ==



Average Metric: 25.33 / 35 (72.4%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.35it/s]

2025/07/02 20:23:43 INFO dspy.evaluate.evaluate: Average Metric: 25.333333333333332 / 35 (72.4%)
2025/07/02 20:23:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.38 on minibatch of size 35 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 6'].
2025/07/02 20:23:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38]
2025/07/02 20:23:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4]
2025/07/02 20:23:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.4


2025/07/02 20:23:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 37 - Minibatch ==



Average Metric: 24.03 / 35 (68.7%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:23<00:00,  1.48it/s]

2025/07/02 20:24:07 INFO dspy.evaluate.evaluate: Average Metric: 24.03333333333333 / 35 (68.7%)
2025/07/02 20:24:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.67 on minibatch of size 35 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 4'].
2025/07/02 20:24:07 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67]
2025/07/02 20:24:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4]
2025/07/02 20:24:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.4


2025/07/02 20:24:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 37 - Minibatch ==



Average Metric: 25.13 / 35 (71.8%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:27<00:00,  1.27it/s]

2025/07/02 20:24:34 INFO dspy.evaluate.evaluate: Average Metric: 25.133333333333333 / 35 (71.8%)
2025/07/02 20:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.81 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 13'].
2025/07/02 20:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81]
2025/07/02 20:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4]
2025/07/02 20:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.4


2025/07/02 20:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 37 - Minibatch ==



Average Metric: 23.70 / 35 (67.7%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:26<00:00,  1.35it/s]

2025/07/02 20:25:00 INFO dspy.evaluate.evaluate: Average Metric: 23.7 / 35 (67.7%)
2025/07/02 20:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 7'].
2025/07/02 20:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71]
2025/07/02 20:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4]
2025/07/02 20:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.4


2025/07/02 20:25:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 37 - Minibatch ==



Average Metric: 22.80 / 35 (65.1%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:24<00:00,  1.40it/s]

2025/07/02 20:25:25 INFO dspy.evaluate.evaluate: Average Metric: 22.8 / 35 (65.1%)
2025/07/02 20:25:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9'].
2025/07/02 20:25:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14]
2025/07/02 20:25:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4]
2025/07/02 20:25:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.4


2025/07/02 20:25:25 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 37 - Full Evaluation =====
2025/07/02 20:25:25 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 72.38) from minibatch trials...



Average Metric: 36.60 / 50 (73.2%): 100%|███████████████████████████████████████████████████████████| 50/50 [00:11<00:00,  4.36it/s]

2025/07/02 20:25:37 INFO dspy.evaluate.evaluate: Average Metric: 36.6 / 50 (73.2%)
2025/07/02 20:25:37 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 73.2
2025/07/02 20:25:37 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2]
2025/07/02 20:25:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2
2025/07/02 20:25:37 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/02 20:25:37 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 37 - Minibatch ==



Average Metric: 24.67 / 35 (70.5%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:24<00:00,  1.44it/s]

2025/07/02 20:26:01 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666664 / 35 (70.5%)
2025/07/02 20:26:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 15'].
2025/07/02 20:26:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48]
2025/07/02 20:26:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2]
2025/07/02 20:26:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:26:01 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 37 - Minibatch ==



Average Metric: 25.37 / 35 (72.5%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:28<00:00,  1.23it/s]

2025/07/02 20:26:29 INFO dspy.evaluate.evaluate: Average Metric: 25.366666666666667 / 35 (72.5%)
2025/07/02 20:26:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 17'].
2025/07/02 20:26:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48]
2025/07/02 20:26:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2]
2025/07/02 20:26:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:26:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 37 - Minibatch ==



Average Metric: 23.23 / 35 (66.4%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:24<00:00,  1.46it/s]

2025/07/02 20:26:53 INFO dspy.evaluate.evaluate: Average Metric: 23.233333333333334 / 35 (66.4%)
2025/07/02 20:26:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.38 on minibatch of size 35 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 9'].
2025/07/02 20:26:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38]
2025/07/02 20:26:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2]
2025/07/02 20:26:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:26:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 37 - Minibatch ==



Average Metric: 25.13 / 35 (71.8%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:23<00:00,  1.46it/s]

2025/07/02 20:27:17 INFO dspy.evaluate.evaluate: Average Metric: 25.133333333333333 / 35 (71.8%)
2025/07/02 20:27:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.81 on minibatch of size 35 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 19'].
2025/07/02 20:27:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81]
2025/07/02 20:27:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2]
2025/07/02 20:27:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:27:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 37 - Minibatch ==



Average Metric: 25.17 / 35 (71.9%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:26<00:00,  1.34it/s]

2025/07/02 20:27:44 INFO dspy.evaluate.evaluate: Average Metric: 25.166666666666668 / 35 (71.9%)
2025/07/02 20:27:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.9 on minibatch of size 35 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 10'].
2025/07/02 20:27:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9]
2025/07/02 20:27:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2]
2025/07/02 20:27:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:27:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 37 - Full Evaluation =====
2025/07/02 20:27:44 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 72.48) from minibatch trials...



Average Metric: 35.53 / 50 (71.1%): 100%|███████████████████████████████████████████████████████████| 50/50 [00:10<00:00,  4.75it/s]

2025/07/02 20:27:54 INFO dspy.evaluate.evaluate: Average Metric: 35.53333333333333 / 50 (71.1%)
2025/07/02 20:27:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07]
2025/07/02 20:27:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2
2025/07/02 20:27:54 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/02 20:27:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 37 - Minibatch ==



Average Metric: 24.43 / 35 (69.8%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:27<00:00,  1.29it/s]

2025/07/02 20:28:21 INFO dspy.evaluate.evaluate: Average Metric: 24.433333333333334 / 35 (69.8%)
2025/07/02 20:28:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.81 on minibatch of size 35 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 12'].
2025/07/02 20:28:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81]
2025/07/02 20:28:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07]
2025/07/02 20:28:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:28:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 37 - Minibatch ==



Average Metric: 24.37 / 35 (69.6%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:26<00:00,  1.31it/s]

2025/07/02 20:28:48 INFO dspy.evaluate.evaluate: Average Metric: 24.366666666666667 / 35 (69.6%)
2025/07/02 20:28:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.62 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 17'].
2025/07/02 20:28:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62]
2025/07/02 20:28:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07]
2025/07/02 20:28:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:28:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 37 - Minibatch ==



Average Metric: 22.70 / 35 (64.9%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.36it/s]

2025/07/02 20:29:14 INFO dspy.evaluate.evaluate: Average Metric: 22.7 / 35 (64.9%)
2025/07/02 20:29:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 12'].
2025/07/02 20:29:14 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86]
2025/07/02 20:29:14 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07]
2025/07/02 20:29:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:29:14 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 37 - Minibatch ==



Average Metric: 22.97 / 35 (65.6%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.38it/s]

2025/07/02 20:29:39 INFO dspy.evaluate.evaluate: Average Metric: 22.966666666666665 / 35 (65.6%)
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.62 on minibatch of size 35 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 16'].
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62]
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07]
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 37 - Minibatch ==



Average Metric: 26.17 / 35 (74.8%): 100%|█████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 2602.71it/s]

2025/07/02 20:29:39 INFO dspy.evaluate.evaluate: Average Metric: 26.166666666666668 / 35 (74.8%)
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.76 on minibatch of size 35 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 17'].
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76]
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07]
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.2


2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 37 - Full Evaluation =====
2025/07/02 20:29:39 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 71.9) from minibatch trials...



Average Metric: 36.83 / 50 (73.7%): 100%|███████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.72it/s]

2025/07/02 20:29:53 INFO dspy.evaluate.evaluate: Average Metric: 36.833333333333336 / 50 (73.7%)
2025/07/02 20:29:53 INFO dspy.teleprompt.mipro_optimizer_v2: New best full eval score! Score: 73.67
2025/07/02 20:29:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67]
2025/07/02 20:29:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.67
2025/07/02 20:29:53 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/02 20:29:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 37 - Minibatch ==



Average Metric: 24.67 / 35 (70.5%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:26<00:00,  1.34it/s]

2025/07/02 20:30:19 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666664 / 35 (70.5%)
2025/07/02 20:30:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 17', 'Predictor 0: Few-Shot Set 1'].
2025/07/02 20:30:19 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48]
2025/07/02 20:30:19 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67]
2025/07/02 20:30:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.67


2025/07/02 20:30:19 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 37 - Minibatch ==



Average Metric: 24.90 / 35 (71.1%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:27<00:00,  1.25it/s]

2025/07/02 20:30:47 INFO dspy.evaluate.evaluate: Average Metric: 24.9 / 35 (71.1%)
2025/07/02 20:30:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 19', 'Predictor 0: Few-Shot Set 3'].
2025/07/02 20:30:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14]
2025/07/02 20:30:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67]
2025/07/02 20:30:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.67


2025/07/02 20:30:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 37 - Minibatch ==



Average Metric: 26.87 / 35 (76.8%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:50<00:00,  1.46s/it]

2025/07/02 20:31:38 INFO dspy.evaluate.evaluate: Average Metric: 26.866666666666667 / 35 (76.8%)
2025/07/02 20:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.76 on minibatch of size 35 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 5'].
2025/07/02 20:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76]
2025/07/02 20:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67]
2025/07/02 20:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.67


2025/07/02 20:31:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 23 / 37 - Minibatch ==



Average Metric: 25.07 / 35 (71.6%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:08<00:00,  3.90it/s]

2025/07/02 20:31:47 INFO dspy.evaluate.evaluate: Average Metric: 25.066666666666666 / 35 (71.6%)
2025/07/02 20:31:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.62 on minibatch of size 35 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 5'].
2025/07/02 20:31:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62]
2025/07/02 20:31:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67]
2025/07/02 20:31:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.67


2025/07/02 20:31:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 24 / 37 - Minibatch ==



Average Metric: 25.60 / 35 (73.1%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:23<00:00,  1.48it/s]

2025/07/02 20:32:11 INFO dspy.evaluate.evaluate: Average Metric: 25.599999999999998 / 35 (73.1%)
2025/07/02 20:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 73.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/07/02 20:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14]
2025/07/02 20:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67]
2025/07/02 20:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 73.67


2025/07/02 20:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 37 - Full Evaluation =====
2025/07/02 20:32:11 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 74.19) from minibatch trials...



Average Metric: 37.00 / 50 (74.0%): 100%|███████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 28.03it/s]

2025/07/02 20:32:12 INFO dspy.evaluate.evaluate: Average Metric: 37.0 / 50 (74.0%)
2025/07/02 20:32:12 INFO dspy.teleprompt.mipro_optimizer_v2: New best full eval score! Score: 74.0
2025/07/02 20:32:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0]
2025/07/02 20:32:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0
2025/07/02 20:32:12 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/02 20:32:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 26 / 37 - Minibatch ==



Average Metric: 24.57 / 35 (70.2%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.37it/s]

2025/07/02 20:32:38 INFO dspy.evaluate.evaluate: Average Metric: 24.566666666666666 / 35 (70.2%)
2025/07/02 20:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.19 on minibatch of size 35 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 9'].
2025/07/02 20:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19]
2025/07/02 20:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0]
2025/07/02 20:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:32:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 27 / 37 - Minibatch ==



Average Metric: 26.30 / 35 (75.1%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.37it/s]

2025/07/02 20:33:03 INFO dspy.evaluate.evaluate: Average Metric: 26.3 / 35 (75.1%)
2025/07/02 20:33:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 11'].
2025/07/02 20:33:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14]
2025/07/02 20:33:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0]
2025/07/02 20:33:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:33:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 28 / 37 - Minibatch ==



Average Metric: 24.37 / 35 (69.6%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.36it/s]

2025/07/02 20:33:29 INFO dspy.evaluate.evaluate: Average Metric: 24.366666666666667 / 35 (69.6%)
2025/07/02 20:33:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.62 on minibatch of size 35 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 14'].
2025/07/02 20:33:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62]
2025/07/02 20:33:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0]
2025/07/02 20:33:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:33:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 29 / 37 - Minibatch ==



Average Metric: 26.27 / 35 (75.0%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.36it/s]

2025/07/02 20:33:55 INFO dspy.evaluate.evaluate: Average Metric: 26.266666666666666 / 35 (75.0%)
2025/07/02 20:33:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.05 on minibatch of size 35 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 11'].
2025/07/02 20:33:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62, 75.05]
2025/07/02 20:33:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0]
2025/07/02 20:33:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:33:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 30 / 37 - Minibatch ==



Average Metric: 24.33 / 35 (69.5%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:27<00:00,  1.25it/s]

2025/07/02 20:34:23 INFO dspy.evaluate.evaluate: Average Metric: 24.333333333333332 / 35 (69.5%)
2025/07/02 20:34:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.52 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 11'].
2025/07/02 20:34:23 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62, 75.05, 69.52]
2025/07/02 20:34:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0]
2025/07/02 20:34:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:34:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 31 / 37 - Full Evaluation =====
2025/07/02 20:34:23 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 75.14) from minibatch trials...


Average Metric: 36.20 / 50 (72.4%): 100%|███████████████████████████████████████████████████████████| 50/50 [00:11<00:00,  4.44it/s]

2025/07/02 20:34:34 INFO dspy.evaluate.evaluate: Average Metric: 36.199999999999996 / 50 (72.4%)
2025/07/02 20:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0, 72.4]
2025/07/02 20:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0
2025/07/02 20:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/02 20:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 32 / 37 - Minibatch ==



Average Metric: 22.37 / 35 (63.9%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:27<00:00,  1.25it/s]

2025/07/02 20:35:02 INFO dspy.evaluate.evaluate: Average Metric: 22.366666666666667 / 35 (63.9%)
2025/07/02 20:35:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.9 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 0'].
2025/07/02 20:35:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62, 75.05, 69.52, 63.9]
2025/07/02 20:35:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0, 72.4]
2025/07/02 20:35:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:35:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 33 / 37 - Minibatch ==



Average Metric: 24.47 / 35 (69.9%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.35it/s]

2025/07/02 20:35:28 INFO dspy.evaluate.evaluate: Average Metric: 24.466666666666665 / 35 (69.9%)
2025/07/02 20:35:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.9 on minibatch of size 35 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 7'].
2025/07/02 20:35:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62, 75.05, 69.52, 63.9, 69.9]
2025/07/02 20:35:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0, 72.4]
2025/07/02 20:35:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:35:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 34 / 37 - Minibatch ==



Average Metric: 25.93 / 35 (74.1%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:30<00:00,  1.13it/s]

2025/07/02 20:35:59 INFO dspy.evaluate.evaluate: Average Metric: 25.933333333333334 / 35 (74.1%)
2025/07/02 20:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.1 on minibatch of size 35 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 11'].
2025/07/02 20:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62, 75.05, 69.52, 63.9, 69.9, 74.1]
2025/07/02 20:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0, 72.4]
2025/07/02 20:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:35:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 35 / 37 - Minibatch ==



Average Metric: 25.80 / 35 (73.7%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:26<00:00,  1.32it/s]

2025/07/02 20:36:26 INFO dspy.evaluate.evaluate: Average Metric: 25.8 / 35 (73.7%)
2025/07/02 20:36:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 73.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 11'].
2025/07/02 20:36:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62, 75.05, 69.52, 63.9, 69.9, 74.1, 73.71]
2025/07/02 20:36:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0, 72.4]
2025/07/02 20:36:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:36:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 36 / 37 - Minibatch ==



Average Metric: 22.57 / 35 (64.5%): 100%|███████████████████████████████████████████████████████████| 35/35 [00:27<00:00,  1.27it/s]

2025/07/02 20:36:53 INFO dspy.evaluate.evaluate: Average Metric: 22.566666666666666 / 35 (64.5%)
2025/07/02 20:36:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 8'].
2025/07/02 20:36:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.38, 68.67, 71.81, 67.71, 65.14, 70.48, 72.48, 66.38, 71.81, 71.9, 69.81, 69.62, 64.86, 65.62, 74.76, 70.48, 71.14, 76.76, 71.62, 73.14, 70.19, 75.14, 69.62, 75.05, 69.52, 63.9, 69.9, 74.1, 73.71, 64.48]
2025/07/02 20:36:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0, 72.4]
2025/07/02 20:36:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0


2025/07/02 20:36:53 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 37 / 37 - Full Evaluation =====
2025/07/02 20:36:53 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Av


Average Metric: 36.47 / 50 (72.9%): 100%|███████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.84it/s]

2025/07/02 20:37:06 INFO dspy.evaluate.evaluate: Average Metric: 36.46666666666667 / 50 (72.9%)
2025/07/02 20:37:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.4, 73.2, 71.07, 73.67, 74.0, 72.4, 72.93]
2025/07/02 20:37:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.0
2025/07/02 20:37:06 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/02 20:37:06 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 74.0!





In [11]:
# Log the optimized prompt in the adapter's standardized form; the output below
# shows the user and system prompt templates together with their variables.
optimized_prompt_adapter.show()

2025/07/02 20:37:07 INFO amzn_nova_prompt_optimizer.core.input_adapters.prompt_adapter: 
Standardized Prompt:


{
  "user_prompt": {
    "variables": [
      "input"
    ],
    "template": "Extract and return a json with the following keys and values from the input: [{{input}}]\n- \"urgency\" as one of `high`, `medium`, `low`\n- \"sentiment\" as one of `negative`, `neutral`, `positive`\n- \"categories\" as a dictionary with boolean values for: `emergency_repair_services`, `routine_maintenance_requests`, `quality_and_safety_concerns`, `specialized_cleaning_services`, `general_inquiries`, `sustainability_and_environmental_practices`, `training_and_support_requests`, `cleaning_services_scheduling`, `customer_feedback_and_complaints`, `facility_management_issues`\n\nYour complete message should be a valid json string that can be read directly and only contain the keys mentioned in the list above. Never enclose it in ```json...```, no newlines, no unnecessary whitespaces.",
    "metadata": {
      "format": "text"
    }
  },
  "system_prompt": {
    "variables": [],
    "template": "Analyze the custo

### Optimized System Prompt

In [12]:
# print() rather than a bare last-line expression so the prompt text is shown
# as-is instead of as a quoted, escaped repr
# (assumes system_prompt is a plain str — TODO confirm).
print(optimized_prompt_adapter.system_prompt)

Analyze the customer service email to classify its content accurately. Return a JSON object with "urgency" (high/medium/low), "sentiment" (negative/neutral/positive), and "categories" (boolean flags for predefined types). Ensure compliance with data privacy standards by omitting personal identifiers. Follow the format strictly: `{"urgency": "...", "sentiment": "...", "categories": {"...": true/false, ...}}`.


### Optimized User Prompt

In [13]:
# print() so the multi-line template is rendered with real line breaks.
# As the output below shows, the {{input}} placeholder is still unfilled here.
print(optimized_prompt_adapter.user_prompt)

Extract and return a json with the following keys and values from the input: [{{input}}]
- "urgency" as one of `high`, `medium`, `low`
- "sentiment" as one of `negative`, `neutral`, `positive`
- "categories" as a dictionary with boolean values for: `emergency_repair_services`, `routine_maintenance_requests`, `quality_and_safety_concerns`, `specialized_cleaning_services`, `general_inquiries`, `sustainability_and_environmental_practices`, `training_and_support_requests`, `cleaning_services_scheduling`, `customer_feedback_and_complaints`, `facility_management_issues`

Your complete message should be a valid json string that can be read directly and only contain the keys mentioned in the list above. Never enclose it in ```json...```, no newlines, no unnecessary whitespaces.


### Few-Shot Examples

In [14]:
# Report how many few-shot examples are attached to the optimized prompt.
few_shot_count = len(optimized_prompt_adapter.few_shot_examples)
print(f"Number of Few-Shot Examples = {few_shot_count}")

Number of Few-Shot Examples = 4


In [15]:
# Show a single example (index 0) rather than the whole list to keep the output short.
first_few_shot_example = optimized_prompt_adapter.few_shot_examples[0]
print(first_few_shot_example)

{'input': 'Extract and return a json with the following keys and values from the input: [Subject: Urgent Request for Training and Support\n\nHi ProCare Support Team,\n\nI hope this message finds you well. My name is [Sender], and I manage a residential complex that has been utilizing ProCare Facility Solutions for our maintenance and cleaning needs for the past year. Your services have been instrumental in keeping our environment safe and well-maintained.\n\nHowever, I am reaching out with an urgent request. We are in immediate need of comprehensive training for our in-house maintenance team. Given the complexity of our facility\'s systems, particularly the HVAC and electrical components, it\'s crucial that our team is well-versed in best practices and troubleshooting techniques. \n\nWe have encountered several issues recently that have highlighted gaps in our current knowledge and capabilities. While we have managed to address these problems temporarily, a more sustainable solution is

### Evaluator

Now we evaluate the prompt produced by the Nova Prompt Optimizer on the test set.

In [16]:
from amzn_nova_prompt_optimizer.core.evaluation import Evaluator

# Evaluate the optimized prompt on the test split returned by dataset_adapter.split().
# NOTE(review): metric_adapter and inference_adapter are not defined in this cell;
# they must already exist in the kernel from earlier cells, so run those first.
evaluator = Evaluator(optimized_prompt_adapter, test_set, metric_adapter, inference_adapter)

In [17]:
# Score the optimized prompt with the model named by NOVA_MODEL_ID.
# Per the log output below, this runs inference over the dataset on a cache miss
# and then applies the metric; the printed result holds one score per checked
# field plus a 'total'.
nova_prompt_optimizer_evaluation_score = evaluator.aggregate_score(model_id=NOVA_MODEL_ID)
print(f"Nova Prompt Optimizer = {nova_prompt_optimizer_evaluation_score}")

2025/07/02 20:40:54 INFO amzn_nova_prompt_optimizer.core.evaluation: Cache miss - Running new inference on Dataset
Running inference: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:45<00:00,  2.18it/s]
2025/07/02 20:41:40 INFO amzn_nova_prompt_optimizer.core.evaluation: Running Batch Evaluation on Dataset, using `batch_apply` metric
2025/07/02 20:41:40 INFO amzn_nova_prompt_optimizer.core.evaluation: Using cached inference results
2025/07/02 20:41:40 INFO amzn_nova_prompt_optimizer.core.evaluation: Running Evaluation on Dataset, using `apply` metric


Nova Prompt Optimizer = {'is_valid_json': 1.0, 'correct_categories': 0.95, 'correct_sentiment': 0.65, 'correct_urgency': 0.77, 'total': 0.79}


In [18]:
# Persist the optimized prompt under optimized_prompt/, a path relative to the
# notebook's working directory.
# NOTE(review): the files written are decided by save() and are not visible here.
optimized_prompt_adapter.save("optimized_prompt/")