In [1]:
# Hugging Face account name; prefixed to the dataset repo name when the data is loaded below.
HF_USR_NAME = 'shirwu'
# Root of the ToolQA checkout; the Chroma store and corpus paths are joined onto it further down.
# NOTE(review): absolute, machine-specific path — must be edited before running elsewhere.
TOOL_QA_ROOT = '/dfs/project/kgrlm/shirwu/msr_intern/home/t-yingxinwu/msr_intern/ToolQA-rebuttal'

### Upload to Huggingface

In [3]:
import pandas as pd
import os
from datasets import Dataset
from datasets import DatasetDict

# Which ToolQA subset and difficulty to package.
level = 'hard'
dataset = 'scirex'

# Names derived from the two settings above: the Hub repo name and the local JSONL file.
hf_dataset_name = f'toolqa_{dataset}_{level}'
dataset_dir = f'{dataset}-{level}.jsonl'

df = pd.read_json(dataset_dir, lines=True)

# Force every answer to a string before building the Dataset.
df['answer'] = df['answer'].map(str)
# From here on `dataset` is the Dataset object, no longer the subset name string.
dataset = Dataset.from_pandas(df)

os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Disable tokenizer parallelism warning


In [4]:
# Wrap the single Dataset so it is exposed as the 'train' split.
dataset_dict = DatasetDict({'train': dataset})
# Pushing to the Hugging Face Hub makes the data easy to load from dspy.
# Left commented out — presumably the repo has already been uploaded; TODO confirm.
# dataset_dict.push_to_hub(repo_id=hf_dataset_name, private=True)

## Setting Up

* ToolQA

Before loading our datasets and going to the execution part, we'll need to configure the `lm` in `dspy.settings`. For the purpose of this notebook we'll be using `gpt-4o`.

In [5]:
import os
import dspy
import warnings
# Hide UserWarning / FutureWarning noise for the rest of the session.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning) 


# Register the default LM for dspy: gpt-4o, temperature 0, up to 4000 completion tokens.
# The key is read from the OPENAI_API_KEY environment variable (None if it is unset).
dspy.settings.configure(
    lm=dspy.OpenAI(
        model="gpt-4o",
        api_key=os.getenv("OPENAI_API_KEY"),
        max_tokens=4000,
        temperature=0
    )
)

## Defining Signature

In [6]:
# dspy signature for the QA task: one `question` input, one short `answer` output.
# NOTE(review): the docstring and the prefix/desc strings below are presumably rendered
# into the LM prompt by dspy, so they are runtime text — do not reword them casually.
class ToolQASignature(dspy.Signature):
    """You will be given a question. Your task is to answer the question with a short response. 
    """
    
    # Input field; `format` strips surrounding whitespace from the question text.
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
        format=lambda x: x.strip(),
    )
    # Output field the model is asked to fill in.
    answer: str = dspy.OutputField(
        prefix="Answer:",
        desc="answer to the question",
    )


## Loading Datasets

In [7]:
from random import sample
from dspy.datasets import DataLoader

# dspy DataLoader instance used below to pull the dataset from the Hugging Face Hub.
dl = DataLoader()

In [8]:
# Load the 'train' split of '<HF_USR_NAME>/<hf_dataset_name>' from the Hub.
# NOTE(review): "answer" is listed as an input key here; the split cell below rebuilds
# each Example with its own input keys, so this setting is presumably not relied on — confirm.
tool_qa = dl.from_huggingface(
    f'{HF_USR_NAME}/' + hf_dataset_name,
    split="train",
    input_keys=("question", "answer"),
)

In [9]:
# Number of loaded examples; the split sizes chosen in the next cell depend on it.
len(tool_qa)

100

In [9]:
import random

# Fixed seed so the train/test partition is reproducible across runs.
random.seed(42)

# Number of examples sampled for training; everything else becomes the test set.
TRAIN_SIZE = 40

all_idx = range(len(tool_qa))
train_idx = random.sample(all_idx, TRAIN_SIZE)
# sorted() pins the order of the pool handed to random.sample, so the shuffled test
# order depends only on the seed and not on set iteration order.
remaining_idx = sorted(set(all_idx) - set(train_idx))
# Take every remaining example (60 for the 100-example dataset) instead of a
# hard-coded count, which would raise ValueError if the dataset size changed.
test_idx = random.sample(remaining_idx, len(remaining_idx))


def _to_example(record):
    """Build a dspy.Example holding only `question` and `answer` from a loaded record.

    NOTE(review): "paper_id" is declared as an input key although no such field is set
    on the Example; kept as in the original — confirm whether it is still needed.
    """
    return dspy.Example(
        question=record.question, answer=record.answer
    ).with_inputs("question", "paper_id")


toolqa_train = [_to_example(tool_qa[i]) for i in train_idx]
toolqa_test = [_to_example(tool_qa[i]) for i in test_idx]

## Setting Up Tools

We'll set up `Avatar` modules for both signatures, and all the `tools` can be used by each of the datasets. `Tool` is a pydantic model that Avatar expects the `tools` to be composed of; more specifically, it has 4 fields:

* `name` : Name of the tool
* `input_type` : Type of input the tool accepts
* `output_type` : Type of output the tool returns
* `tool` : The actual tool object

In [10]:
import os
import time
import uuid
import numpy as np
import jsonlines
from concurrent.futures import ProcessPoolExecutor
import sentence_transformers
import chromadb
from os import path as osp
from chromadb.config import Settings

# Sentence-transformers model used to embed both the corpus and the queries.
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
# On-disk location of the persistent Chroma store and the collection name inside it.
CHROMA_PERSIST_DIRECTORY = osp.join(TOOL_QA_ROOT, "data/chroma_db/scirex-v2")
CHROMA_COLLECTION_NAME = "all"
# Host/port for the REST-based client built by create_chroma_db; no visible code here
# passes these constants to it. Note the port is a string.
CHROMA_SERVER_HOST = "localhost"
CHROMA_SERVER_HTTP_PORT = "8000"
# JSONL corpus file; the "content" field of each record is what gets embedded.
FILE_PATH = osp.join(TOOL_QA_ROOT, "data/external_corpus/scirex/Preprocessed_Scirex.jsonl")

def sentence_embedding(model, texts):
    """Encode ``texts`` with ``model`` and return whatever ``model.encode`` produces."""
    return model.encode(texts)

def create_chroma_db(chroma_server_host, chroma_server_http_port, collection_name):
    """Open (or create) ``collection_name`` through a REST-configured Chroma client.

    Args:
        chroma_server_host: Host name of the Chroma server.
        chroma_server_http_port: HTTP port of the Chroma server.
        collection_name: Collection to fetch, created if it does not exist yet.

    Returns:
        The collection object returned by ``get_or_create_collection``.
    """
    rest_settings = Settings(
        chroma_api_impl="rest",
        chroma_server_host=chroma_server_host,
        chroma_server_http_port=chroma_server_http_port,
    )
    return chromadb.Client(rest_settings).get_or_create_collection(name=collection_name)

def create_chroma_db_local(persist_directory, collection_name):
    """Open (or create) ``collection_name`` in the on-disk Chroma store at ``persist_directory``."""
    local_client = chromadb.PersistentClient(path=persist_directory)
    return local_client.get_or_create_collection(name=collection_name)

def insert_to_db(texts, model_name, cuda_idx, db):
    """Embed every text on CPU and add it to ``db`` in batches of 100.

    Args:
        texts: Sequence of document strings to index.
        model_name: SentenceTransformer model name used for the embeddings.
        cuda_idx: Worker identifier; only used in the progress messages, because the
            model is loaded on CPU (the GPU variant is commented out below).
        db: Collection-like object exposing ``add(embeddings=, documents=, ids=)``.
    """
    # use cpu
    model = sentence_transformers.SentenceTransformer(model_name, device='cpu')
    # model = sentence_transformers.SentenceTransformer(model_name, device=f"cuda:{cuda_idx}")

    batch_size = 100
    total = len(texts)
    batch_embeddings = []
    batch_texts = []
    start_time = time.time()
    print(f"Total Articles to process: {len(texts)}, Current Thread: {cuda_idx}.")
    # Count from 1 so batches are full (100 documents) and the reported count is the
    # number of articles processed so far; the old 0-based check flushed a
    # one-document batch on the very first iteration.
    for i, text in enumerate(texts, start=1):
        # generate embedding
        batch_embeddings.append(sentence_embedding(model, text).tolist())
        batch_texts.append(text)
        # add to vectorstore every `batch_size` articles or at the last article
        if i % batch_size == 0 or i == total:
            batch_ids = [str(uuid.uuid1()) for _ in batch_texts]
            db.add(
                embeddings=batch_embeddings,
                documents=batch_texts,
                ids=batch_ids
            )
            batch_embeddings = []
            batch_texts = []
            print(f"Completed Processing article count: {i}, Current Thread: {cuda_idx}, Time took: {time.time() - start_time}.")
    print(f"Thread {cuda_idx} Completed. Total time took for thread: {time.time() - start_time}.")


# Retrieval over the corpus (single process).
# Loading the embedding model is expensive, so one instance is kept per kernel session.
# Concurrent first calls may each load the model once; later calls reuse the cached one.
_EMBED_MODEL_CACHE = {}


def _get_embed_model():
    """Return the shared CPU SentenceTransformer, loading it on first use."""
    model = _EMBED_MODEL_CACHE.get(EMBED_MODEL_NAME)
    if model is None:
        # use cpu
        model = sentence_transformers.SentenceTransformer(EMBED_MODEL_NAME, device='cpu')
        # model = sentence_transformers.SentenceTransformer(EMBED_MODEL_NAME, device=f"cuda:0")
        _EMBED_MODEL_CACHE[EMBED_MODEL_NAME] = model
    return model


def _load_corpus_texts():
    """Read the "content" field of every record in FILE_PATH."""
    with open(FILE_PATH, 'r') as f:
        return [item["content"] for item in jsonlines.Reader(f)]


def query_llm(query, is_local=True, start=None, end=None):
    """Return the three corpus passages closest to ``query``, joined by newlines.

    The persistent Chroma collection is opened on each call. If it holds no documents
    yet, the whole corpus is embedded and inserted first (slow; happens once).

    Args:
        query: Natural-language query string.
        is_local: Accepted for backward compatibility; the local persistent store is
            always used.
        start: Unused; kept so existing call sites keep working.
        end: Unused; kept so existing call sites keep working.

    Returns:
        A single string containing the top-3 retrieved documents separated by "\n".
    """
    db = create_chroma_db_local(CHROMA_PERSIST_DIRECTORY, CHROMA_COLLECTION_NAME)

    # You should run insert_to_db the first time!
    # Ask the collection itself whether it is empty: the persist directory is typically
    # already non-empty once the client has been opened, so checking the directory
    # listing does not reliably detect a first run.
    if db.count() == 0:
        insert_to_db(_load_corpus_texts(), model_name=EMBED_MODEL_NAME, cuda_idx=0, db=db)

    # The previous per-query process pool was removed: it passed a single tuple as the
    # only argument of insert_to_db and never read the results, so any failure in the
    # workers was silently dropped.
    query_embedding = sentence_embedding(_get_embed_model(), query).tolist()
    results = db.query(query_embeddings=query_embedding, n_results=3)
    return '\n'.join(results['documents'][0])

# Smoke test: run one retrieval and print the concatenated passages.
query = "What is an atom"
print(query_llm(query))

paragraph : Sentence Level For representing a document , one can split it up into sentences , with each memory slot encoding one sentence . Both the key and the value encode the entire sentence as a bag - of - words . As the key and value are the same in this case , this is identical to a standard MemNN and this approach has been used in several papers .
paragraph : Window Level Documents are split up into windows of words ; in our tasks we only include windows where the center word is an entity . Windows are represented using bag - of - words . Window representations for MemNNs have been shown to work well previously . However , in Key - Value MemNNs we encode the key as the entire window , and the value as only the center word , which is not possible in the MemNN architecture . This makes sense because the entire window is more likely to be pertinent as a match for the question ( as the key ) , whereas the entity at the center is more pertinent as a match for the answer ( as the valu

In [11]:
from dspy.predict.avatar import Tool, Avatar
from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper
from langchain.tools import BaseTool, StructuredTool, tool

# Thin adapter that exposes `query_llm` as a single-string-in / single-string-out tool.
# NOTE(review): the function is wrapped with StructuredTool.from_function below, which
# presumably uses this docstring as the tool description — treat it as runtime text.
def RETRIEVE(query: str) -> str:
    """If you want to search for some paper information, you can use this tool and input a natural language query. For example, RETRIEVE(\'Which method achieves the highest PCK score?\') returns relevant paper paragraph and meta data."""
    return query_llm(query)

# Tools handed to the Avatar agents: local corpus retrieval, web search, and arXiv lookup.
# The `desc` strings are the descriptions attached to each tool, so they are runtime text.
tools = [
    # Local retrieval over the Chroma index via RETRIEVE / query_llm.
    Tool(
        tool=StructuredTool.from_function(RETRIEVE),
        name="RETRIEVE",
        desc="If you want to search for some paper information, you can use this tool and input a natural language query. For example, RETRIEVE('Which method achieves the highest PCK score?') returns relevant paper paragraph and meta data."
    ),
    # Web search through the Serper wrapper.
    # NOTE(review): presumably needs a Serper API key in the environment — confirm.
    Tool(
        tool=GoogleSerperAPIWrapper(),
        name="WEB_SEARCH",
        desc="If you have a question, you can use this tool to search the web for the answer."
    ),
    # arXiv lookup; the only tool that declares an explicit input_type.
    Tool(
        tool=ArxivAPIWrapper(),
        name="ARXIV_SEARCH",
        desc="Pass the arxiv paper id to get the paper information.",
        input_type="Arxiv Paper ID",
    )
]

Once we have defined our `tools`, we can now create an `Avatar` object by passing the `tools` and `signature`. It takes 2 more optional parameters `verbose` and `max_iters`. `verbose` is used to display the logs and `max_iters` is used to control the number of iterations in multi step execution. 

An avatar agent stops the tool usage iteration once it reaches `max_iters` or when it prompts `Finish`. You can also create custom tools; you just need to make sure that each tool:

* Is passed as a class object.
* Implements the `__init__` and `run` methods.
* Takes 1 string as input and returns 1 string as output.

If your tool doesn't take a string as input or return a string as output, you can write a custom wrapper to take care of that for now. In the future we'll try to enable more diverse tool usage.

In [12]:
# Baseline agent: the QA signature plus the three tools, quiet logging,
# and at most 10 tool-use iterations per question.
actor_agent = Avatar(
    tools=tools,
    signature=ToolQASignature,
    verbose=False,
    max_iters=10
)

In [13]:
import time
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from datetime import datetime
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
import copy
import tqdm
import logging
import warnings
import os

# Notebook-level logger.
logger = logging.getLogger(__name__)

# Root logger: let nothing below WARNING through.
logging.getLogger().setLevel(logging.WARNING)

# Loggers that tend to be chatty; each one is pinned to WARNING below.
loggers_to_silence = (
    "httpx httpcore openai arxiv dspy langchain langchain_community "
    "requests urllib3 tiktoken asyncio faiss anthropic"
).split()

for logger_name in loggers_to_silence:
    logging.getLogger(logger_name).setLevel(logging.WARNING)

# Ignore these warning categories for the rest of the session (same order as before).
for _category in (UserWarning, DeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_category)
del _category

# Disable tokenizer parallelism warning
os.environ.update(TOKENIZERS_PARALLELISM='false')

## Evaluation

Open-ended QA tasks are hard to evaluate with rigid metrics like exact match, so we'll use an improvised LLM-as-judge to evaluate our model on the test set.

In [14]:
# LLM-as-judge signature: given the question, the reference answer and the model's
# answer, produce a rationale and a correctness score.
# NOTE(review): the docstring and the prefix/desc strings are presumably rendered into
# the judge prompt by dspy, so they are runtime text — do not reword them casually.
class Evaluator(dspy.Signature):
    """Please act as an impartial judge to evaluate whether the answer is correct based on the ground truth answer"""
    
    # Inputs shown to the judge.
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
    )
    reference_answer: str = dspy.InputField(
        prefix="Ground Truth Answer:",
        desc="Ground truth answer to the question.",
    )
    answer: str = dspy.InputField(
        prefix="Answer:",
        desc="Answer to the question given by the model.",
    )
    # Outputs requested from the judge: free-text reasoning, then a score.
    rationale: str = dspy.OutputField(
        prefix="Rationale:",
        desc="Explanation of why the answer is correct or incorrect.",
    )
    # Declared as float so partial credit between 0 and 1 can be expressed.
    is_correct: float = dspy.OutputField(
        prefix="Correct:",
        desc="Whether the answer is correct. Give 0 if incorrect, 1 if correct, (0, 1) if partially correct.",
    )


# Predictor built from the judge signature; called by `metric` below.
evaluator = dspy.TypedPredictor(Evaluator)


def metric(example, prediction, trace=None):
    """Score ``prediction.answer`` against ``example.answer`` with the LLM judge.

    Ground-truth answers are sometimes incomplete, or the predicted answer is only part
    of the ground truth, so the judge returns a continuous score rather than a hard
    0/1 match. The prediction, the reference and the score are printed on one line.

    Args:
        example: Object with ``question`` and ``answer`` (the reference) attributes.
        prediction: Object with an ``answer`` attribute (the model's answer).
        trace: Unused; accepted for compatibility with dspy-style metric signatures.

    Returns:
        The judge's ``is_correct`` value converted to ``float``.
    """
    verdict = evaluator(
        question=example.question,
        answer=prediction.answer,
        reference_answer=example.answer,
    )
    acc = float(verdict.is_correct)
    print(prediction.answer, '|', example.answer, '=>', acc)
    return acc

# Sanity check of the judge: an obviously wrong answer ('physics') for the first
# training example; the recorded output below shows a score of 0.0.
print(toolqa_train[0])
metric(toolqa_train[0], prediction=dspy.Example(answer='physics'))

Example({'question': 'Which method achieves the highest PCK score on Leeds_Sports_Poses dataset for Pose_Estimation task?', 'answer': 'Pyramid_Residual_Modules__PRMs_'}) (input_keys={'question', 'paper_id'})
physics | Pyramid_Residual_Modules__PRMs_ => 0.0


0.0

For evaluation we can't use `dspy.Evaluate`, because `Avatar` changes its signature on every iteration by adding the actions and their results to it as fields. So we create our own hacky thread-safe evaluator for it.

In [15]:
import time
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from datetime import datetime
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
import copy

# Set up logging
# NOTE(review): an earlier cell sets the root logger to WARNING and leaves its own
# basicConfig call commented out; this call requests INFO again — confirm which is intended.
logging.basicConfig(level=logging.INFO)
# Rebinds the notebook-level logger (same name as the one created in the earlier cell).
logger = logging.getLogger(__name__)

@dataclass
class APICallMetrics:
    """Token and timing figures recorded for a single tool or agent call."""
    # When the call was recorded.
    timestamp: datetime
    # Label of the tool (or "main_llm" for the agent's own call).
    tool_name: str
    tokens_in: int = 0
    tokens_out: int = 0
    # Duration of the call in seconds.
    execution_time: float = 0.0

@dataclass
class AvatarMetrics:
    """Running totals of tokens and execution time across recorded calls."""
    total_tokens_in: int = 0
    total_tokens_out: int = 0
    total_execution_time: float = 0.0
    
    def add_call(self, metrics: APICallMetrics):
        """Add one call's tokens and execution time to the totals."""
        self.total_tokens_in += metrics.tokens_in
        self.total_tokens_out += metrics.tokens_out
        self.total_execution_time += metrics.execution_time
    
    def merge(self, other: 'AvatarMetrics'):
        """Merge another AvatarMetrics instance into this one"""
        self.total_tokens_in += other.total_tokens_in
        self.total_tokens_out += other.total_tokens_out
        self.total_execution_time += other.total_execution_time

    def estimate_cost(self, model_name: str = "gpt-4") -> float:
        """Estimate the cost of the accumulated tokens for ``model_name``.

        Rates are expressed per one million tokens.

        Args:
            model_name: Key into the pricing table; "gpt-4" (the default) and "gpt-4o"
                are accepted.

        Returns:
            Input cost plus output cost.

        Raises:
            ValueError: If ``model_name`` has no entry in the pricing table.
        """
        # NOTE(review): these rates look like gpt-4o list prices; the "gpt-4" key is
        # kept for existing callers and "gpt-4o" is added because that is the model
        # the notebook configures. Confirm the figures against current pricing.
        per_million_rates = {"input": 2.5, "output": 10.0}
        pricing = {
            "gpt-4": per_million_rates,
            "gpt-4o": per_million_rates,
        }
        if model_name not in pricing:
            raise ValueError(f"Unknown model: {model_name}")
        
        rates = pricing[model_name]
        input_cost = (self.total_tokens_in / 1000000) * rates["input"]
        output_cost = (self.total_tokens_out / 1000000) * rates["output"]
        return input_cost + output_cost

class AvatarWithMetrics(Avatar):
    """Avatar subclass that accumulates token and timing figures in ``self.metrics``.

    Token counts are computed with a tiktoken encoder over the string form of the
    inputs and outputs, so they are estimates taken from text rather than usage
    numbers reported by the API.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.metrics = AvatarMetrics()
        self.tokenizer = tiktoken.encoding_for_model("gpt-4")
    
    def _count_tokens(self, text: str) -> int:
        """Return the token count of ``str(text)``; log a warning and return 0 on error."""
        try:
            return len(self.tokenizer.encode(str(text)))
        except Exception as e:
            logger.warning(f"Error counting tokens: {e}")
            return 0

    def _wrapped_tool_call(self, tool, input_text: str) -> str:
        """Run ``tool.run(input_text)`` and record its tokens and duration.

        The call is recorded whether or not the tool succeeds. When the tool raises,
        the error is logged and re-raised, and zero output tokens are recorded.

        NOTE(review): nothing in the visible code calls this helper — confirm that it
        is actually wired into the tool-execution path.
        """
        start_time = time.time()
        tokens_in = self._count_tokens(input_text)
        # Track success explicitly: `result` does not exist when tool.run raises, and
        # reading it in `finally` used to raise UnboundLocalError, hiding the real error.
        result = None
        succeeded = False
        
        try:
            result = tool.run(input_text)
            succeeded = True
        except Exception as e:
            logger.error(f"Tool execution error: {e}")
            raise
        finally:
            execution_time = time.time() - start_time
            tokens_out = self._count_tokens(str(result)) if succeeded else 0
            
            metrics = APICallMetrics(
                timestamp=datetime.now(),
                tool_name=tool.name,
                tokens_in=tokens_in,
                tokens_out=tokens_out,
                execution_time=execution_time
            )
            self.metrics.add_call(metrics)
            
        return result

    def __call__(self, *args, **kwargs):
        """Run the agent and record one "main_llm" entry for the whole call.

        Input tokens are counted over ``str(args) + str(kwargs)`` and output tokens over
        ``str(result)``; the execution time is the wall time of the parent ``__call__``.
        """
        start_time = time.time()
        result = super().__call__(*args, **kwargs)
        total_time = time.time() - start_time
        
        metrics = APICallMetrics(
            timestamp=datetime.now(),
            tool_name="main_llm",
            tokens_in=self._count_tokens(str(args) + str(kwargs)),
            tokens_out=self._count_tokens(str(result)),
            execution_time=total_time
        )
        self.metrics.add_call(metrics)
        
        return result

def multi_thread_executor(test_set, signature, num_threads=60):
    """Evaluate a fresh AvatarWithMetrics on every example using a thread pool.

    Args:
        test_set: Sequence of examples; each must support ``inputs().toDict()`` and
            carry the fields ``metric`` reads.
        signature: Signature passed to each ``AvatarWithMetrics``.
        num_threads: Maximum number of worker threads.

    Returns:
        ``(avg_metric, combined_metrics)``: the mean score over ``test_set`` (0.0 for an
        empty set) and an ``AvatarMetrics`` whose token totals are summed across
        examples and whose execution time is the wall time of the whole run.
    """
    total_examples = len(test_set)
    combined_metrics = AvatarMetrics()
    if total_examples == 0:
        # Nothing to score; also avoids dividing by zero below.
        return 0.0, combined_metrics

    def process_with_metrics(example):
        # Best effort: a failing example is printed and scored 0 with empty metrics
        # so that one bad example does not abort the whole run.
        try:
            avatar = AvatarWithMetrics(signature, tools=tools, verbose=False, max_iters=10)
            prediction = avatar(**example.inputs().toDict())
            return metric(example, prediction), avatar.metrics
        except Exception as e:
            print(e)
            return 0, AvatarMetrics()

    total_score = 0
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_with_metrics, example) for example in test_set]

        for future in tqdm.tqdm(futures, total=total_examples, desc="Processing examples"):
            score, metrics = future.result()
            total_score += score
            # Only combine token counts, not execution times: per-example times overlap
            # across threads, so wall time is recorded once below.
            combined_metrics.total_tokens_in += metrics.total_tokens_in
            combined_metrics.total_tokens_out += metrics.total_tokens_out
    
    combined_metrics.total_execution_time = time.time() - start_time

    avg_metric = total_score / total_examples
    return avg_metric, combined_metrics

def single_thread_executor(test_set, signature):
    """Evaluate a fresh AvatarWithMetrics on every example, one at a time.

    Sequential counterpart of ``multi_thread_executor`` with the same return shape.

    Args:
        test_set: Sequence of examples; each must support ``inputs().toDict()`` and
            carry the fields ``metric`` reads.
        signature: Signature passed to each ``AvatarWithMetrics``.

    Returns:
        ``(avg_metric, combined_metrics)``: the mean score over ``test_set`` (0.0 for an
        empty set) and an ``AvatarMetrics`` with summed token totals and the wall time
        of the whole run.
    """
    total_examples = len(test_set)
    combined_metrics = AvatarMetrics()
    if total_examples == 0:
        # Nothing to score; also avoids dividing by zero below.
        return 0.0, combined_metrics

    total_score = 0
    start_time = time.time()
    for example in tqdm.tqdm(test_set, desc="Processing examples"):
        try:
            avatar = AvatarWithMetrics(signature, tools=tools, verbose=False, max_iters=10)
            prediction = avatar(**example.inputs().toDict())
            score = metric(example, prediction)
        except Exception as e:
            # Best effort: a failing example contributes a score of 0 and no tokens.
            print(e)
            continue

        total_score += score
        # The returned metrics used to stay empty; fold in each agent's token counts,
        # mirroring multi_thread_executor.
        combined_metrics.total_tokens_in += avatar.metrics.total_tokens_in
        combined_metrics.total_tokens_out += avatar.metrics.total_tokens_out

    combined_metrics.total_execution_time = time.time() - start_time

    avg_metric = total_score / total_examples
    return avg_metric, combined_metrics

def format_metrics_report(metrics: AvatarMetrics, model_name: str = "gpt-4") -> str:
    """Render execution time, token totals and estimated cost as a text report.

    Args:
        metrics: Accumulated metrics; its ``estimate_cost(model_name)`` supplies the cost.
        model_name: Pricing key forwarded to ``estimate_cost``.

    Returns:
        A multi-line string that starts and ends with a newline.
    """
    cost = metrics.estimate_cost(model_name)
    tokens_in = metrics.total_tokens_in
    tokens_out = metrics.total_tokens_out

    report_lines = [
        "",
        "Avatar Execution Metrics Report",
        "==============================",
        f"Execution Time: {metrics.total_execution_time:.2f} seconds",
        f"Total Tokens: {tokens_in + tokens_out:,} ({tokens_in:,} in, {tokens_out:,} out)",
        f"Estimated Cost: ${cost:.4f}",
        "",
    ]
    return "\n".join(report_lines)

## One-shot result

The method that achieves the highest Score score on the Atari_2600_Name_This_Game dataset for the Atari_Games task is MuZero. | IQN => 0.0


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6.EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
 | CVT___Multi-Task => 0.0
The current state-of-the-art MAP score on the WikiQA dataset is achieved by TANDA-DeBERTa-V3-Large + ALL, with a MAP score of 92%. | Key-Value_Memory_Network => 0.0
The VGG_Resnet_LACE_BiLSTM acoustic model trained on SWB+Fisher+CH is evaluated on datasets such as AMI eval, SWB/CH eval, and WSJ eval for the Speech Recognition task. | swb_hub_500_WER_fullSWBCH => 0.0
The current information available does not specify the method with the highest Parameters score on the SNLI dataset for Natural Language Inference. Further detailed research or specific academic papers may be needed to find this information. | 300D_Residual_stacked_encoders => 0.0
The 

Processing examples: 100%|██████████| 60/60 [01:40<00:00,  1.67s/it]

The PNN method is evaluated on the Bing_News dataset for the Click-Through Rate Prediction task using the Area Under the Curve (AUC) metric. | AUC, Log_Loss => 0.5





In [None]:
from typing import Any, Dict, List, Optional
import json
from new_optimizer import AvatarOptimizerWithMetrics

class ExperimentRunner:
    """Runs the evaluation strategies on the ToolQA test set and collects their results.

    Four strategies are available: one-shot, batch sampling, iterative evolution and
    mixed. Optimisation runs are cached per (strategy key, max_iters) so strategies that
    share an optimisation (one-shot/batch sampling, iterative/mixed) compile only once.
    Every ``run_*`` method returns a dict with ``"status"`` set to ``"success"`` or
    ``"failed"``; failures carry the exception text under ``"error"`` instead of raising.
    """

    def __init__(self, toolqa_train, toolqa_test, actor_agent, experiment_config=None):
        """Store the data splits and configuration and initialise the results record.

        Args:
            toolqa_train: Training examples handed to the optimizer's ``compile``.
            toolqa_test: Test examples handed to the optimizer's evaluators.
            actor_agent: Stored on the instance; the optimisation helper builds its own
                ``Avatar`` and does not read this attribute.
            experiment_config: Optional dict; ``max_iters`` and ``batch_size`` are read
                by ``run_experiments``.
        """
        self.toolqa_train = toolqa_train
        self.toolqa_test = toolqa_test
        self.actor_agent = actor_agent
        self.experiment_config = experiment_config or {}
        self.results = {
            "strategies": {},
            "timestamp": time.strftime("%Y%m%d-%H%M%S"),
            "config": experiment_config
        }
        # Cache for optimized agents and metrics
        self.optimization_cache = {}

    @staticmethod
    def _failure(error) -> Dict[str, Any]:
        """Build the payload recorded for a strategy that could not be run."""
        return {
            "status": "failed",
            "error": str(error)
        }

    def _get_or_create_optimization(self, max_iters, strategy_key):
        """Helper method to get or create optimization results with caching"""
        cache_key = f"{strategy_key}_{max_iters}"
        
        if cache_key not in self.optimization_cache:
            # Create optimizer with specified parameters
            optimizer = AvatarOptimizerWithMetrics(
                metric=metric,
                max_iters=max_iters,
                max_negative_inputs=10,
                max_positive_inputs=10,
                lower_bound=0.5,
                upper_bound=0.5
            )

            # A fresh agent per optimisation run, limited to the same number of iterations.
            actor_agent = Avatar(
                tools=tools,
                signature=ToolQASignature,
                verbose=False,
                max_iters=max_iters
            )
            # Run compilation
            result = optimizer.compile(
                student=actor_agent,
                trainset=self.toolqa_train
            )
            
            # Cache the results
            self.optimization_cache[cache_key] = {
                "agent": result["agent"],
                "metrics": result["metrics"],
                "optimizer": optimizer
            }
        
        return self.optimization_cache[cache_key]

    def run_one_shot(self) -> Dict[str, Any]:
        """Run one-shot generation strategy"""
        try:
            # Get or create optimization with 0 iterations
            opt_results = self._get_or_create_optimization(0, "one_shot")
            optimized_agent = opt_results["agent"]
            optimizer = opt_results["optimizer"]
            
            score, eval_metrics = optimizer.thread_safe_evaluator(
                self.toolqa_test, 
                optimized_agent
            )
            
            return {
                "performance": score,
                "execution_time": eval_metrics['evaluation_time'],
                "cost": eval_metrics['total_cost'],
                "status": "success"
            }
        except Exception as e:
            return self._failure(e)

    def run_batch_sampling(self, batch_num: int = 4) -> Dict[str, Any]:
        """Run batch sampling strategy using the same optimization as one-shot"""
        try:
            # Use the same optimization as one-shot
            opt_results = self._get_or_create_optimization(0, "one_shot")
            optimized_agent = opt_results["agent"]
            optimizer = opt_results["optimizer"]
            
            score, batch_metrics = optimizer.thread_safe_evaluator_batch(
                self.toolqa_test, 
                optimized_agent, 
                batch_num
            )
            
            return {
                "performance": score,
                "execution_time": batch_metrics['total_execution_time'],
                "cost": batch_metrics['total_cost'],
                "best_batch_score": batch_metrics['final_score'],
                "status": "success"
            }
        except Exception as e:
            return self._failure(e)

    def run_iterative_evolution(self, max_iters: int = 1) -> Dict[str, Any]:
        """Run iterative evolution strategy"""
        try:
            # Get or create optimization with specified iterations
            opt_results = self._get_or_create_optimization(max_iters, "iterative")
            optimized_agent = opt_results["agent"]
            optimizer = opt_results["optimizer"]
            optimization_metrics = opt_results["metrics"]
            
            score, eval_metrics = optimizer.thread_safe_evaluator(
                self.toolqa_test, 
                optimized_agent
            )
            
            return {
                "performance": score,
                "evaluation_time": eval_metrics['evaluation_time'],
                "execution_time": eval_metrics['evaluation_time'] + optimization_metrics['total_execution_time'],
                "cost": eval_metrics['total_cost'],
                "optimization_cost": optimization_metrics['total_cost'],
                "optimization_time": optimization_metrics['total_execution_time'],
                "iteration_details": optimization_metrics['iteration_details'],
                "status": "success"
            }
        except Exception as e:
            return self._failure(e)

    def run_mixed_strategy(self, max_iters: int = 1, batch_num: int = 2) -> Dict[str, Any]:
        """Run mixed strategy using the same optimization as iterative evolution"""
        try:
            # Use the same optimization as iterative evolution
            opt_results = self._get_or_create_optimization(max_iters, "iterative")
            optimized_agent = opt_results["agent"]
            optimizer = opt_results["optimizer"]
            optimization_metrics = opt_results["metrics"]
            
            score, batch_metrics = optimizer.thread_safe_evaluator_batch(
                self.toolqa_test, 
                optimized_agent,
                batch_num
            )
            
            return {
                "performance": score,
                "execution_time": batch_metrics['total_execution_time'],
                "cost": batch_metrics['total_cost'],
                "optimization_cost": optimization_metrics['total_cost'],
                "optimization_time": optimization_metrics['total_execution_time'],
                "best_batch_score": batch_metrics['final_score'],
                "status": "success"
            }
        except Exception as e:
            return self._failure(e)

    def run_experiments(self, strategies: Optional[list] = None) -> Dict[str, Any]:
        """Run specified strategies and collect results

        Args:
            strategies: Strategy names to run, in order. Defaults to all four:
                'one_shot', 'batch_sampling', 'iterative_evolution', 'mixed'.

        Returns:
            ``self.results``; each strategy's outcome is stored under
            ``results['strategies'][name]``. An unrecognised name is recorded as a
            failed entry rather than being skipped silently.
        """
        if strategies is None:
            strategies = ['one_shot', 'batch_sampling', 'iterative_evolution', 'mixed']
        
        max_iters = self.experiment_config.get('max_iters', 1)
        batch_size = self.experiment_config.get('batch_size', 2)

        # Strategy name -> zero-argument callable producing that strategy's result dict.
        dispatch = {
            'one_shot': self.run_one_shot,
            'batch_sampling': lambda: self.run_batch_sampling(batch_size),
            'iterative_evolution': lambda: self.run_iterative_evolution(max_iters),
            'mixed': lambda: self.run_mixed_strategy(max_iters, batch_size),
        }
        
        for strategy in strategies:
            run_strategy = dispatch.get(strategy)
            if run_strategy is None:
                # A typo in a strategy name should be visible in the results.
                self.results['strategies'][strategy] = self._failure(
                    f"Unknown strategy: {strategy!r}"
                )
                continue
            self.results['strategies'][strategy] = run_strategy()
        
        return self.results

    def save_results(self, output_path: str):
        """Save results to a JSON file"""
        with open(output_path, 'w') as f:
            json.dump(self.results, f, indent=2)

In [29]:
# Settings read by ExperimentRunner.run_experiments: 'max_iters' and 'batch_size'.
# 'experiment_name' is not read by any code visible in this notebook.
experiment_config = {
        'max_iters': 0,
        'batch_size': 4,
        'experiment_name': 'toolqa_experiment'
    }
# Runner over the 40/60 train/test split built earlier.
runner = ExperimentRunner(toolqa_train, toolqa_test, actor_agent, experiment_config)

In [30]:
# Run only the one-shot strategy; its outcome is stored under results['strategies']['one_shot'].
results = runner.run_experiments(['one_shot'])


                Optimization Process Metrics
                Total Execution Time: 0.00 seconds
                Evaluation Time: 0.00 seconds
                Total API Calls: 0
                - Comparator calls: 0
                - Feedback instruction calls: 0

                Token Usage:
                ----------
                Total Tokens: 0
                - Input tokens: 0
                - Output tokens: 0

                Cost Analysis:
                ------------
                Estimated Total Cost: $0.0000
                


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The method that achieves the highest Score score on the Atari_2600_Name_This_Game dataset for the Atari_Games task is MuZero. | IQN => 0.0


Processing examples:   2%|▏         | 1/60 [00:09<09:13,  9.39s/it]

The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6. | CVT___Multi-Task => 0.0
EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The DQN_hs method is evaluated on the Atari 2600 games, as mentioned in the context of various research papers discussing reinforcement learning methods applied to Atari games. | Atari_2600_Chopper_Command => 0.0
The U-Net method for Skin Cancer Segmentation is evaluated on several datasets, including the ISIC-2018 dataset and the HAM10000 dataset. | Kaggle_Skin_Lesion_Segmentation => 0.0
The ACF-WIDER method achieves the highest AP score for the Face Detection task on the WiderFace dataset. | WIDER_Face__Easy_ => 0.0
The search did not yield specific datasets for the Deep_Speech method evaluation in Speech Recognition. Further detailed search or 

Processing examples: 100%|██████████| 60/60 [01:46<00:00,  1.77s/it]

The PNN method is evaluated on the Bing_News dataset for Click-Through Rate Prediction using metrics such as Area Under the Curve (AUC). | AUC, Log_Loss => 0.5






Evaluation Metrics Report
Execution Time: 107.25 seconds
Total Tokens: 89,477 (1,642 in, 87,835 out)
Total Cost: $0.8825
Average Score: 0.242


In [32]:
# Run only batch sampling (batch_size comes from experiment_config); it uses the same
# cached optimisation key as the one-shot strategy.
results = runner.run_experiments(['batch_sampling'])

Processing batch 1 of 4...


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6. | CVT___Multi-Task => 0.0
The method that achieves the highest Score score on the Atari_2600_Name_This_Game dataset for the Atari_Games task is MuZero. | IQN => 0.0
The Discriminative Unsupervised Feature Learning with Convolutional Neural Networks method is evaluated on the STL-10 dataset for the Image Classification task. | STL-10 => 1.0
LiteFlowNet achieves the highest Average End-Point Error score for Optical Flow Estimation on the Sintel final pass and KITTI benchmarks. | Sintel-final => 0.5
The method that achieves the highest MAP score on the WikiQA dataset for the Question Answering task is TANDA, which achieved a MAP score of 92%. | Key-Value_Memory_Network => 0.0

Processing examples: 100%|██████████| 60/60 [01:43<00:00,  1.72s/it]

The PNN method is evaluated on the Bing_News dataset for Click-Through Rate Prediction using metrics such as Area Under the Curve (AUC). | AUC, Log_Loss => 0.5





Processing batch 2 of 4...
The method that achieves the highest Score score on the Atari_2600_Name_This_Game dataset for the Atari_Games task is MuZero. | IQN => 0.0
EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6. | CVT___Multi-Task => 0.0


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The Discriminative Unsupervised Feature Learning with Convolutional Neural Networks method is evaluated on the STL-10 dataset for the Image Classification task. | CIFAR-10 => 0.0
The DQN_hs method is evaluated on the Atari 2600 games, as mentioned in the context of various research papers discussing reinforcement learning methods applied to Atari games. | Atari_2600_Chopper_Command => 0.0
The method that achieves the highest MAP score on the WikiQA dataset for the Question Answering task is HyperQA with a score of 0.712. | Key-Value_Memory_Network => 0.0
The ACF-WIDER method achieves the highest AP score for the Face Detection task on the WiderFace dataset. | WIDER_Face__Easy_ => 0.0
The novel directed hypergraph neural network method achieves the highest accuracies on the Cora dataset for the node classification task. | GCN => 0.0
The U-Net method for Skin Cancer Segmentation is evaluated on several datasets, including the ISIC-2018 dataset and the HAM10000 dataset. | Kaggle_Skin_Lesi

Processing examples: 100%|██████████| 60/60 [01:45<00:00,  1.75s/it]

The PNN method is evaluated on the Bing_News dataset for Click-Through Rate Prediction using metrics such as Area Under the Curve (AUC). | AUC, Log_Loss => 0.5





Processing batch 3 of 4...
The DQN_hs method is evaluated on the Atari 2600 games, as mentioned in the context of various research papers discussing reinforcement learning methods applied to Atari games. | Atari_2600_Chopper_Command => 0.0
The Discriminative Unsupervised Feature Learning with Convolutional Neural Networks method is evaluated on the STL-10 dataset for the Image Classification task. | CIFAR-10 => 0.0
The VGG_Resnet_LACE_BiLSTM_acoustic_model trained on SWB+Fisher+CH is evaluated on datasets such as Switchboard (SWB) and CallHome (CH) for the Speech Recognition task. | swb_hub_500_WER_fullSWBCH => 0.5
The U-Net method for Skin Cancer Segmentation is evaluated on several datasets, including the ISIC-2018 dataset and the HAM10000 dataset. | Kaggle_Skin_Lesion_Segmentation => 0.0
EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The method that achieves the highest F1 score 

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The SRCNN method for Video Super-Resolution is evaluated on datasets such as Set5, Set14, and the Timofte dataset, which are commonly used in single image super-resolution tasks. | Vid4_-_4x_upscaling => 0.0
The 3DDFA method is evaluated on the Florence dataset for 3D Face Reconstruction using the standard benchmark metric, which is the geometric error between reconstructed meshes and the ground truth. | Mean_NME_ => 0.5
LiteFlowNet achieves the highest Average End-Point Error score for Optical Flow Estimation on the Sintel final pass and KITTI benchmarks. | Sintel-final => 0.5
The method achieving the highest Number_of_params score on the WikiText-2 dataset for the Language Modelling task is not explicitly mentioned in the retrieved results. However, the paper "Improving Neural Language Modeling via Adversarial Training" reports a state-of-the-art test perplexity score of 38.07 on WikiText-2, which is a relevant performance metric for language models. | AWD-LSTM-DOC => 0.0
The method 

Processing examples:   2%|▏         | 1/60 [01:22<1:21:04, 82.45s/it]

MuZero achieves the highest Score score on the Atari 2600 Name This Game dataset for the Atari Games task with a score of 157177.85. | IQN => 0.0
The Spynet method is evaluated on the MPI-Sintel dataset for the Optical_Flow_Estimation task. | Sintel-final => 0.5
The Prior_Duel_hs method evaluation metrics for the Atari_2600_Alien dataset in the Atari_Games task could not be found in the available resources. It is recommended to check the original research paper or supplementary materials for specific evaluation metrics used. | Score => 0.0
The DPN-131 method is evaluated on several datasets for the Image Classification task, including the OSIE dataset, which is a small training dataset, and the Places365-Standard dataset, which is a high-resolution scene understanding dataset with more than 1.8 million images of 365 scene categories. | ImageNet => 0.0
The ConvNet method is evaluated on the Pascal3D dataset for the Keypoint Detection task using metrics such as Average Precision (AP) for

Processing examples:   3%|▎         | 2/60 [01:30<37:09, 38.44s/it]  

The PNN method is evaluated on the Bing_News dataset for Click-Through Rate Prediction using metrics such as accuracy and computational efficiency, as inferred from the general context of PNN evaluations. However, specific metrics for the Bing_News dataset were not explicitly found in the retrieved documents. | AUC, Log_Loss => 0.0
The DeepLab-LargeFOV method is evaluated on the SUN-RGBD dataset for Scene Segmentation using metrics such as boundary F1-measure (BF) to complement existing metrics that are more biased towards region accuracies. | Mean_IoU => 0.0
The SRCNN method is evaluated on the Manga109_-_4x_upscaling dataset using metrics such as PSNR (Peak Signal-to-Noise Ratio) and IFC (Information Fidelity Criterion). | PSNR, SSIM => 0.5
The DRCN method is evaluated on the Set5 dataset for 4x upscaling using the PSNR (Peak Signal-to-Noise Ratio) metric. The DRCN method's performance is compared to other methods, and it achieves higher PSNR values, indicating better image quality. 

Processing examples:  10%|█         | 6/60 [01:48<11:02, 12.27s/it]

The Impatient_Reader method is evaluated on the CNN/Daily Mail dataset for the Question_Answering task using accuracy as the primary metric. The performance is measured by the proportion of test cases where the ground truth is among the top answers proposed by the model. | CNN, Daily_Mail => 0.5


Processing examples:  52%|█████▏    | 31/60 [01:50<00:47,  1.64s/it]

The FDNet method is evaluated on the WIDER_FACE Easy dataset for the Face Detection task using metrics such as precision and recall, achieving a 95.9% accuracy on the easy set. | AP => 0.0


Processing examples:  63%|██████▎   | 38/60 [01:52<00:28,  1.28s/it]

The CRN method for Image-to-Image Translation task is not specifically mentioned in the retrieved documents. Therefore, I cannot provide the datasets it was evaluated on. | ADE20K-Outdoor_Labels-to-Photos => 0.0
The current state-of-the-art method for the Yelp Binary classification dataset for Sentiment Analysis is XLNet. | Char-level_CNN => 0.0


Processing examples:  67%|██████▋   | 40/60 [01:52<00:23,  1.18s/it]

The available resources did not provide the specific dataset on which the DDQN__tuned__noop method achieves the highest Score score for the Atari_Games task. Further detailed research or access to specific experimental results may be required to obtain this information. | Atari_2600_Video_Pinball => 0.0


Processing examples: 100%|██████████| 60/60 [01:54<00:00,  1.90s/it]

The method EASE achieves the highest Recall_50 score of 0.428 on the Million Song Dataset for the Collaborative Filtering task. | Mult-VAE_PR => 0.0





Processing batch 4 of 4...


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The method that achieves the highest MAP score on the WikiQA dataset for the Question Answering task is HyperQA with a score of 0.712. | Key-Value_Memory_Network => 0.0
The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6. | CVT___Multi-Task => 0.0
The SRCNN method for Video Super-Resolution is evaluated on datasets such as Set5, Set14, and the Timofte dataset, which are commonly used in single image super-resolution tasks. | Vid4_-_4x_upscaling => 0.0
The method achieving the highest Number_of_params score on the WikiText-2 dataset for the Language Modelling task is not explicitly mentioned in the retrieved results. However, the paper "Improving Neural Language Modeling via Adversarial Training" reports a state-of-the-art te

Processing examples:   2%|▏         | 1/60 [01:41<1:40:09, 101.86s/it]

MuZero achieves the highest Score score on the Atari_2600_Name_This_Game dataset for the Atari_Games task. | IQN => 0.0
The NICE method is evaluated on the CIFAR-10 dataset for the Image Generation task using metrics such as Inception score and bits per dimension (perplexity). | NLL_Test => 0.0
The DPN-131 method is evaluated on several datasets for the Image Classification task, including the OSIE dataset, ImageNet, and Places365-Standard dataset. | ImageNet => 0.5


Processing examples:   3%|▎         | 2/60 [01:43<41:30, 42.93s/it]   

The PNN method is evaluated on the Bing_News dataset for Click-Through Rate Prediction using metrics such as accuracy and computational efficiency, as inferred from the general context of PNN evaluations. However, specific metrics for the Bing_News dataset were not explicitly found in the retrieved documents. | AUC, Log_Loss => 0.0
The method that achieves the highest error score on the Yelp Binary classification dataset for the Sentiment Analysis task is a shallow word model, which establishes a state-of-the-art performance with an accuracy of 95.9%. | Char-level_CNN => 0.0
The FDNet method is evaluated on the WIDER_Face Easy dataset for the Face Detection task using metrics such as precision and recall, achieving 95.9% on the easy set. | AP => 0.0


Processing examples:  25%|██▌       | 15/60 [01:44<02:46,  3.70s/it]

The Stacked Hourglass Networks achieve the highest PCK_0_2 score for the Pose Estimation task on the MPII dataset. | FLIC_Elbows => 0.0


Processing examples: 100%|██████████| 60/60 [01:45<00:00,  1.77s/it]

The CRN method datasets for the Image-to-Image Translation task were not found in the available resources. Further specific research or access to the original paper detailing the CRN method might be necessary to obtain this information. | ADE20K-Outdoor_Labels-to-Photos => 0.0






Batch Evaluation Metrics Report
Total Execution Time: 459.88 seconds
Average Time per Batch: 114.97 seconds
Best Score: 0.242 (Batch 1)
Total Tokens: 372,073 (6,568 in, 365,505 out)
Total Cost: $3.6715

Per-Batch Performance:
--------------------

Batch 1:
  Score: 0.242
  Execution Time: 108.73s
  Tokens: 93,297 (1,642 in, 91,655 out)
  Cost: $0.9207

Batch 2:
  Score: 0.208
  Execution Time: 117.48s
  Tokens: 92,540 (1,642 in, 90,898 out)
  Cost: $0.9131

Batch 3:
  Score: 0.217
  Execution Time: 122.70s
  Tokens: 94,888 (1,642 in, 93,246 out)
  Cost: $0.9366

Batch 4:
  Score: 0.217
  Execution Time: 110.97s
  Tokens: 91,348 (1,642 in, 89,706 out)
  Cost: $0.9012


In [33]:
# Display what the runner has accumulated so far: per-strategy performance,
# execution time, cost and status, plus the run timestamp and config.
runner.results

{'strategies': {'one_shot': {'performance': 0.24166666666666667,
   'execution_time': 107.25170588493347,
   'cost': 0.882455,
   'status': 'success'},
  'batch_sampling': {'performance': 0.24166666666666667,
   'execution_time': 459.88402676582336,
   'cost': 3.6714700000000002,
   'best_batch_score': 0.24166666666666667,
   'status': 'success'}},
 'timestamp': '20241207-201811',
 'config': {'max_iters': 0,
  'batch_size': 4,
  'experiment_name': 'toolqa_experiment'}}

In [36]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Disable tokenizer parallelism warning

# Evaluate ToolQASignature over `toolqa_test`. The call is unpacked into an
# overall score and a metrics object that a later cell renders with
# format_metrics_report.
score, metrics = multi_thread_executor(toolqa_test, ToolQASignature)

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6. | CVT___Multi-Task => 0.0
The VGG_Resnet_LACE_BiLSTM acoustic model trained on SWB+Fisher+CH is evaluated on datasets such as AMI eval, SWB/CH eval, and WSJ eval for the Speech Recognition task. | swb_hub_500_WER_fullSWBCH => 0.0
EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The ACF-WIDER method achieves the highest AP score for the Face Detection task on the WiderFace dataset. | WIDER_Face__Easy_ => 0.0
The 3DDFA method is evaluated on the Florence dataset for 3D Face Reconstruction using the standard benchmark metric, which is the geometric error between reconstructed meshes and the ground truth. | Mean_NME_ => 0.5
The Paragraph_vector method for the Question Answering task has been evaluated on datasets such as SQu

Processing examples:   2%|▏         | 1/60 [01:26<1:24:49, 86.27s/it]

The NICE method is evaluated on the CIFAR-10 dataset for the Image Generation task using metrics such as Inception score and bits per dimension (perplexity). | NLL_Test => 0.0
The current state-of-the-art method achieving the highest score on the Atari 2600 Name This Game dataset for the Atari Games task is MuZero. | IQN => 0.0
The Field-gating Seq2seq dual attention method is evaluated on the WikiBio dataset using metrics such as BLEU. The method outperforms competitive baselines by a significant margin, with the dual attention mechanism boosting performance by over 1 BLEU compared to the vanilla attention mechanism. | BLEU, ROUGE => 0.5
The method EASE achieves the highest Recall_50 score of 0.428 on the Million Song Dataset for the Collaborative Filtering task. | Mult-VAE_PR => 0.0
The AWD-LSTM-DOC method is evaluated on the WikiText-2 dataset for the Language Modelling task using the metric of perplexity. | Number_of_params, Test_perplexity, Validation_perplexity => 0.5
The availab

Processing examples:   3%|▎         | 2/60 [01:50<48:01, 49.68s/it]  

The PNN method is evaluated on the Bing_News dataset for Click-Through Rate Prediction using the Area Under the Curve (AUC) metric. | AUC, Log_Loss => 0.5


Processing examples: 100%|██████████| 60/60 [01:51<00:00,  1.86s/it]

The ConvNet method for Keypoint Detection on the Pascal3D dataset is evaluated using the Percentage of Correct Keypoints (PCK) metric and the area under the PCK-over-alpha curve. | Mean_PCK => 0.5





In [38]:
# Report the test score (two decimals), then the formatted execution metrics
# (time, token usage, estimated cost) for the evaluation run above.
print(f"Test Score: {score:.2f}")
print(format_metrics_report(metrics))

Test Score: 0.24

Avatar Execution Metrics Report
Execution Time: 113.34 seconds
Total Tokens: 94,600 (1,702 in, 92,898 out)
Estimated Cost: $0.9332



In [35]:
# Pure batch sampling: max_iters=0 requests zero optimization iterations, so
# this cell only compiles the agent and then evaluates it `batch_num` times.

from new_optimizer import AvatarOptimizerWithMetrics

batch_num = 2  # number of evaluation batches to run on the test set

batch_sampling_monkey = AvatarOptimizerWithMetrics(
    metric=metric,
    max_iters=0,
    # Metric thresholds for labelling examples as negative / positive.
    lower_bound=0.5,
    upper_bound=0.5,
    # Caps on the positive / negative examples sampled for the comparator.
    max_positive_inputs=10,
    max_negative_inputs=10,
)

# NOTE(review): `result` is also assigned by the iterative-optimizer cell;
# whichever cell ran last determines what later cells see.
result = batch_sampling_monkey.compile(student=actor_agent, trainset=toolqa_train)

# Last expression of the cell, so its return value is displayed as the output.
batch_sampling_monkey.thread_safe_evaluator_batch(
    toolqa_test,
    result['agent'],
    batch_num,
)


                Optimization Process Metrics
                Total Execution Time: 0.00 seconds
                Evaluation Time: 0.00 seconds
                Total API Calls: 0
                - Comparator calls: 0
                - Feedback instruction calls: 0

                Token Usage:
                ----------
                Total Tokens: 0
                - Input tokens: 0
                - Output tokens: 0

                Cost Analysis:
                ------------
                Estimated Total Cost: $0.0000
                
Processing batch 1 of 2...


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6. | CVT___Multi-Task => 0.0
EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The VGG_Resnet_LACE_BiLSTM acoustic model trained on SWB+Fisher+CH is evaluated on datasets such as AMI eval, SWB/CH eval, and WSJ eval for the Speech Recognition task. | swb_hub_500_WER_fullSWBCH => 0.0
The DQN_hs method is evaluated on the Atari 2600 games, as mentioned in the context of various research papers discussing reinforcement learning methods applied to Atari games. | Atari_2600_Chopper_Command => 0.0
The Discriminative Unsupervised Feature Learning with Convolutional Neural Networks method is evaluated on the STL-10 dataset for the Image Classification task. | CIFAR-10 => 0.0
The method that achieves the highest error score on the Yel

Processing examples:   2%|▏         | 1/60 [01:38<1:36:28, 98.11s/it]

MuZero achieves the highest Score score on the Atari_2600_Name_This_Game dataset for the Atari_Games task. | IQN => 0.0
The VAT_EntMin method for Semi-Supervised Image Classification is evaluated on the MNIST, SVHN, and CIFAR-10 datasets. | CIFAR-10__4000_Labels => 0.0
The ConvNet method is evaluated on the Pascal3D dataset for the Keypoint Detection task using metrics such as Average Precision (AP) for detection and Average Viewpoint Precision (AVP) for joint detection and pose estimation. | Mean_PCK => 0.0
The 300D_NTI-SLSTM-LSTM_encoders method is evaluated on the Stanford Natural Language Inference (SNLI) dataset for the Natural Language Inference task. | SNLI => 1.0
The DeepLab-LargeFOV method is evaluated on the SUN-RGBD dataset for Scene Segmentation using metrics such as boundary F1-measure (BF) to complement existing metrics that are more biased towards region accuracies. | Mean_IoU => 0.0
The SRCNN method is evaluated on the Manga109_-_4x_upscaling dataset using metrics such 

Processing examples: 100%|██████████| 60/60 [01:56<00:00,  1.95s/it] 

The PNN method is evaluated on the Bing_News dataset for Click-Through Rate Prediction using metrics such as accuracy and computational efficiency, as inferred from the general context of PNN evaluations. However, specific metrics for the Bing_News dataset were not explicitly found in the retrieved documents. | AUC, Log_Loss => 0.0





Processing batch 2 of 2...
The VGG_Resnet_LACE_BiLSTM acoustic model trained on SWB+Fisher+CH is evaluated on datasets such as AMI eval, SWB/CH eval, and WSJ eval for the Speech Recognition task. | swb_hub_500_WER_fullSWBCH => 0.0
EffNet-L2 (SAM) achieves the highest Percentage_correct score of 96.08 on the CIFAR-100 dataset for the Image Classification task. | Res2NeXt-29 => 0.0
The method that achieves the highest F1 score on the CoNLL 2003 English dataset for Named Entity Recognition (NER) is ACE + document-context with an F1 score of 94.6. | CVT___Multi-Task => 0.0


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The method that achieves the highest Score score on the Atari_2600_Robotank dataset for the Atari_Games task is MuZero with a score of 131.13. | Bootstrapped_DQN => 0.0
The SRCNN method for Video Super-Resolution is evaluated on datasets such as Set5, Set14, and the Timofte dataset, which are commonly used in single image super-resolution tasks. | Vid4_-_4x_upscaling => 0.0
The Stacked Hourglass Networks method achieves the highest PCK_0_2 score for the Pose Estimation task on the FLIC dataset, with a score of 97.0%. | FLIC_Elbows => 0.0
The U-Net method for Skin Cancer Segmentation is evaluated on several datasets, including the ISIC-2018 dataset and the HAM10000 dataset. | Kaggle_Skin_Lesion_Segmentation => 0.0
The TANDA model achieves the highest MAP score on the WikiQA dataset for the Question Answering task, with a MAP score of 92%. | Key-Value_Memory_Network => 0.0
The evaluation metrics for the Prior_Duel_hs method on the Atari_2600_Alien dataset for the Atari_Games task are not

Processing examples:   2%|▏         | 1/60 [01:38<1:37:03, 98.70s/it]

The Impatient_Reader method is evaluated on the CNN/Daily Mail dataset for the Question_Answering task using accuracy as the primary metric. The performance is measured by the proportion of test cases where the ground truth is among the top answers proposed by the model. | CNN, Daily_Mail => 0.5
MuZero achieves the highest score on the Atari 2600 Name This Game dataset for the Atari Games task with a score of 157177.85. | IQN => 0.0
The DPN-131 method is evaluated on datasets such as the RVL-CDIP dataset, Tobacco-3482 dataset, and Places365-Standard dataset for the Image Classification task. | ImageNet => 0.0
The DeepLab-LargeFOV method is evaluated on the SUN-RGBD dataset for Scene Segmentation using metrics such as accuracy and boundary F1-measure (BF). These metrics are used to assess the performance of segmentation architectures, focusing on both region accuracies and boundary precision. | Mean_IoU => 0.0
The DR-BiLSTM (Single) model achieves the highest Train Accuracy score on the

Processing examples: 100%|██████████| 60/60 [01:49<00:00,  1.83s/it] 

The PNN method is evaluated on the Bing_News dataset for the Click-Through Rate Prediction task using the Area Under the Curve (AUC) metric. | AUC, Log_Loss => 0.5






Batch Evaluation Metrics Report
Total Execution Time: 240.05 seconds
Average Time per Batch: 120.03 seconds
Best Score: 0.217 (Batch 1)
Total Tokens: 183,748 (3,284 in, 180,464 out)
Total Cost: $1.8129

Per-Batch Performance:
--------------------

Batch 1:
  Score: 0.217
  Execution Time: 119.03s
  Tokens: 91,710 (1,642 in, 90,068 out)
  Cost: $0.9048

Batch 2:
  Score: 0.217
  Execution Time: 121.02s
  Tokens: 92,038 (1,642 in, 90,396 out)
  Cost: $0.9081


(0.21666666666666667,
 {'batches': [{'batch_id': 1,
    'score': 0.21666666666666667,
    'execution_time': 119.02766513824463,
    'tokens_in': 1642,
    'tokens_out': 90068,
    'cost': 0.904785},
   {'batch_id': 2,
    'score': 0.21666666666666667,
    'execution_time': 121.02484273910522,
    'tokens_in': 1642,
    'tokens_out': 90396,
    'cost': 0.908065}],
  'total_execution_time': 240.05250787734985,
  'total_tokens_in': 3284,
  'total_tokens_out': 180464,
  'best_batch': 1,
  'total_cost': 1.81285,
  'final_score': 0.21666666666666667,
  'average_batch_time': 120.02625393867493})

## Optimization

To optimize the `Actor` we'll use `AvatarOptimizer` (here via the `AvatarOptimizerWithMetrics` class imported from `new_optimizer`). It's a DSPy implementation of the [Avatar](https://github.com/zou-group/avatar/) method, which uses a comparator module to improve the `Actor`'s instruction for the given `tools`. Note that the `Actor` is the module that directs tool execution and flow; it is not the signature we pass in, and the optimizer does not modify that signature's instruction. The optimizer takes the following parameters:

* `metric`: Metric that we'll be optimizing for
* `max_iters`: Maximum number of iterations for the optimizer
* `lower_bound`: Lower bound for the metric to classify an example as negative
* `upper_bound`: Upper bound for the metric to classify an example as positive
* `max_positive_inputs`: Maximum number of positive inputs sampled for comparator
* `max_negative_inputs`: Maximum number of negative inputs sampled for comparator
* `optimize_for`: Whether we want to maximize the metric or minimize it during optimization

Once the optimizer is done, we can retrieve the optimized actor and use it for evaluation.

In [20]:
from new_optimizer import AvatarOptimizerWithMetrics

# Build the optimizer that will rewrite the actor's instruction.
# Arguments are grouped in the same order as the parameter list in the
# markdown cell above: metric, iteration budget, thresholds, sample caps.
iterative_monkey = AvatarOptimizerWithMetrics(
    metric=metric,
    # A single optimization round.
    max_iters=1,
    # Thresholds used to label examples as negative / positive.
    # NOTE(review): both are 0.5, so how a score of exactly 0.5 is labelled
    # depends on the optimizer's comparison operators — TODO confirm.
    lower_bound=0.5,
    upper_bound=0.5,
    # Caps on how many positive / negative examples are sampled for the comparator.
    max_positive_inputs=10,
    max_negative_inputs=10,
)

In [21]:
# Run the optimization on the training split. The returned mapping is read in
# the next cell through its "agent" and "metrics" keys.
result = iterative_monkey.compile(student=actor_agent, trainset=toolqa_train)

Processing examples:   0%|          | 0/40 [00:00<?, ?it/s]

The method that achieves the highest SSIM score on the Vid4 - 4x upscaling dataset for Video Super-Resolution is EvTexture+ with an SSIM score of 0.8983. | VESPCN => 0.0
OmniPose achieves the highest PCK score of 99.5% on the Leeds Sports Poses dataset for the Pose Estimation task. | Pyramid_Residual_Modules__PRMs_ => 0.0


Processing examples:   2%|▎         | 1/40 [00:09<05:51,  9.02s/it]

The highest BLEU score on the WMT2014 English-German dataset for Machine Translation is 35.14, achieved by the Transformer Cycle (Rev) model. | Weighted_Transformer__large_ => 0.0
The method that achieves the highest Medium_Human-Normalized_Score on the Atari-57 dataset for Atari Games is LBC with a score of 10077.52%. | Ape-X => 0.0
The A3C-CTS method is evaluated on the whole Atari 2600 suite, including Montezuma's Revenge and Bellemare et al.'s set of hard exploration games with sparse rewards. | Atari_2600_Venture => 0.0
The IQN method is evaluated on 57 Atari 2600 games in the ALE (Atari Learning Environment). | Atari_2600_Kung-Fu_Master => 0.5
The Bi-LSTM trained on FCE method achieves the highest F0.5 score on the FCE dataset for the Grammatical Error Detection task, as indicated by the result from the paper by Masahiro Kaneko and Mamoru Komachi. | CoNLL-2014_A2 => 0.0
The ByteNet method is evaluated on the English-to-German WMT translation task for Machine Translation. | WMT201

Processing examples:   5%|▌         | 2/40 [00:50<17:43, 28.00s/it]

The Duel_noop method is evaluated on 57 Atari games for the Atari_Games task. | Atari_2600_Time_Pilot => 0.0
The Mult-DAE method is evaluated on the Netflix dataset using the NDCG@100 metric for the Collaborative Filtering task. | Recall_20, Recall_50 => 0.0
The TARNet method is evaluated on the semi-synthetic IHDP dataset and the Jobs dataset, which includes both a randomized and a non-randomized component, for the Causal Inference task. | IDHP => 0.5
The Duel_hs method is evaluated on the 57 Atari games dataset, which includes a variety of games used for benchmarking reinforcement learning algorithms. | Atari_2600_Video_Pinball => 0.0
The BiDAF___Self_Attention__single_model_ method is evaluated on the SQuAD and CNN/DailyMail datasets for the Question Answering task. | SQuAD1_1 => 0.0
The DeepFM method achieves the highest Log_Loss score for the Click-Through Rate Prediction task on the Criteo dataset. The Criteo dataset is a well-known ad tech industry benchmarking dataset used for 

Processing examples:  10%|█         | 4/40 [00:56<07:25, 12.38s/it]

The DDQN__tuned__noop method is evaluated on 57 Atari games. | Atari_2600_Berzerk => 0.0
The Duel_noop method is evaluated on 57 Atari games for the Atari_Games task. | Atari_2600_Ms__Pacman => 0.0
The LapSRN method is evaluated on the Urban100 - 4x upscaling dataset using Peak Signal-to-Noise Ratio (PSNR) and Structural Similarity Index (SSIM) as the quality metrics. | PSNR => 0.5
The IDE____CamStyle method is evaluated on the PRID2011, iLIDS-VID, and VIPeR datasets for the Person Re-Identification task. | DukeMTMC-reID => 0.0
The DANN method is evaluated on the Multi-Domain Sentiment Dataset using classification accuracy as the primary metric. The dataset includes Amazon reviews from four domains, and the evaluation involves 12 domain adaptation tasks. The DANN method is compared against a standard neural network and a Support Vector Machine, with DANN showing significantly better performance in terms of classification accuracy. | Average, Books, DVD, Electronics, Kitchen => 0.0
The 

Processing examples:  15%|█▌        | 6/40 [00:59<04:04,  7.19s/it]

The PFF method for Image Super-Resolution is evaluated on the Set5 and Set14 datasets. | Set14_-_4x_upscaling => 0.5
The ResNet_ELU method is evaluated on the CIFAR-100 dataset using metrics such as test error percentage. The ELU networks achieved a test error of 24.28%, which is among the best results reported for CIFAR-100. | Percentage_correct => 0.5
The Subgraph_embeddings method for the Question Answering task on the WebQuestions dataset is evaluated using a scoring function S(q, a), which generates a high score if a is the correct answer to the question q, and a low score otherwise. The method involves learning low-dimensional vector embeddings of words in questions and entities and relation types in Freebase, ensuring that representations of questions and their corresponding answers are close in the joint embedding space. | F1 => 0.0
The OICR-Ens___FRCNN method is evaluated on the PASCAL_VOC_2012 dataset using the standard PASCAL VOC protocol, which reports average precision (AP

Processing examples: 100%|██████████| 40/40 [01:10<00:00,  1.76s/it]

The DDQN__tuned__hs method is evaluated on the Atari 2600 Games task, which involves training an agent to achieve high game scores across various Atari games. However, specific datasets or additional details about the evaluation were not found in the available resources. | Atari_2600_Assault => 0.0






Evaluation Metrics Report
Execution Time: 74.23 seconds
Total Tokens: 56,232 (1,090 in, 55,142 out)
Total Cost: $0.5541
Average Score: 0.225
Average Score: 0.225
Evaluation Cost: $0.5541
Generated new instruction: To effectively accomplish the `Goal` using the provided `Tools`, begin by carefully analyzing the user query to determine the most appropriate tool for the task. Retain the flexibility to use no tools if the answer can be directly provided. When selecting a tool, prioritize specificity in your queries. For instance, when using the `RETRIEVE` tool, ensure that your queries are precise and directly related to the task, incorporating specific keywords or phrases that are likely to appear in the relevant sections of papers or datasets. This will enhance the relevance and accuracy of the outputs.

In cases where the `RETRIEVE` tool does not yield satisfactory results, consider broadening your approach by utilizing `WEB_SEARCH` or `ARXIV_SEARCH`. These tools can provide a wider co

In [22]:
# Unpack what compile() returned: the optimized agent and the bookkeeping
# gathered while optimizing.
optimized_actor_agent = result["agent"]
optimization_metrics = result["metrics"]

# Headline numbers for the whole optimization run.
print(f"Total optimization cost: ${optimization_metrics['total_cost']:.4f}")
print(f"Final score achieved: {optimization_metrics['final_score']:.3f}")

# Per-iteration breakdown: score, token usage of the comparator and feedback
# calls, and wall-clock time.
for step in optimization_metrics['iteration_details']:
    print(f"\nIteration {step['iteration']}:")
    print(f"Score: {step['score']:.3f}")

    comparator_usage = step['comparator_metrics']
    print(f"Comparator tokens in: {comparator_usage['tokens_in']}")
    print(f"Comparator tokens out: {comparator_usage['tokens_out']}")

    feedback_usage = step['feedback_metrics']
    print(f"Feedback tokens in: {feedback_usage['tokens_in']}")
    print(f"Feedback tokens out: {feedback_usage['tokens_out']}")

    print(f"Execution time: {step['total_iteration_time']:.2f}s")

Total optimization cost: $1.0108
Final score achieved: 0.225

Iteration 0:
Score: 0.225
Comparator tokens in: 31172
Comparator tokens out: 560
Feedback tokens in: 707
Feedback tokens out: 347
Execution time: 87.30s


Now we can evaluate our optimized actor module. For this we use the thread-safe evaluator that we used above, which is implemented as a method of `AvatarOptimizer`.

In [23]:
# Evaluate the optimized agent once on the test split. The call returns the
# score plus a metrics dict, which the next cell summarizes.
score, one_shot_metrics = iterative_monkey.thread_safe_evaluator(
    toolqa_test,
    optimized_actor_agent,
)

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The Inception_V2 method is evaluated on the ImageNet dataset for the Image Classification task using top-1 and top-5 error rates on the validation set. | Top_1_Accuracy, Top_5_Accuracy => 1.0
The Transformer method is evaluated on the WMT 2014 English-to-German and English-to-French translation tasks for the Machine Translation task. | IWSLT2015_English-German => 0.0
The S-Norm method is evaluated on the TREC QA dataset, SQuAD, and TriviaQA for the Question Answering task. | TriviaQA => 0.0
The U-Net method for Skin Cancer Segmentation is evaluated on datasets such as the ISIC-2016, ISIC-2017, and ISIC-2018 skin lesion datasets. | Kaggle_Skin_Lesion_Segmentation => 0.0
The Deep Speech method is evaluated on datasets such as the TIMIT Acoustic-Phonetic Continuous Speech Corpus and LibriSpeech for the Speech Recognition task. | Switchboard___Hub500 => 0.0
The Snips method for Speech Recognition is evaluated on datasets such as the TIMIT Acoustic-Phonetic Continuous Speech Corpus and the 

Processing examples:   2%|▏         | 1/60 [03:40<3:37:04, 220.75s/it]

Agent57 is the method that achieves the highest score on the Atari_2600_Name_This_Game dataset for the Atari_Games task. | IQN => 0.0
The Impatient_Reader method is evaluated on the CNN/Daily Mail dataset for the Question Answering task using accuracy as a metric. The method achieved an accuracy of 63.8%. | CNN, Daily_Mail => 0.5
The highest Train_Accuracy score on the SNLI dataset for the Natural Language Inference task is not explicitly available from the current search results. However, the state-of-the-art models on SNLI, such as Neural Tree Indexers, are known for high performance, but specific Train_Accuracy scores are not detailed in the available data. | __Unigram_and_bigram_features => 0.0
The ConvNet method for Keypoint Detection on the Pascal3D dataset is evaluated using the Percentage of Correct Keypoints (PCK) metric. | Mean_PCK => 0.5
The CRN method for Image-to-Image Translation does not have specific datasets mentioned in the retrieved results. The searches did not yiel

Processing examples:   3%|▎         | 2/60 [03:55<1:35:58, 99.29s/it] 

The Field-gating Seq2seq with dual attention method is evaluated on the WikiBio dataset using BLEU, ROUGE, and PARENT metrics. | BLEU, ROUGE => 0.5
The PNN method is evaluated on the Bing_News dataset for the Click-Through Rate Prediction task using the following metrics: Area Under the ROC Curve (AUC), Relative Information Gain (RIG), Log Loss, and Root Mean Square Error (RMSE). | AUC, Log_Loss => 0.5
The MemNNs__ensemble_ method is evaluated on the SQuAD, WikiQA, and MS-MARCO datasets for the Question Answering task. | CNN___Daily_Mail => 0.0
The DQN_noop method is evaluated on 57 Atari games, as indicated by the evaluation of various algorithms including DQN under noop start settings across all 57 games. | Atari_2600_River_Raid => 0.0
The DPN-131 method is evaluated on the ImageNet dataset for the Image Classification task. | ImageNet => 1.0


Processing examples: 100%|██████████| 60/60 [04:07<00:00,  4.12s/it] 

The DDQN__tuned__noop method achieves the highest Score score for the Atari_Games task on the Atari 2600 Breakout dataset with a score of 368.9. | Atari_2600_Video_Pinball => 0.0






Evaluation Metrics Report
Execution Time: 249.45 seconds
Total Tokens: 171,031 (1,580 in, 169,451 out)
Total Cost: $1.6985
Average Score: 0.295


In [24]:
# Summarize the single-pass evaluation of the optimized agent:
# average score, total cost, and wall-clock time.
print("\nIterative evolution summary:")
print(f"Score: {one_shot_metrics['average_score']:.3f}")
print(f"Total Cost: ${one_shot_metrics['total_cost']:.4f}")
print(f"Total Time: {one_shot_metrics['execution_time']:.2f}s")


Iterative evolution summary:
Score: 0.295
Total Cost: $1.6985
Total Time: 249.45s


In [25]:
# Number of evaluation rounds ("batches") to run with the optimized agent.
# NOTE(review): judging by the logged output, each batch processes the full
# set of test examples and the best batch score is reported — confirm against
# thread_safe_evaluator_batch.
batch_num = 2
score, batch_metrics = iterative_monkey.thread_safe_evaluator_batch(toolqa_test, optimized_actor_agent, batch_num)

Processing batch 1 of 2...


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The IDE_CamStyle_Random_Erasing method is evaluated on the Market-1501, DukeMTMC-reID, and CUHK03 datasets for the Person Re-Identification task. | Market-1501 => 0.5
The 300D_NTI-SLSTM-LSTM_encoders method is evaluated on the Stanford Natural Language Inference (SNLI) dataset for the Natural Language Inference task. | SNLI => 1.0
The U-Net method for Skin Cancer Segmentation is evaluated on datasets such as the ISIC-2016, ISIC-2017, and ISIC-2018 skin lesion datasets. | Kaggle_Skin_Lesion_Segmentation => 0.0
1 validation error for ActionOutput
tool_output
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.5/v/string_type
The Stacked Hourglass Networks method achieves state-of-the-art results on the MPII and FLIC datasets for the Pose Estimation task, but specific PCK_0_2 scores are not mentioned in the available resources. | FLIC_Elbows => 0.0
The DQN_hs method is evaluated on the A

Processing examples:   2%|▏         | 1/60 [02:58<2:55:45, 178.74s/it]

Agent57 achieves the highest score on the Atari_2600_Name_This_Game dataset for the Atari_Games task. | IQN => 0.0
The S-Norm method is evaluated on the TREC QA dataset, SQuAD, and TriviaQA for the Question Answering task. | TriviaQA => 0.0
The method achieving the highest Number_of_params score on the WikiText-2 dataset for the Language Modelling task is not explicitly mentioned in the retrieved results. However, the Byte mLSTM model was noted to have 46 million parameters, which is larger than most word-level models on this dataset. | AWD-LSTM-DOC => 0.0
The information about the SVDCNN method achieving the highest error score for the Sentiment Analysis task on a specific dataset is not available in the current search results. | Yelp_Fine-grained_classification => 0.0
The VAT_EntMin method is evaluated on the MNIST, SVHN, and CIFAR-10 datasets for the Semi-Supervised Image Classification task. | CIFAR-10__4000_Labels => 0.5
The SRCNN method for Video Super-Resolution is evaluated on 

Processing examples:   3%|▎         | 2/60 [03:55<1:43:19, 106.88s/it]

The PNN method is evaluated on the Bing_News dataset for the Click-Through Rate Prediction task using the following metrics: Area Under the ROC Curve (AUC), Relative Information Gain (RIG), Log Loss, and Root Mean Square Error (RMSE). | AUC, Log_Loss => 0.5


Processing examples:  40%|████      | 24/60 [03:55<03:18,  5.50s/it]  

The MemNNs__ensemble_ method is evaluated on the SQuAD, WikiQA, and MS-MARCO datasets for the Question Answering task. | CNN___Daily_Mail => 0.0
The Field-gating Seq2seq dual attention method is evaluated on the WikiBio dataset using metrics such as BLEU, ROUGE, and PARENT. | BLEU, ROUGE => 0.5


Processing examples:  50%|█████     | 30/60 [03:57<02:02,  4.09s/it]

The DeepLab-LargeFOV method is evaluated on the SUN-RGBD dataset for the Scene Segmentation task using metrics such as pixel accuracy and mean Intersection over Union (mIoU). | Mean_IoU => 0.5
The DRCN method is evaluated on the Set5 dataset for 4x upscaling in the Image Super-Resolution task using metrics such as Peak Signal-to-Noise Ratio (PSNR) and Structural Similarity Index (SSIM). | MOS, PSNR, SSIM => 0.67
The highest Mean_IoU score for the CamVid dataset in the Semantic Segmentation task is not clearly identified in the available data. Further specific research or access to updated databases might be required to find the most recent and highest score. | PSPNet => 0.0
The dataset on which the DDQN__tuned__noop method achieves the highest Score score for the Atari_Games task is not explicitly found in the available data. Further specific dataset information might be required from the original research or dataset documentation. | Atari_2600_Video_Pinball => 0.0


Processing examples: 100%|██████████| 60/60 [04:08<00:00,  4.14s/it]

The CRN method for Image-to-Image Translation is evaluated on datasets such as Cityscapes and GTA5. | ADE20K-Outdoor_Labels-to-Photos => 0.0





Processing batch 2 of 2...


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

The CyCADA method is evaluated on the SYNTHIA_Fall-to-Winter dataset for Image-to-Image Translation using metrics such as mIoU (mean Intersection over Union), fwIoU (frequency weighted Intersection over Union), and Pixel accuracy. | Per-pixel_Accuracy, fwIOU, mIoU => 1.0
The S-Norm method is evaluated on the TREC QA dataset, SQuAD, and TriviaQA for the Question Answering task. | TriviaQA => 0.0
The method achieving the highest Number_of_params score on the WikiText-2 dataset for the Language Modelling task is not explicitly found in the retrieved data. However, the Byte mLSTM model mentioned in the RETRIEVE results has 46 million parameters, which is larger than most word-level models on this dataset. Further specific details might require more targeted research or access to updated datasets and papers. | AWD-LSTM-DOC => 0.0
The TANDA method achieves the highest MAP score of 92% on the WikiQA dataset for the Question Answering task. | Key-Value_Memory_Network => 0.0
The U-Net method fo

Processing examples:   2%|▏         | 1/60 [02:36<2:33:32, 156.14s/it]

MuZero achieves the highest score on the Atari_2600_Name_This_Game dataset for the Atari_Games task. | IQN => 0.0
Unable to find specific information on the highest Train_Accuracy score on the SNLI dataset for the Natural Language Inference task. The available data mostly focuses on test accuracy and state-of-the-art models without specifying train accuracy. | __Unigram_and_bigram_features => 0.0
The Impatient_Reader method is evaluated on the CNN/Daily Mail dataset for the Question Answering task using metrics such as accuracy. However, specific evaluation metrics like Exact Match or F1 score were not explicitly found in the search results. The Impatient Reader achieved an accuracy of 63.8% on this dataset. | CNN, Daily_Mail => 0.5
The 300D_NTI-SLSTM-LSTM_encoders method is evaluated on the Stanford Natural Language Inference (SNLI) dataset for the Natural Language Inference task. | SNLI => 1.0
The Snips method for Speech Recognition is evaluated on the TIMIT dataset and the Hey-Snips

Processing examples:   3%|▎         | 2/60 [03:43<1:40:15, 103.72s/it]

The CRN method for Image-to-Image Translation does not have specific datasets mentioned in the available resources. Further research or direct access to the original research papers might be required to obtain this information. | ADE20K-Outdoor_Labels-to-Photos => 0.0
The PNN method is evaluated on the Bing_News dataset for the Click-Through Rate Prediction task using the following metrics: Area Under the ROC Curve (AUC), Relative Information Gain (RIG), Log Loss, and Root Mean Square Error (RMSE). | AUC, Log_Loss => 0.5
The highest Error score method for the Yelp Binary classification Sentiment Analysis task is not explicitly mentioned in the available data. However, XLNet is noted as the current state-of-the-art model for this task. | Char-level_CNN => 0.0


Processing examples:  40%|████      | 24/60 [03:44<03:13,  5.38s/it]  

The MemNNs__ensemble_ method is evaluated on the SQuAD, WikiQA, and MS-MARCO datasets for the Question Answering task. | CNN___Daily_Mail => 0.0
The DDQN__tuned__noop method achieves the highest Score score for the Atari_Games task on the Atari_2600_Video_Pinball dataset. | Atari_2600_Video_Pinball => 1.0
The FDNet method is evaluated on the WIDER Face Easy dataset using the metric of Average Precision (AP), achieving a score of 95.9%. | AP => 1.0
The DRCN method is evaluated on the Set5 - 4x upscaling dataset for the Image Super-Resolution task using the PSNR (Peak Signal-to-Noise Ratio) metric. | MOS, PSNR, SSIM => 0.33
The DQN_noop method is evaluated on the 57 Atari games dataset, which includes a variety of games from the Atari 2600 suite. This evaluation typically involves using noop starts as part of the testing procedure. | Atari_2600_River_Raid => 0.0
The Field-gating Seq2seq with dual attention method is evaluated on the WikiBio dataset using metrics such as BLEU, ROUGE, and 

Processing examples:  47%|████▋     | 28/60 [03:53<02:32,  4.75s/it]

The SVDCNN method for text classification is evaluated on datasets such as AG's News, Yelp Review Polarity, and DBpedia Ontology. | AG_News => 0.5


Processing examples:  72%|███████▏  | 43/60 [03:53<00:39,  2.32s/it]

The SRCNN method for Video Super-Resolution is evaluated on datasets such as Set5, Set14, and Urban100. These datasets are commonly used for benchmarking super-resolution techniques. | Vid4_-_4x_upscaling => 0.0


Processing examples: 100%|██████████| 60/60 [04:13<00:00,  4.23s/it]

The highest Mean_IoU score on the CamVid dataset for Semantic Segmentation is not clearly identified in the available data. The search did not yield a definitive result for 2023. Further research or access to specific datasets or publications may be required to obtain the latest results. | PSPNet => 0.0






Batch Evaluation Metrics Report
Total Execution Time: 540.03 seconds
Average Time per Batch: 270.01 seconds
Best Score: 0.303 (Batch 1)
Total Tokens: 346,164 (3,130 in, 343,034 out)
Total Cost: $3.4382

Per-Batch Performance:
--------------------

Batch 1:
  Score: 0.303
  Execution Time: 268.07s
  Tokens: 172,353 (1,580 in, 170,773 out)
  Cost: $1.7117

Batch 2:
  Score: 0.289
  Execution Time: 271.96s
  Tokens: 173,811 (1,550 in, 172,261 out)
  Cost: $1.7265


In [26]:
# Summarize the batched evaluation: the final score it reports, plus cost and
# wall-clock time accumulated over all batches.
print("\nMixed strategy summary:")
print(f"Best Score: {batch_metrics['final_score']:.3f}")
print(f"Total Cost: ${batch_metrics['total_cost']:.4f}")
print(f"Total Time: {batch_metrics['total_execution_time']:.2f}s")


Mixed strategy summary:
Best Score: 0.303
Total Cost: $3.4382
Total Time: 540.03s
