In [3]:
HF_USR_NAME = 'shirwu'
TOOL_QA_ROOT = ''

### Upload to Huggingface

In [4]:
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict

level = 'easy'
dataset = 'agenda'

dataset_dir = f'{dataset}-{level}.jsonl'
hf_dataset_name = f'toolqa_{dataset}_{level}'

df = pd.read_json(dataset_dir, lines=True)
df.head()

df['answer'] = df['answer'].apply(lambda x: str(x))
dataset = Dataset.from_pandas(df)

In [5]:
dataset_dict = DatasetDict({'train': dataset})
# push to hf for the ease for using dspy
# dataset_dict.push_to_hub(repo_id=hf_dataset_name, private=True)

## Setting Up

* ToolQA

Before loading our datasets and going to the execution part, we'll need to configure the `lm` in `dspy.settings`. For the purpose of this notebook we'll be using `gpt-4o`.

In [6]:
import os
import dspy
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)


dspy.settings.configure(
    lm=dspy.OpenAI(
        model="gpt-4o",
        api_key=os.getenv("OPENAI_API_KEY"),
        max_tokens=2048,
        temperature=0.6
    )
)

## Defining Signature

In [7]:
class ToolQASignature(dspy.Signature):
    """You will be given a question. Your task is to answer the question with a short response. For example, (1) Question: \'How many dates in the agenda table have Alexander scheduled?\' => Answer: \'73\' (2) Question: \'What events does Jade have on 2022/01/25 in the agenda table?\' => Answer: \'Kids concert\'
    """
    
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
        format=lambda x: x.strip(),
    )
    answer: str = dspy.OutputField(
        prefix="Answer:",
        desc="answer to the question",
    )


## Loading Datasets

In [8]:
from random import sample
from dspy.datasets import DataLoader

dl = DataLoader()

In [9]:
tool_qa = dl.from_huggingface(
    f'{HF_USR_NAME}/' + hf_dataset_name,
    split="train",
    input_keys=("question", "answer"),
)

Downloading readme:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.37k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
len(tool_qa)

100

In [11]:
import random
# set seed
random.seed(42)

train_idx = random.sample(range(len(tool_qa)), 40)
remaining_idx = list(set(range(len(tool_qa))) - set(train_idx))
test_idx = random.sample(remaining_idx, 60)

toolqa_train = [
    dspy.Example(question=example.question, answer=example.answer).with_inputs("question", "paper_id")
    for example in [tool_qa[i] for i in train_idx]
]
toolqa_test = [
    dspy.Example(question=example.question, answer=example.answer).with_inputs("question", "paper_id")
    for example in [tool_qa[i] for i in test_idx]
]

## Setting Up Tools

We'll setup `Avatar` modules for both signatures and all the `tools` can be used by each of the dataset. `Tool` is a pydantic model that Avatar expects the `tools` to be composed as more specifically it have 4 fields:

* `name` : Name of the tool
* `input_type` : Type of input the tool accepts
* `output_type` : Type of output the tool returns
* `tool` : The actual tool object

In [None]:
import os
import time
import uuid
import numpy as np
import jsonlines
from concurrent.futures import ProcessPoolExecutor
import sentence_transformers
import chromadb
import os.path as osp
from chromadb.config import Settings

EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
CHROMA_PERSIST_DIRECTORY = osp.join(TOOL_QA_ROOT, "data/chroma_db/agenda")
CHROMA_COLLECTION_NAME = "all"
CHROMA_SERVER_HOST = "localhost"
CHROMA_SERVER_HTTP_PORT = "8000"
FILE_PATH = osp.join(TOOL_QA_ROOT, "data/external_corpus/agenda/agenda_descriptions_merged.jsonl")

def sentence_embedding(model, texts):
    embeddings = model.encode(texts)
    return embeddings

def create_chroma_db(chroma_server_host, chroma_server_http_port, collection_name):
    chroma_client = chromadb.Client(Settings(
        chroma_api_impl="rest",
        chroma_server_host=chroma_server_host,
        chroma_server_http_port=chroma_server_http_port,
    ))
    collection = chroma_client.get_or_create_collection(name=collection_name)
    return collection

def create_chroma_db_local(persist_directory, collection_name):
    
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    collection = chroma_client.get_or_create_collection(name=collection_name)
    return collection

def insert_to_db(texts, model_name, cuda_idx, db):
    model = sentence_transformers.SentenceTransformer(model_name, device=f"cuda:{cuda_idx}")

    batch_embeddings = []
    batch_texts = []
    start_time = time.time()
    print(f"Total Articles to process: {len(texts)}, Current Thread: {cuda_idx}.")
    for i, text in enumerate(texts):
        # 2. generate embedding
        embeddings = sentence_embedding(model, text).tolist()

        batch_embeddings.append(embeddings)
        batch_texts.append(text)
        # 3. add to vectorstore per 500 articles or last article
        if i % 100 == 0 or i == len(texts)-1:
            batch_ids = [str(uuid.uuid1()) for _ in batch_texts]
            db.add(
                embeddings=batch_embeddings,
                documents=batch_texts,
                ids = batch_ids
            )
            batch_embeddings = []
            batch_texts = []
            print(f"Completed Processing article count: {i}, Current Thread: {cuda_idx}, Time took: {time.time() - start_time}.")
    print(f"Thread {cuda_idx} Completed. Total time took for thread: {time.time() - start_time}.")


# Multi-processing
def query_llm(query, is_local=True, start=None, end=None):
    EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
    CHROMA_PERSIST_DIRECTORY = osp.join(TOOL_QA_ROOT, "data/chroma_db/agenda")
    os.makedirs(CHROMA_PERSIST_DIRECTORY, exist_ok=True)
    CHROMA_COLLECTION_NAME = "all"
    CHROMA_SERVER_HOST = "localhost"
    CHROMA_SERVER_HTTP_PORT = "8000"
    FILE_PATH = osp.join(TOOL_QA_ROOT, "data/external_corpus/agenda/agenda_descriptions_merged.jsonl")

    cuda_idxes = [0]
    number_of_processes = len(cuda_idxes)
    input_texts = []
    db = create_chroma_db_local(CHROMA_PERSIST_DIRECTORY, CHROMA_COLLECTION_NAME)
    with open(FILE_PATH, 'r') as f:
        for item in jsonlines.Reader(f):
            input_texts.append(item["event"])
    # input_texts = np.array_split(input_texts, number_of_processes)

    args = ((input_texts[i], EMBED_MODEL_NAME, cuda_idxes[i], is_local) for i in range(number_of_processes))

    # if there is no file under the directory "/localscratch/yzhuang43/ra-llm/retrieval_benchmark/data/chroma_db/agenda", insert the data into the db
    if len(os.listdir(CHROMA_PERSIST_DIRECTORY)) == 0:
        insert_to_db(input_texts, model_name=EMBED_MODEL_NAME, cuda_idx=0, db=db)

    input_paths = np.array_split(input_texts, number_of_processes)
    with ProcessPoolExecutor(number_of_processes) as executor:
        executor.map(insert_to_db, args)
    model = sentence_transformers.SentenceTransformer(EMBED_MODEL_NAME, device=f"cuda:0")
    query_embedding = sentence_embedding(model, query).tolist()
    results = db.query(query_embeddings=query_embedding, n_results=5)
    retrieval_content = [result for result in results['documents'][0]]
    # print(retrieval_content)
    retrieval_content = '\n\n'.join(retrieval_content)
    print('[query_llm]', query)
    return retrieval_content

query = "What is the Jessica's agenda on March 7th, 2023?"
print(query_llm(query))

[query_llm] What is the Jessica's agenda on March 7th, 2023?
Jessica has a business meeting on January 22nd, 2022 at The Pacific Design Center. The meeting is scheduled to start at 3:00 PM and end at 4:00 PM.

Jessica is scheduled to attend a Business Meeting on May 24th, 2022 at the Hilton Hotel. The meeting is set to commence at 10:00 AM and end at 11:00 AM. It is expected that attendees will discuss important business matters and make decisions. The Hilton Hotel will be the venue where the meeting will take place, and it is expected that all attendees should be punctual and prepared for the agenda items.

Jessica will be attending a dinner event on April 6th, 2022 at a fancy restaurant called The Steakhouse at the Four Seasons. The event will begin at 7:00 PM and end at 10:00 PM. It promises to be a luxurious night filled with delicious cuisine and great company.

Jessica will be attending a dinner event on April 6th, 2022 at a fancy restaurant called The Steakhouse at the Four Seas

In [13]:
from dspy.predict.avatar import Tool, Avatar
from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper
from langchain.tools import BaseTool, StructuredTool, tool

def RETRIEVE(query: str) -> str:
    """If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."""
    return query_llm(query)

tools = [
    Tool(
        tool=StructuredTool.from_function(RETRIEVE),
        name="RETRIEVE",
        desc="If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."
    )
]

Once we have defined our `tools`, we can now create an `Avatar` object by passing the `tools` and `signature`. It takes 2 more optional parameters `verbose` and `max_iters`. `verbose` is used to display the logs and `max_iters` is used to control the number of iterations in multi step execution. 

An avatar agent stops the tool usage iteration once it reaches `max_iters` or when it prompts `Finish`. You can also create custom tools too, all you need to make sure is:

* You pass is a class object.
* Implements `__init__` and `run` method.
* Must take 1 string a input and returns 1 string as output.

If your tool doesn't return or takes input a string then you can make a custom wrapper to take care of that for now. In future we'll try to enable a diverse tool usage.

In [14]:
actor_agent = Avatar(
    tools=tools,
    signature=ToolQASignature,
    verbose=False,
    max_iters=10
)

In [15]:
import time
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from datetime import datetime
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
import copy
import tqdm
import logging
import warnings
import os

# Set up logging
# logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Disable all INFO logging
logging.getLogger().setLevel(logging.WARNING)

# Silence all loggers that might be chatty
loggers_to_silence = [
    "httpx",
    "httpcore",
    "openai",
    "arxiv",
    "dspy",
    "langchain",
    "langchain_community",
    "requests",
    "urllib3",
    "tiktoken",
    "asyncio",
    "faiss",
    "anthropic"
]

for logger_name in loggers_to_silence:
    logging.getLogger(logger_name).setLevel(logging.WARNING)

# Suppress specific warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Disable tokenizer parallelism warning

## Evaluation

Open enden QA tasks are hard to evaluate on rigid metrics like exact match. So, we'll be using an improvised LLM as Judge for the evaluation of our model on test set.

In [16]:
class Evaluator(dspy.Signature):
    """Please act as an impartial judge to evaluate whether the answer is correct based on the ground truth answer"""
    
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
    )
    reference_answer: str = dspy.InputField(
        prefix="Ground Truth Answer:",
        desc="Ground truth answer to the question.",
    )
    answer: str = dspy.InputField(
        prefix="Answer:",
        desc="Answer to the question given by the model.",
    )
    rationale: str = dspy.OutputField(
        prefix="Rationale:",
        desc="Explanation of why the answer is correct or incorrect.",
    )
    is_correct: float = dspy.OutputField(
        prefix="Correct:",
        desc="Whether the answer is correct. Give 0 if incorrect, 1 if correct, (0, 1) if partially correct.",
    )


evaluator = dspy.TypedPredictor(Evaluator)


def metric(example, prediction, trace=None):
    acc = float(
        evaluator(
            question=example.question,
            answer=prediction.answer,
            reference_answer=example.answer
        ).is_correct
    ) 
    print(prediction.answer, '|', example.answer, '=>', acc)
    return acc

In [17]:
print(toolqa_train[0])
metric(toolqa_train[0], prediction=dspy.Example(answer='73 days'))

Example({'question': 'What did Georgina do from 1:00 PM to 11:00 PM on 2022/05/18?', 'answer': 'Oktoberfest'}) (input_keys={'paper_id', 'question'})
73 days | Oktoberfest => 0.0


0.0

For evaluation we can't use `dspy.Evaluate`, reason being that `Avatar` changes it's signature per iteration by adding the actions and it's results to it as fields. So we can create our own hacky thread safe evaluator for it.

In [18]:
import time
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from datetime import datetime
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
import copy

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class APICallMetrics:
    timestamp: datetime
    tool_name: str
    tokens_in: int = 0
    tokens_out: int = 0
    execution_time: float = 0.0

@dataclass
class AvatarMetrics:
    total_calls: int = 0
    total_tokens_in: int = 0
    total_tokens_out: int = 0
    total_execution_time: float = 0.0
    calls_by_tool: Dict[str, int] = field(default_factory=dict)
    api_call_history: List[APICallMetrics] = field(default_factory=list)
    
    def add_call(self, metrics: APICallMetrics):
        self.total_calls += 1
        self.total_tokens_in += metrics.tokens_in
        self.total_tokens_out += metrics.tokens_out
        self.total_execution_time += metrics.execution_time
        self.calls_by_tool[metrics.tool_name] = self.calls_by_tool.get(metrics.tool_name, 0) + 1
        self.api_call_history.append(metrics)
    
    def merge(self, other: 'AvatarMetrics'):
        """Merge another AvatarMetrics instance into this one"""
        self.total_calls += other.total_calls
        self.total_tokens_in += other.total_tokens_in
        self.total_tokens_out += other.total_tokens_out
        self.total_execution_time += other.total_execution_time
        for tool, count in other.calls_by_tool.items():
            self.calls_by_tool[tool] = self.calls_by_tool.get(tool, 0) + count
        self.api_call_history.extend(other.api_call_history)

    def estimate_cost(self, model_name: str = "gpt-4") -> float:
        pricing = {
            "gpt-4": {"input": 2.5, "output": 10.0},
        }
        if model_name not in pricing:
            raise ValueError(f"Unknown model: {model_name}")
        
        rates = pricing[model_name]
        input_cost = (self.total_tokens_in / 1000000) * rates["input"]
        output_cost = (self.total_tokens_out / 1000000) * rates["output"]
        return input_cost + output_cost

class AvatarWithMetrics(Avatar):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.metrics = AvatarMetrics()
        self.tokenizer = tiktoken.encoding_for_model("gpt-4")
    
    def _count_tokens(self, text: str) -> int:
        try:
            return len(self.tokenizer.encode(str(text)))
        except Exception as e:
            logger.warning(f"Error counting tokens: {e}")
            return 0

    def _wrapped_tool_call(self, tool, input_text: str) -> str:
        start_time = time.time()
        tokens_in = self._count_tokens(input_text)
        
        try:
            result = tool.run(input_text)
        except Exception as e:
            logger.error(f"Tool execution error: {e}")
            raise
        finally:
            execution_time = time.time() - start_time
            tokens_out = self._count_tokens(str(result))
            
            metrics = APICallMetrics(
                timestamp=datetime.now(),
                tool_name=tool.name,
                tokens_in=tokens_in,
                tokens_out=tokens_out,
                execution_time=execution_time
            )
            self.metrics.add_call(metrics)
            
        return result

    def __call__(self, *args, **kwargs):
        start_time = time.time()
        result = super().__call__(*args, **kwargs)
        total_time = time.time() - start_time
        
        metrics = APICallMetrics(
            timestamp=datetime.now(),
            tool_name="main_llm",
            tokens_in=self._count_tokens(str(args) + str(kwargs)),
            tokens_out=self._count_tokens(str(result)),
            execution_time=total_time
        )
        self.metrics.add_call(metrics)
        
        return result

def multi_thread_executor(test_set, signature, num_threads=60):
    total_score = 0
    total_examples = len(test_set)
    combined_metrics = AvatarMetrics()

    start_time = time.time()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        for example in test_set:
            def process_with_metrics(example=example):
                try:
                    avatar = AvatarWithMetrics(signature, tools=tools, verbose=False, max_iters=10)
                    prediction = avatar(**example.inputs().toDict())
                    return metric(example, prediction), avatar.metrics
                except Exception as e:
                    print(e)
                    return 0, AvatarMetrics()

            futures.append(executor.submit(process_with_metrics))

        for future in tqdm.tqdm(futures, total=total_examples, desc="Processing examples"):
            score, metrics = future.result()
            total_score += score
            # Only combine token counts and call counts, not execution times
            combined_metrics.total_calls += metrics.total_calls
            combined_metrics.total_tokens_in += metrics.total_tokens_in
            combined_metrics.total_tokens_out += metrics.total_tokens_out
            for tool, count in metrics.calls_by_tool.items():
                combined_metrics.calls_by_tool[tool] = combined_metrics.calls_by_tool.get(tool, 0) + count
            combined_metrics.api_call_history.extend(metrics.api_call_history)
    
    total_execution_time = time.time() - start_time
    combined_metrics.total_execution_time = total_execution_time

    avg_metric = total_score / total_examples
    return avg_metric, combined_metrics

def single_thread_executor(test_set, signature):
    total_score = 0
    total_examples = len(test_set)
    combined_metrics = AvatarMetrics()

    for example in tqdm.tqdm(test_set, desc="Processing examples"):
        try:
            avatar = AvatarWithMetrics(signature, tools=tools, verbose=False, max_iters=10)
            prediction = avatar(**example.inputs().toDict())
            score = metric(example, prediction)
            total_score += score
            # Combine metrics from this run
            for call in avatar.metrics.api_call_history:
                combined_metrics.add_call(call)
        except Exception as e:
            print(e)

    avg_metric = total_score / total_examples
    return avg_metric, combined_metrics

def format_metrics_report(metrics: AvatarMetrics, model_name: str = "gpt-4") -> str:
    cost = metrics.estimate_cost(model_name)
    
    report = f"""
Avatar Execution Metrics Report
==============================
Execution Time: {metrics.total_execution_time:.2f} seconds
Total API Calls: {metrics.total_calls}
Total Tokens: {metrics.total_tokens_in + metrics.total_tokens_out:,} ({metrics.total_tokens_in:,} in, {metrics.total_tokens_out:,} out)
Estimated Cost: ${cost:.4f}

Average Time per Call: {metrics.total_execution_time / metrics.total_calls:.2f} seconds

Tool Usage Breakdown:
-------------------
"""
    for tool, count in sorted(metrics.calls_by_tool.items()):
        report += f"{tool}: {count} calls\n"

    return report

In [19]:
score, metrics = multi_thread_executor(toolqa_test, ToolQASignature)

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] What did Adam do from 7:00 PM to 9:00 PM on 2022/10/27?
[query_llm] Who attended Art exhibit opening reception between 5:00 PM and 7:00 PM on 2022/08/01 in Locust Projects, Design District?
[query_llm] Alice's activities from 6:00 PM to 9:00 PM on 2022/07/28
[query_llm] Floral design session location for Emily on 2022/09/30
[query_llm] Aidan's events on 2022/12/09
[query_llm] Who attended Poetry reading between 7:00 PM and 9:00 PM on 2022/04/07 in Beyond Baroque?
[query_llm] When did Georgia attend Move-in day on 2022/06/22?
[query_llm] What did Daisy do from 11:00 PM to 1:00 AM on 2022/02/01?
[query_llm] Henry's attendance duration at the Chili cook-off on 2022/03/21
[query_llm] James' activities from 9:00 AM to 11:00 AM on 2022/02/02
[query_llm] Attendees of Wine Tasting event between 5:00 PM and 7:00 PM on 2022/05/23 in City Winery
[query_llm] What did Thomas do from 8:00 PM to 10:00 PM on 2022/10/03?
[query_llm] Brian's attendance duration at Happy Hour on 2022/06/20
[q

Processing examples:   2%|▏         | 1/60 [00:31<30:36, 31.13s/it]

Aidan did not attend the Bird-watching tour on 2022/12/09. | 8:00 AM => 0.0
Brian attended Happy Hour on 2022/06/20 for 2 hours. | 2:00:00 => 1.0
Thomas had no scheduled events from 8:00 PM to 10:00 PM on 2022/10/03. | Star-gazing => 0.0
Henry attended the Chili cook-off for 3 hours on 2022/03/21. | 3:00:00 => 1.0
Adam does not have any listed activities from 7:00 PM to 9:00 PM on 2022/10/27. | Join a public speaking class => 0.0
Daisy had no scheduled events from 11:00 PM to 1:00 AM on 2022/02/01. | Late Night Movie => 0.0
The available information does not mention attendees for a Wine Tasting event between 5:00 PM and 7:00 PM on 2022/05/23 in City Winery. | Penelope => 0.0
Georgia attended Move-in day on June 22nd, 2022, starting at 9:00 AM and ending at 5:00 PM. | 9:00 AM => 1.0
Georgina attended the Aquarium visit between 10:00 AM and 5:00 PM on 2022/07/06 in New England Aquarium. | Georgina => 1.0
No information available for Matilda's activities from 7:30 PM to 9:30 PM on 2022/03

Processing examples:   3%|▎         | 2/60 [01:37<50:11, 51.92s/it]

The rooftop of The Standard Hotel | The Standard Hotel rooftop  => 1.0
Madison attended a stand-up comedy show on October 11th, 2022, at the Comedy Cellar from 8:00 PM to 10:00 PM. | 8:00 PM => 1.0
3 hours | 3:00:00 => 1.0
3 hours | 3:00:00 => 1.0
Chloe attended the Photography Class between 11:00 AM and 1:00 PM on 2022/03/09 at The Camera Store. | Chloe => 1.0
Poppy attended Star Gazing between 8:00 PM and 11:00 PM on 2022/08/23 at Griffith Park Observatory. | Adrian => 0.0
The Robotics seminar that Esme attended on 2022/09/13 took place at the University Auditorium. | University Auditorium => 1.0
William did not attend a Music concert on 2022/01/17. | 8:00 PM => 0.0
Brian attended a car maintenance check at Quick Lube on June 30th, 2022, from 9:00 AM to 11:00 AM. | 9:00 AM => 1.0
Elizabeth does not have any events scheduled from 7:30 PM to 10:00 PM on 2022/06/29. | Opera performance => 0.0
George attended the tennis lesson for 1 hour on 2022/06/22. | 1:00:00 => 1.0
Albert attended th

Processing examples: 100%|██████████| 60/60 [01:41<00:00,  1.69s/it]

Faith did not attend an Ice hockey game on 2022/10/04. | 7:00 PM => 0.0





In [20]:
# print(f"Average Score on ArxivQA before opitmization: {aqa_score:.2f}")
print(f"Test Score: {score:.2f}")
print(format_metrics_report(metrics))

Test Score: 0.55

Avatar Execution Metrics Report
Execution Time: 102.03 seconds
Total API Calls: 60
Total Tokens: 31,888 (1,609 in, 30,279 out)
Estimated Cost: $0.3068

Average Time per Call: 1.70 seconds

Tool Usage Breakdown:
-------------------
main_llm: 60 calls



## Optimization

For the optimization of the `Actor` we'll be using `AvatarOptimizer`. It's a DSPy implementation of the [Avatar](https://github.com/zou-group/avatar/) method that optimizes the `Actor` for the given `tools` using a comparator module that optimizes Actor instruction. Note, that Actor is the Module that directs tool execution and flow, it's not the signature that we are passing. It doesn't optimize the instruction of the signature we pass. It takes the following parameters:

* `metric`: Metric that we'll be optimizing for
* `max_iters`: Maximum number of iterations for the optimizer
* `lower_bound`: Lower bound for the metric to classify example as negative
* `upper_bound`: Upper bound for the metric to classify example as positive
* `max_positive_inputs`: Maximum number of positive inputs sampled for comparator
* `max_negative_inputs`: Maximum number of negative inputs sampled for comparator
* `optimize_for`: Whether we want to maximize the metric or minimize it during optimization

Once the optimizer is done we can get the optimized actor and use it for the evaluation.

In [21]:
from new_optimizer import AvatarOptimizerWithMetrics

iterative_monkey = AvatarOptimizerWithMetrics(
    metric=metric,
    max_iters=1,
    max_negative_inputs=20,
    max_positive_inputs=20,
)

In [22]:
result = iterative_monkey.compile(
    student=actor_agent,
    trainset=toolqa_train
)

Processing examples:   0%|          | 0/40 [00:00<?, ?it/s]

[query_llm] Who attended the Boxing match between 7:00 PM and 10:00 PM on 2022/11/06 in Madison Square Garden?
[query_llm] Who attended Horse race between 2:00 PM and 5:00 PM on 2022/08/14 in Santa Anita Park?
[query_llm] Natalie's attendance duration for Volunteer work on 2022/03/01
[query_llm] Charity auction location that Willow attended on 2022/06/25
[query_llm] Where did Farmers Market Visit that Molly attended take place on 2022/03/05?
[query_llm] Faith's attendance at Book club meeting on 2022/03/26
[query_llm] Graduation ceremony attendees between 5:00 PM and 7:00 PM on 2022/10/22 in Stadium, University of North Carolina
[query_llm] Culinary festival location for Adrian on 2022/09/27
[query_llm] Charlotte's attendance duration for Piano Recital on 2022/01/30
[query_llm] Thomas attended Artisanal Pizza Making on 2022/10/28
[query_llm] Millie's attendance at the Wine education seminar on 2022/06/11
[query_llm] Where did Intermediate Spanish lesson that Alice attended take place o

Processing examples:  38%|███▊      | 15/40 [00:52<01:02,  2.48s/it]

No events found for Georgina from 1:00 PM to 11:00 PM on 2022/05/18. | Oktoberfest => 0.0
The Poetry Foundation | The Poetry Foundation => 1.0
Grace did not attend a Broadway Show on 2022/02/17. | 8:00 PM => 0.0
Answer: Spa Day at The Beverly Hills Hotel Spa from 1:00 PM to 4:00 PM on June 19, 2022. | Roller skating => 0.0


Processing examples:  60%|██████    | 24/40 [00:54<00:23,  1.46s/it]

There is no available information on what Ava did from 12:00 PM to 4:00 PM on 2022/06/16. | Cooking competition => 0.0


Processing examples:  82%|████████▎ | 33/40 [00:55<00:06,  1.11it/s]

Amelia had no scheduled activities from 07:00 PM to 09:00 PM on 2022/08/07. | Language Exchange => 0.0


Processing examples: 100%|██████████| 40/40 [00:58<00:00,  1.46s/it]

Layla attended the pottery throwing class on April 24th, 2022, from 10:00 AM to 12:00 PM at The Potter's Wheel. | 10:00 AM => 1.0





Average Score: 0.5875
Generated new instruction: I'm here to help you create improved instructions for the group, incorporating the feedback provided. Here's a revised set of instructions:

---

New Instruction: You will be given `Tools`, which will be a list of resources to use in order to accomplish the `Goal`. When presented with a user query, your task is to decide which tool to use and what input values to provide. You will output the necessary `Action` to achieve the `Goal`, including the tool to use and the input query for the tool. Note: You can choose not to use any tools and provide the final answer directly. You may also use one tool multiple times with different input queries if applicable.

To improve performance on broader queries, break down these queries into specific, manageable components. For example, if asked, "What did Georgina do from 1:00 PM to 11:00 PM on 2022/05/18?", consider breaking the timeframe into smaller segments or focusing on specific activities withi

In [None]:
optimized_actor_agent = result["agent"]
optimization_metrics = result["metrics"]

# Now you can process the metrics
print(f"Total optimization cost: ${optimization_metrics['total_cost']:.4f}")
print(f"Final score achieved: {optimization_metrics['final_score']:.3f}")

# Analyze per-iteration performance
for iteration in optimization_metrics['iteration_details']:
    print(f"\nIteration {iteration['iteration']}:")
    print(f"Score: {iteration['score']:.3f}")
    print(f"Comparator tokens in: {iteration['comparator_metrics']['tokens_in']}")
    print(f"Comparator tokens out: {iteration['comparator_metrics']['tokens_out']}")
    print(f"Feedback tokens in: {iteration['feedback_metrics']['tokens_in']}")
    print(f"Feedback tokens out: {iteration['feedback_metrics']['tokens_out']}")
    print(f"Execution time: {iteration['total_iteration_time']:.2f}s")

Total optimization cost: $0.6839
Final score achieved: 0.588

Iteration 0:
Score: 0.588
Comparator tokens in: 20244
Comparator tokens out: 565
Feedback tokens in: 720
Feedback tokens out: 352
Execution time: 76.10s


Now we can evaluate our actor module, for this we've provided an implementation of thread safe evaluator that we above as part of class method of `AvatarOptimizer`.

In [None]:
batch_num = 4
iterative_monkey.thread_safe_evaluator_batch(toolqa_test, optimized_actor_agent, batch_num)

Processing batch 1 of 4...


Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] Activities of Daisy from 11:00 PM on 2022/01/31 to 1:00 AM on 2022/02/01
[query_llm] Faith's schedule on 2022/10/04
[query_llm] Millie's attendance duration at Theatre play on 2022/12/07
[query_llm] Retrieve activities for Thomas from 8:00 PM to 10:00 PM on 2022/10/03.
[query_llm] Find the time Albert attended the Youth talent showcase on 2022/08/11.
[query_llm] Brian's activities on 2022/06/20, specifically focusing on the duration of Happy Hour attendance.
[query_llm] Events for Phoebe on 2022/01/10
[query_llm] Attendees of Aquarium visit between 10:00 AM and 5:00 PM on 2022/07/06 at New England Aquarium
[query_llm] James' activities from 9:00 AM to 11:00 AM on 2022/02/02
[query_llm] Georgia's schedule on 2022/06/22
[query_llm] What time did Aidan attend the Bird-watching tour on 2022/12/09?
[query_llm] Daisy's attendance details for Jazz Performance on 2022/02/05
[query_llm] Fetch the duration of Aurora's attendance at the Dreamcatcher-making workshop on 2022/01/24.
[que

Processing examples:   2%|▏         | 1/60 [00:41<40:26, 41.13s/it]

Thomas had no scheduled activities from 8:00 PM to 10:00 PM on 2022/10/03. | Star-gazing => 0.0
No relevant activities found for Adam from 7:00 PM to 9:00 PM on 2022/10/27. | Join a public speaking class => 0.0
Answer: The attendees of the Poetry reading at Beyond Baroque on April 7th, 2022, were not explicitly listed in the retrieved information. | Charlotte => 0.0
[query_llm] Benjamin's events on 2022/10/21
[query_llm] Attendees of Star Gazing event between 8:00 PM and 11:00 PM on 2022/08/23 at Observatory, Griffith Park Observatory
[query_llm] Isabella's schedule on 2022/05/06
[query_llm] Felix's schedule on 2022/03/30 for Live piano performance
Benjamin did not attend Movie night on 2022/10/21 as per available information. | 8:30 PM => 0.0
Adrian and Poppy attended the Star Gazing event between 8:00 PM and 11:00 PM on 2022/08/23 at Griffith Park Observatory. | Adrian => 0.5
Isabella's schedule does not mention a Photography workshop on 2022/05/06. | 12:00 PM => 0.0
[query_llm] Alic

Processing examples:   3%|▎         | 2/60 [01:06<30:34, 31.63s/it]

3 hoursGarden Collage | Garden Collage => 1.0
 | 3:00:00 => 1.0
The Movie Night that Hector attended on 2022/06/03 took place on the rooftop of The Standard Hotel. | The Standard Hotel rooftop  => 1.0
[query_llm] Attendees of Historical lecture between 7:00 PM and 9:00 PM on 2022/11/29 at The New-York Historical Society
No specific attendees were mentioned for the Wine Tasting event at City Winery between 5:00 PM and 7:00 PM on 2022/05/23. | Penelope => 0.0
[query_llm] Robotics seminar location for Esme on 2022/09/13
[query_llm] Matilda's activities between 7:30 PM and 9:30 PM on 2022/03/25
[query_llm] Phillip's attendance at Creative Writing Workshop on 2022/06/28
[query_llm] Retrieve the duration of Daniel's Dance Class attendance on 2022/06/24.
[query_llm] Albert's attendance duration for Conference Call on 2022/04/24
[query_llm] Fetch Adam's attendance details for the Breakfast meeting on 2022/05/07.
[query_llm] Isabella's activities between 2:00 PM and 3:00 PM on 2022/12/03
[query

Processing examples:  10%|█         | 6/60 [01:09<07:04,  7.86s/it]

Georgia attended Move-in day on 2022/06/22 from 9:00 AM to 5:00 PM. | 9:00 AM => 1.0
University Auditorium | University Auditorium => 1.0
No relevant events were found for Charlotte from 4:00 PM to 5:30 PM on 2022/10/31.No attendees found for the specified event timeframe and location. | Henry => 0.0
 | Cooking Class => 0.0
Daisy attended the Public Policy Conference for 8 hours on February 25, 2022. | 8:00:00 => 1.0
No relevant activities found for Matilda between 7:30 PM and 9:30 PM on 2022/03/25. | Opera show => 0.0
Albert attended the Conference Call for 1 hour on 2022/04/24. | 1:00:00 => 1.0
Answer: Daniel's Dance Class duration on 2022/06/24 is not available in the retrieved data. | 1:00:00 => 0.0
Answer: Adam attended the Breakfast meeting on 2022/05/07 for 1 hour and 30 minutes. | 1:30:00 => 1.0
No events or activities found for Grace from 8:00 PM to 9:30 PM on 2022/08/01. | Stand-up comedy show => 0.0
Georgina's attendance at the Corporate Social Responsibility Conference on 2

Processing examples:  18%|█▊        | 11/60 [01:12<03:01,  3.70s/it]

Isabella's activities between 2:00 PM and 3:00 PM on 2022/12/03 are not listed. No specific events found for this timeframe. | Ballet class => 0.0
Henry attended the Chili cook-off for 3 hours on 2022/03/21. | 3:00:00 => 1.0
The location of the Fun run event that Zara attended on 2022/02/19 is not available in the retrieved data. | The Fun Run Trail => 0.0
Phillip attended the Creative Writing Workshop on June 28th, 2022, from 6:00 PM to 8:00 PM at The StoryStudio. | 6:00 PM => 1.0


Processing examples: 100%|██████████| 60/60 [01:15<00:00,  1.25s/it]

Georgina attended the Indie film festival on April 30th, 2022, from 12:00 PM to 10:00 PM at the Los Angeles Film Festival. | 12:00 PM => 1.0
Processing batch 2 of 4...



Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] Find location of Movie Night attended by Hector on 2022/06/03.
The Movie Night that Hector attended on 2022/06/03 took place on the rooftop of The Standard Hotel. | The Standard Hotel rooftop  => 1.0
[query_llm] What events or activities did Grace have from 8:00 PM to 9:30 PM on 2022/08/01?
No events or activities found for Grace from 8:00 PM to 9:30 PM on 2022/08/01. | Stand-up comedy show => 0.0
[query_llm] Alice's activities between 6:00 PM and 9:00 PM on 2022/07/28
Alice's activities between 6:00 PM and 9:00 PM on 2022/07/28 are not available in the retrieved data. | Fashion Show => 0.0
[query_llm] Willow's attendance at Summer camp informational session on 2022/04/02
Willow's attendance at the Summer camp informational session on 2022/04/02 is not mentioned in the retrieved data. | 1:30:00 => 0.0
[query_llm] Activities of Daisy from 11:00 PM on 2022/01/31 to 1:00 AM on 2022/02/01
No activities found for Daisy from 11:00 PM on 2022/01/31 to 1:00 AM on 2022/02/01. | Late

Processing examples:   2%|▏         | 1/60 [00:31<31:01, 31.55s/it]

[query_llm] Retrieve activities for Thomas from 8:00 PM to 10:00 PM on 2022/10/03.
[query_llm] Attendees of Art exhibit opening reception between 5:00 PM and 7:00 PM on 2022/08/01 at Locust Projects, Design District
No specific attendees were mentioned for the Art exhibit opening reception on 2022/08/01 at Locust Projects, Design District. | Sophia => 0.0
Thomas had no scheduled activities from 8:00 PM to 10:00 PM on 2022/10/03. | Star-gazing => 0.0
[query_llm] Georgia's schedule on 2022/06/22
[query_llm] Felix's schedule on 2022/03/30 for Live piano performance
[query_llm] Emily's schedule on 2022/06/15
Felix attended the live piano performance on March 30th, 2022, from 7:00 PM to 9:00 PM at The Piano Bar. | 7:00 PM => 1.0
Emily's attendance at the Jazz concert on 2022/06/15 is not found in the retrieved data. | 8:00 PM => 0.0
[query_llm] Georgia's Move-in day schedule on 2022/06/22
[query_llm] Attendees of Aquarium visit between 10:00 AM and 5:00 PM on 2022/07/06 at New England Aquar

Processing examples:   7%|▋         | 4/60 [00:57<12:04, 12.95s/it]

[query_llm] Activities of Adam from 7:00 PM to 9:00 PM on 2022/10/27
No relevant activities found for Adam from 7:00 PM to 9:00 PM on 2022/10/27. | Join a public speaking class => 0.0


Processing examples:   8%|▊         | 5/60 [00:58<08:53,  9.70s/it]

[query_llm] Find the location of the Floral design session attended by Emily on 2022/09/30.
Garden Collage | Garden Collage => 1.0
[query_llm] Attendees of Poetry reading at Beyond Baroque between 7:00 PM and 9:00 PM on 2022/04/07
Answer: The attendees of the Poetry reading at Beyond Baroque on April 7th, 2022, were not explicitly listed in the retrieved information. | Charlotte => 0.0
[query_llm] Attendees of Star Gazing event between 8:00 PM and 11:00 PM on 2022/08/23 at Observatory, Griffith Park Observatory
Adrian and Poppy attended the Star Gazing event between 8:00 PM and 11:00 PM on 2022/08/23 at Griffith Park Observatory. | Adrian => 0.5


Processing examples:  18%|█▊        | 11/60 [00:59<02:25,  2.96s/it]

[query_llm] Isabella's activities between 2:00 PM and 3:00 PM on 2022/12/03
Isabella's activities between 2:00 PM and 3:00 PM on 2022/12/03 are not listed. No specific events found for this timeframe. | Ballet class => 0.0
[query_llm] Daisy's attendance duration for Stand-up paddleboarding lesson on 2022/10/19
The information retrieved does not include Daisy's attendance on 2022/10/19. Please provide more specific details or check the agenda for that date. | 1:00:00 => 0.0
[query_llm] Retrieve the duration of Daniel's Dance Class attendance on 2022/06/24.
Answer: Daniel's Dance Class duration on 2022/06/24 is not available in the retrieved data. | 1:00:00 => 0.0
[query_llm] Fetch the duration of Aurora's attendance at the Dreamcatcher-making workshop on 2022/01/24.
Answer: The duration of Aurora's attendance at the Dreamcatcher-making workshop on 2022/01/24 was 2 hours, from 3:00 PM to 5:00 PM. | 2:00:00 => 1.0
[query_llm] Georgina's events on 2022/04/30
[query_llm] What time did Aidan

Processing examples:  25%|██▌       | 15/60 [01:04<01:42,  2.27s/it]

Brian did not attend a Car maintenance check on 2022/06/30. | 9:00 AM => 0.0
Faith did not attend an Ice hockey game on 2022/10/04 as per the available schedule. | 7:00 PM => 0.0
[query_llm] Find location of Martial arts class Lily attended on 2022/10/11
The Martial Arts class that Lily attended on 2022/10/11 took place at The Martial Arts Academy. | The Martial Arts Academy => 1.0
[query_llm] Find the location of the Chess club meeting that Victoria attended on 2022/06/13.
The Chess club meeting that Victoria attended on 2022/06/13 took place at The Chess Forum. | The Chess Forum => 1.0
[query_llm] Find the time Albert attended the Youth talent showcase on 2022/08/11.
Answer: Albert attended the Youth Talent Showcase on 2022/08/11 from 5:00 PM to 7:00 PM. | 5:00 PM => 1.0
[query_llm] Retrieve duration of Henry's attendance at Chili cook-off on 2022/03/21
Henry attended the Chili cook-off for 3 hours on 2022/03/21. | 3:00:00 => 1.0
[query_llm] Matilda's activities between 7:30 PM and 9

Processing examples: 100%|██████████| 60/60 [01:05<00:00,  1.10s/it]

[query_llm] Georgina's events on 2022/04/30 including Indie film festival
Georgina attended the Indie film festival on April 30th, 2022, from 12:00 PM to 10:00 PM at the Los Angeles Film Festival. | 12:00 PM => 1.0
[query_llm] Find the location of the Fun run event that Zara attended on 2022/02/19.
The location of the Fun run event that Zara attended on 2022/02/19 is not available in the retrieved data. | The Fun Run Trail => 0.0
Processing batch 3 of 4...



Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] Events for Phoebe on 2022/01/10
Phoebe did not attend a Food festival on 2022/01/10. | 6:00 PM => 0.0
[query_llm] Fetch the duration of Aurora's attendance at the Dreamcatcher-making workshop on 2022/01/24.
Answer: The duration of Aurora's attendance at the Dreamcatcher-making workshop on 2022/01/24 was 2 hours, from 3:00 PM to 5:00 PM. | 2:00:00 => 1.0
[query_llm] Activities of Daisy from 11:00 PM on 2022/01/31 to 1:00 AM on 2022/02/01
No activities found for Daisy from 11:00 PM on 2022/01/31 to 1:00 AM on 2022/02/01. | Late Night Movie => 0.0
[query_llm] Retrieve duration of Henry's attendance at Chili cook-off on 2022/03/21
Henry attended the Chili cook-off for 3 hours on 2022/03/21. | 3:00:00 => 1.0
[query_llm] Events for Charlotte from 4:00 PM to 5:30 PM on 2022/10/31
No relevant events were found for Charlotte from 4:00 PM to 5:30 PM on 2022/10/31. | Cooking Class => 0.0
[query_llm] Attendees of Art exhibit opening reception between 5:00 PM and 7:00 PM on 2022/08/01 a

Processing examples:   2%|▏         | 1/60 [00:36<35:37, 36.23s/it]

[query_llm] Millie's attendance duration at Theatre play on 2022/12/07
Millie attended the Theatre play for 2 hours on 2022/12/07. | 2:00:00 => 1.0
[query_llm] Retrieve activities for Thomas from 8:00 PM to 10:00 PM on 2022/10/03.
[query_llm] Willow's schedule on 2022/09/26
Thomas had no scheduled activities from 8:00 PM to 10:00 PM on 2022/10/03. | Star-gazing => 0.0
Unable to find relevant information for Willow attending Intermediate Spanish conversation class on 2022/09/26. Please provide more specific details or check if the schedule is available. | 11:00 AM => 0.0


Processing examples:   3%|▎         | 2/60 [00:36<14:48, 15.32s/it]

[query_llm] Find location of Movie Night attended by Hector on 2022/06/03.
The Movie Night that Hector attended on 2022/06/03 took place on the rooftop of The Standard Hotel. | The Standard Hotel rooftop  => 1.0
[query_llm] Robotics seminar location for Esme on 2022/09/13
University Auditorium | University Auditorium => 1.0
[query_llm] What is the duration of Charles's 'Watch a baseball game' event on 2022/05/21?
3 hours | 3:00:00 => 1.0
[query_llm] Attendees of Wine Tasting at City Winery between 5:00 PM and 7:00 PM on 2022/05/23
No specific attendees were mentioned for the Wine Tasting event at City Winery between 5:00 PM and 7:00 PM on 2022/05/23. | Penelope => 0.0
[query_llm] Attendees of Aquarium visit between 10:00 AM and 5:00 PM on 2022/07/06 at New England Aquarium
Georgina attended the Aquarium visit at the New England Aquarium between 10:00 AM and 5:00 PM on 2022/07/06. | Georgina => 1.0
[query_llm] James' activities from 9:00 AM to 11:00 AM on 2022/02/02
No relevant activiti

Processing examples:   7%|▋         | 4/60 [00:45<07:44,  8.29s/it]

[query_llm] Activities of Adam from 7:00 PM to 9:00 PM on 2022/10/27
No relevant activities found for Adam from 7:00 PM to 9:00 PM on 2022/10/27. | Join a public speaking class => 0.0
[query_llm] Retrieve Anthony's activities from 8:00 AM to 10:00 AM on 2022/09/04.
Anthony has no recorded activities from 8:00 AM to 10:00 AM on 2022/09/04. | Classic Car Show => 0.0
[query_llm] Faith's schedule on 2022/10/04
Faith did not attend an Ice hockey game on 2022/10/04 as per the available schedule. | 7:00 PM => 0.0
[query_llm] Daisy's attendance duration for Stand-up paddleboarding lesson on 2022/10/19
The information retrieved does not include Daisy's attendance on 2022/10/19. Please provide more specific details or check the agenda for that date. | 1:00:00 => 0.0
[query_llm] Daisy's attendance details for Jazz Performance on 2022/02/05
Daisy attended the Jazz Performance on 2022/02/05 for 3 hours, from 8:00 PM to 11:00 PM. | 3:00:00 => 1.0
[query_llm] Felix's schedule on 2022/03/30 for Live p

Processing examples: 100%|██████████| 60/60 [01:07<00:00,  1.13s/it]

Georgia attended Move-in day on 2022/06/22 from 9:00 AM to 5:00 PM. | 9:00 AM => 1.0
[query_llm] Find the location of the Fun run event that Zara attended on 2022/02/19.
The location of the Fun run event that Zara attended on 2022/02/19 is not available in the retrieved data. | The Fun Run Trail => 0.0
Processing batch 4 of 4...



Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] Alice's activities between 6:00 PM and 9:00 PM on 2022/07/28
Alice's activities between 6:00 PM and 9:00 PM on 2022/07/28 are not available in the retrieved data. | Fashion Show => 0.0
[query_llm] Faith's schedule on 2022/10/04
Faith did not attend an Ice hockey game on 2022/10/04 as per the available schedule. | 7:00 PM => 0.0
[query_llm] Attendees of Art exhibit opening reception between 5:00 PM and 7:00 PM on 2022/08/01 at Locust Projects, Design District
No specific attendees were mentioned for the Art exhibit opening reception on 2022/08/01 at Locust Projects, Design District. | Sophia => 0.0
[query_llm] Fetch Andrew's activities between 6:00 PM and 8:00 PM on 2022/08/23
[query_llm] Brian's activities on 2022/06/20, specifically focusing on the duration of Happy Hour attendance.
No relevant activities found for Andrew between 6:00 PM and 8:00 PM on 2022/08/23. | Fashion show => 0.0
[query_llm] Georgina's events on 2022/04/30
Brian attended Happy Hour for 2 hours on 202

Processing examples:   2%|▏         | 1/60 [00:32<31:44, 32.27s/it]

[query_llm] Patrick's attendance at Landscape photography workshop on 2022/02/22
Patrick attended the Landscape photography workshop on February 22, 2022, from 8:00 AM to 11:00 AM at Central Park. | 8:00 AM => 1.0
[query_llm] Retrieve activities for Thomas from 8:00 PM to 10:00 PM on 2022/10/03.
Thomas had no scheduled activities from 8:00 PM to 10:00 PM on 2022/10/03. | Star-gazing => 0.0
[query_llm] Events for Emily from 7:00 PM to 9:30 PM on 2022/02/07
No relevant events found for Emily from 7:00 PM to 9:30 PM on 2022/02/07. | Board game night => 0.0
[query_llm] James' activities from 9:00 AM to 11:00 AM on 2022/02/02
No relevant activities found for James from 9:00 AM to 11:00 AM on 2022/02/02. | Business meeting => 0.0
[query_llm] Find the location of the Chess club meeting that Victoria attended on 2022/06/13.
The Chess club meeting that Victoria attended on 2022/06/13 took place at The Chess Forum. | The Chess Forum => 1.0


Processing examples:   3%|▎         | 2/60 [00:34<14:18, 14.80s/it]

[query_llm] Find location of Movie Night attended by Hector on 2022/06/03.
The Movie Night that Hector attended on 2022/06/03 took place on the rooftop of The Standard Hotel. | The Standard Hotel rooftop  => 1.0
[query_llm] Retrieve the duration of George's Tennis lesson on 2022/06/22.
George's Tennis lesson on 2022/06/22 was not found in the retrieved data. Please provide more specific details or check the input data for accuracy. | 1:00:00 => 0.0
[query_llm] William's schedule on 2022/01/17
William did not attend a Music concert on 2022/01/17. | 8:00 PM => 0.0
[query_llm] Georgia's schedule on 2022/06/22
[query_llm] Emily's schedule on 2022/06/15
Emily's attendance at the Jazz concert on 2022/06/15 is not found in the retrieved data. | 8:00 PM => 0.0
[query_llm] Activities of Daisy from 11:00 PM on 2022/01/31 to 1:00 AM on 2022/02/01
No activities found for Daisy from 11:00 PM on 2022/01/31 to 1:00 AM on 2022/02/01. | Late Night Movie => 0.0
[query_llm] Isabella's schedule on 2022/05

Processing examples:   7%|▋         | 4/60 [00:59<12:30, 13.41s/it]

[query_llm] Activities of Adam from 7:00 PM to 9:00 PM on 2022/10/27
No relevant activities found for Adam from 7:00 PM to 9:00 PM on 2022/10/27. | Join a public speaking class => 0.0
[query_llm] Brian's schedule on 2022/06/30
Brian did not attend a Car maintenance check on 2022/06/30. | 9:00 AM => 0.0
[query_llm] Find the location of the Fun run event that Zara attended on 2022/02/19.
The location of the Fun run event that Zara attended on 2022/02/19 is not available in the retrieved data. | The Fun Run Trail => 0.0
[query_llm] Willow's attendance at Summer camp informational session on 2022/04/02
Willow's attendance at the Summer camp informational session on 2022/04/02 is not mentioned in the retrieved data. | 1:30:00 => 0.0
[query_llm] Phillip's attendance at Creative Writing Workshop on 2022/06/28
Phillip attended the Creative Writing Workshop on June 28th, 2022, from 6:00 PM to 8:00 PM at The StoryStudio. | 6:00 PM => 1.0
[query_llm] Albert's attendance duration for Conference Ca

Processing examples:  18%|█▊        | 11/60 [01:02<02:53,  3.54s/it]

Daisy attended the Public Policy Conference for 8 hours on February 25, 2022. | 8:00:00 => 1.0
[query_llm] Retrieve Anthony's activities from 8:00 AM to 10:00 AM on 2022/09/04.
Anthony has no recorded activities from 8:00 AM to 10:00 AM on 2022/09/04. | Classic Car Show => 0.0
[query_llm] Isabella's activities between 2:00 PM and 3:00 PM on 2022/12/03
Isabella's activities between 2:00 PM and 3:00 PM on 2022/12/03 are not listed. No specific events found for this timeframe. | Ballet class => 0.0
[query_llm] Daisy's attendance details for Jazz Performance on 2022/02/05


Processing examples: 100%|██████████| 60/60 [01:02<00:00,  1.05s/it]

[query_llm] Georgina's events on 2022/04/30 including Indie film festival
Daisy attended the Jazz Performance on 2022/02/05 for 3 hours, from 8:00 PM to 11:00 PM. | 3:00:00 => 1.0
Georgina attended the Indie film festival on April 30th, 2022, from 12:00 PM to 10:00 PM at the Los Angeles Film Festival. | 12:00 PM => 1.0





0.4166666666666667