In [2]:
HF_USR_NAME = 'shirwu'
TOOL_QA_ROOT = '/dfs/project/kgrlm/shirwu/msr_intern/home/t-yingxinwu/msr_intern/ToolQA-rebuttal'

### Upload to Huggingface

In [2]:
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict

level = 'easy'
dataset = 'agenda'

dataset_dir = f'{dataset}-{level}.jsonl'
hf_dataset_name = f'toolqa_{dataset}_{level}'

df = pd.read_json(dataset_dir, lines=True)
df.head()

df['answer'] = df['answer'].apply(lambda x: str(x))
dataset = Dataset.from_pandas(df)

In [3]:
dataset_dict = DatasetDict({'train': dataset})
# push to hf for the ease for using dspy
dataset_dict.push_to_hub(repo_id=hf_dataset_name, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/shirwu/toolqa_agenda_easy/commit/0df55d78535ff852f72cc05c36360590a612f5fa', commit_message='Upload dataset', commit_description='', oid='0df55d78535ff852f72cc05c36360590a612f5fa', pr_url=None, pr_revision=None, pr_num=None)

## Setting Up

* ToolQA

Before loading our datasets and going to the execution part, we'll need to configure the `lm` in `dspy.settings`. For the purpose of this notebook we'll be using `gpt-4o`.

In [4]:
import os
import dspy
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)


dspy.settings.configure(
    lm=dspy.OpenAI(
        model="gpt-4o",
        api_key=os.getenv("OPENAI_API_KEY"),
        max_tokens=2048,
        temperature=0.6
    )
)

## Defining Signature

In [5]:
class ToolQASignature(dspy.Signature):
    """You will be given a question. Your task is to answer the question with a short response. For example, (1) Question: \'How many dates in the agenda table have Alexander scheduled?\' => Answer: \'73\' (2) Question: \'What events does Jade have on 2022/01/25 in the agenda table?\' => Answer: \'Kids concert\'
    """
    
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
        format=lambda x: x.strip(),
    )
    answer: str = dspy.OutputField(
        prefix="Answer:",
        desc="answer to the question",
    )


## Loading Datasets

In [6]:
from random import sample
from dspy.datasets import DataLoader

dl = DataLoader()

In [7]:
tool_qa = dl.from_huggingface(
    f'{HF_USR_NAME}/' + hf_dataset_name,
    split="train",
    input_keys=("question", "answer"),
)

Downloading readme:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.37k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
len(tool_qa)

100

In [9]:
import random
# set seed
random.seed(42)

train_idx = random.sample(range(len(tool_qa)), 40)
remaining_idx = list(set(range(len(tool_qa))) - set(train_idx))
test_idx = random.sample(remaining_idx, 60)

toolqa_train = [
    dspy.Example(question=example.question, answer=example.answer).with_inputs("question", "paper_id")
    for example in [tool_qa[i] for i in train_idx]
]
toolqa_test = [
    dspy.Example(question=example.question, answer=example.answer).with_inputs("question", "paper_id")
    for example in [tool_qa[i] for i in test_idx]
]

## Setting Up Tools

We'll setup `Avatar` modules for both signatures and all the `tools` can be used by each of the dataset. `Tool` is a pydantic model that Avatar expects the `tools` to be composed as more specifically it have 4 fields:

* `name` : Name of the tool
* `input_type` : Type of input the tool accepts
* `output_type` : Type of output the tool returns
* `tool` : The actual tool object

In [10]:
import os
import time
import uuid
import numpy as np
import jsonlines
from concurrent.futures import ProcessPoolExecutor
import sentence_transformers
import chromadb
import os.path as osp
from chromadb.config import Settings

EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
CHROMA_PERSIST_DIRECTORY = osp.join(TOOL_QA_ROOT, "data/chroma_db/agenda")
CHROMA_COLLECTION_NAME = "all"
CHROMA_SERVER_HOST = "localhost"
CHROMA_SERVER_HTTP_PORT = "8000"
FILE_PATH = osp.join(TOOL_QA_ROOT, "data/external_corpus/agenda/agenda_descriptions_merged.jsonl")

def sentence_embedding(model, texts):
    embeddings = model.encode(texts)
    return embeddings

def create_chroma_db(chroma_server_host, chroma_server_http_port, collection_name):
    chroma_client = chromadb.Client(Settings(
        chroma_api_impl="rest",
        chroma_server_host=chroma_server_host,
        chroma_server_http_port=chroma_server_http_port,
    ))
    collection = chroma_client.get_or_create_collection(name=collection_name)
    return collection

def create_chroma_db_local(persist_directory, collection_name):
    
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    collection = chroma_client.get_or_create_collection(name=collection_name)
    return collection

def insert_to_db(texts, model_name, cuda_idx, db):
    model = sentence_transformers.SentenceTransformer(model_name, device=f"cuda:{cuda_idx}")

    batch_embeddings = []
    batch_texts = []
    start_time = time.time()
    print(f"Total Articles to process: {len(texts)}, Current Thread: {cuda_idx}.")
    for i, text in enumerate(texts):
        # 2. generate embedding
        embeddings = sentence_embedding(model, text).tolist()

        batch_embeddings.append(embeddings)
        batch_texts.append(text)
        # 3. add to vectorstore per 500 articles or last article
        if i % 100 == 0 or i == len(texts)-1:
            batch_ids = [str(uuid.uuid1()) for _ in batch_texts]
            db.add(
                embeddings=batch_embeddings,
                documents=batch_texts,
                ids = batch_ids
            )
            batch_embeddings = []
            batch_texts = []
            print(f"Completed Processing article count: {i}, Current Thread: {cuda_idx}, Time took: {time.time() - start_time}.")
    print(f"Thread {cuda_idx} Completed. Total time took for thread: {time.time() - start_time}.")


# Multi-processing
def query_llm(query, is_local=True, start=None, end=None):
    EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
    CHROMA_PERSIST_DIRECTORY = osp.join(TOOL_QA_ROOT, "data/chroma_db/agenda")
    os.makedirs(CHROMA_PERSIST_DIRECTORY, exist_ok=True)
    CHROMA_COLLECTION_NAME = "all"
    CHROMA_SERVER_HOST = "localhost"
    CHROMA_SERVER_HTTP_PORT = "8000"
    FILE_PATH = osp.join(TOOL_QA_ROOT, "data/external_corpus/agenda/agenda_descriptions_merged.jsonl")

    cuda_idxes = [0]
    number_of_processes = len(cuda_idxes)
    input_texts = []
    db = create_chroma_db_local(CHROMA_PERSIST_DIRECTORY, CHROMA_COLLECTION_NAME)
    with open(FILE_PATH, 'r') as f:
        for item in jsonlines.Reader(f):
            input_texts.append(item["event"])
    # input_texts = np.array_split(input_texts, number_of_processes)

    args = ((input_texts[i], EMBED_MODEL_NAME, cuda_idxes[i], is_local) for i in range(number_of_processes))

    # if there is no file under the directory "/localscratch/yzhuang43/ra-llm/retrieval_benchmark/data/chroma_db/agenda", insert the data into the db
    if len(os.listdir(CHROMA_PERSIST_DIRECTORY)) == 0:
        insert_to_db(input_texts, model_name=EMBED_MODEL_NAME, cuda_idx=0, db=db)

    input_paths = np.array_split(input_texts, number_of_processes)
    with ProcessPoolExecutor(number_of_processes) as executor:
        executor.map(insert_to_db, args)
    model = sentence_transformers.SentenceTransformer(EMBED_MODEL_NAME, device=f"cuda:0")
    query_embedding = sentence_embedding(model, query).tolist()
    results = db.query(query_embeddings=query_embedding, n_results=5)
    retrieval_content = [result for result in results['documents'][0]]
    # print(retrieval_content)
    retrieval_content = '\n\n'.join(retrieval_content)
    print('[query_llm]', query)
    return retrieval_content

query = "What is the Jessica's agenda on March 7th, 2023?"
print(query_llm(query))

[query_llm] What is the Jessica's genda on March 7th, 2023?
Jessica has a job interview scheduled for July 26th, 2022 at 1 PM. The interview will take place at the Corporate Office Building, and is scheduled to end at 2 PM.

Jessica has a business meeting on January 22nd, 2022 at The Pacific Design Center. The meeting is scheduled to start at 3:00 PM and end at 4:00 PM.

Jessica will be attending a Late Night Movie on January 13, 2022, at a movie theater. The movie will start at 11:00 PM and end at 1:00 AM. She is looking forward to enjoying some popcorn and her favorite movie as she winds down her day.

Jessica is participating in a charity walk happening on April 22nd, 2022. The event is scheduled to start at 9:00 AM and will end at 11:00 AM. The walk will take place in Lincoln Park, and it's an opportunity for participants to raise money for a charitable organization while enjoying a morning walk. Jessica is sure to have a great time giving back to her community and supporting a goo

In [11]:
from dspy.predict.avatar import Tool, Avatar
from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper
from langchain.tools import BaseTool, StructuredTool, tool

def RETRIEVE(query: str) -> str:
    """If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."""
    return query_llm(query)

tools = [
    Tool(
        tool=StructuredTool.from_function(RETRIEVE),
        name="RETRIEVE",
        desc="If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."
    )
]

Once we have defined our `tools`, we can now create an `Avatar` object by passing the `tools` and `signature`. It takes 2 more optional parameters `verbose` and `max_iters`. `verbose` is used to display the logs and `max_iters` is used to control the number of iterations in multi step execution. 

An avatar agent stops the tool usage iteration once it reaches `max_iters` or when it prompts `Finish`. You can also create custom tools too, all you need to make sure is:

* You pass is a class object.
* Implements `__init__` and `run` method.
* Must take 1 string a input and returns 1 string as output.

If your tool doesn't return or takes input a string then you can make a custom wrapper to take care of that for now. In future we'll try to enable a diverse tool usage.

In [12]:
actor_agent = Avatar(
    tools=tools,
    signature=ToolQASignature,
    verbose=False,
    max_iters=10
)

Please use standard predictors, e.g. dspy.Predict and dspy.ChainOfThought.
They now support type annotations and other features of TypedPredictors and tend to work much better out of the box.
Please let us know if you face any issues: https://github.com/stanfordnlp/dspy/issues


## Evaluation

Open enden QA tasks are hard to evaluate on rigid metrics like exact match. So, we'll be using an improvised LLM as Judge for the evaluation of our model on test set.

In [13]:
class Evaluator(dspy.Signature):
    """Please act as an impartial judge to evaluate whether the answer is correct based on the ground truth answer"""
    
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
    )
    reference_answer: str = dspy.InputField(
        prefix="Ground Truth Answer:",
        desc="Ground truth answer to the question.",
    )
    answer: str = dspy.InputField(
        prefix="Answer:",
        desc="Answer to the question given by the model.",
    )
    rationale: str = dspy.OutputField(
        prefix="Rationale:",
        desc="Explanation of why the answer is correct or incorrect.",
    )
    is_correct: float = dspy.OutputField(
        prefix="Correct:",
        desc="Whether the answer is correct. Give 0 if incorrect, 1 if correct, (0, 1) if partially correct.",
    )


evaluator = dspy.TypedPredictor(Evaluator)


def metric(example, prediction, trace=None):
    acc = float(
        evaluator(
            question=example.question,
            answer=prediction.answer,
            reference_answer=example.answer
        ).is_correct
    ) 
    print(prediction.answer, '|', example.answer, '=>', acc)
    return acc

In [14]:
print(toolqa_train[0])
metric(toolqa_train[0], prediction=dspy.Example(answer='73 days'))

 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb


Example({'question': 'What did Georgina do from 1:00 PM to 11:00 PM on 2022/05/18?', 'answer': 'Oktoberfest'}) (input_keys={'paper_id', 'question'})
73 days | Oktoberfest => 0.0


0.0

For evaluation we can't use `dspy.Evaluate`, reason being that `Avatar` changes it's signature per iteration by adding the actions and it's results to it as fields. So we can create our own hacky thread safe evaluator for it.

In [15]:
import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "False"

from concurrent.futures import ThreadPoolExecutor

def process_example(example, signature):
    try:
        avatar = Avatar(
            signature,
            tools=tools,
            verbose=False,
            max_iters=10
        )
        prediction = avatar(**example.inputs().toDict())

        return metric(example, prediction)
    except Exception as e:
        print(e)
        return 0

# process_example(tool_qa[0], ToolQASignature)
def multi_thread_executor(test_set, signature, num_threads=60):
    total_score = 0
    total_examples = len(test_set)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_example, example, signature) for example in test_set]

        for future in tqdm.tqdm(futures, total=total_examples, desc="Processing examples"):
            total_score += future.result()

    avg_metric = total_score / total_examples
    return avg_metric

def single_thread_executor(test_set, signature):
    total_score = 0
    total_examples = len(test_set)

    for example in tqdm.tqdm(test_set, desc="Processing examples"):
        total_score += process_example(example, signature)

    avg_metric = total_score / total_examples
    return avg_metric

## Optimization

For the optimization of the `Actor` we'll be using `AvatarOptimizer`. It's a DSPy implementation of the [Avatar](https://github.com/zou-group/avatar/) method that optimizes the `Actor` for the given `tools` using a comparator module that optimizes Actor instruction. Note, that Actor is the Module that directs tool execution and flow, it's not the signature that we are passing. It doesn't optimize the instruction of the signature we pass. It takes the following parameters:

* `metric`: Metric that we'll be optimizing for
* `max_iters`: Maximum number of iterations for the optimizer
* `lower_bound`: Lower bound for the metric to classify example as negative
* `upper_bound`: Upper bound for the metric to classify example as positive
* `max_positive_inputs`: Maximum number of positive inputs sampled for comparator
* `max_negative_inputs`: Maximum number of negative inputs sampled for comparator
* `optimize_for`: Whether we want to maximize the metric or minimize it during optimization

Once the optimizer is done we can get the optimized actor and use it for the evaluation.

In [16]:
from dspy.teleprompt import AvatarOptimizer

teleprompter = AvatarOptimizer(
    metric=metric,
    max_iters=1,
    max_negative_inputs=20,
    max_positive_inputs=20,
)

In [17]:
optimized_actor_agent = teleprompter.compile(
    student=actor_agent,
    trainset=toolqa_train
)

Iteration 1/1


Processing examples:   0%|          | 0/40 [00:00<?, ?it/s]

[query_llm] Millie's schedule on 2022/06/11
[query_llm] Culinary festival location for Adrian on 2022/09/27
[query_llm] Where did Morning Swim that Matilda attended take place on 2022/07/24?
[query_llm] Thomas attendance records for Artisanal Pizza Making on 2022/10/28
[query_llm] What did Aurora do from 2:00 PM to 3:00 PM on 2022/02/21?
[query_llm] Natalie's attendance duration for Volunteer work on 2022/03/01
[query_llm] Elsie Speed dating event 2022/10/12
[query_llm] Georgia Escape Room 2022/06/24
[query_llm] Anthony's attendance duration at Volleyball Tournament on 2022/11/13
[query_llm] Isabella's schedule on 2022/12/25 from 2:00 PM to 4:00 PM
[query_llm] Charlotte's attendance duration for Piano Recital on 2022/01/30
LA Fitness Aquatic Center | LA Fitness Aquatic Center  => 1.0
Millie did not attend a Wine education seminar on 2022/06/11 according to the retrieved schedule. | 6:00 PM => 0.0
The Culinary Festival that Adrian attended on 2022/09/27 took place at The Convention Cent

Processing examples:   2%|▎         | 1/40 [01:01<39:58, 61.50s/it]

Georgina did not have any events scheduled from 1:00 PM to 11:00 PM on 2022/05/18. | Oktoberfest => 0.0
Layla attended the pottery throwing class on April 24th, 2022, from 10:00 AM to 12:00 PM at The Potter's Wheel. | 10:00 AM => 1.0
Ava did not have any scheduled events from 9:00 AM to 5:00 PM on 2022/06/07 based on the retrieved information. | Industry conference => 0.0


Processing examples: 100%|██████████| 40/40 [01:08<00:00,  1.72s/it]

The information provided does not specify who attended the boxing match at Madison Square Garden on November 6, 2022. | Joseph => 0.0
Average Score: 0.6125
Positive examples: 24
Negative examples: 15
Sampling 20 positive examples and 20 negative examples





Generated new instruction: New Instruction: 

To effectively accomplish the `Goal` using the provided `Tools`, it is essential to apply a structured approach that leverages both existing capabilities and enhanced features for improved performance. Begin by thoroughly analyzing the user query to identify key elements such as event, location, time, and participants. Utilize a more sophisticated NLP model to understand the context and relationships within the query, especially focusing on identifying participants and cross-referencing details. This will ensure that the retrieval logic extracts relevant information from the agenda accurately, addressing the specific needs of the query.

Implement a query refinement process as a pre-processing step. Break down the user query into more specific sub-queries that can be effectively processed by the `RETRIEVE` tool. This step is critical for handling ambiguous queries, ensuring that the tool is used multiple times if necessary, with different i

Now we can evaluate our actor module, for this we've provided an implementation of thread safe evaluator that we above as part of class method of `AvatarOptimizer`.

In [18]:
teleprompter.thread_safe_evaluator(toolqa_test, actor_agent)

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] Phoebe's attendance at the Food festival on 2022/01/10
[query_llm] What did Daisy do from 11:00 PM to 1:00 AM on 2022/02/01?
[query_llm] Willow's attendance duration for Summer camp informational session on 2022/04/02
[query_llm] Where did Movie Night that Hector attended take place on 2022/06/03?
[query_llm] Georgia's events on 2022/06/22
[query_llm] What did Adam do from 7:00 PM to 9:00 PM on 2022/10/27?
[query_llm] Faith's attendance at the Ice hockey game on 2022/10/04
[query_llm] Patrick's events on 2022/02/22, specifically looking for Landscape photography workshop attendance
[query_llm] Brian's attendance duration at Happy Hour on 2022/06/20
[query_llm] What did Alice do from 6:00 PM to 9:00 PM on 2022/07/28?
[query_llm] Who attended Star Gazing between 8:00 PM and 11:00 PM on 2022/08/23 in Observatory, Griffith Park Observatory?
[query_llm] Albert's schedule for 2022/08/11
[query_llm] Benjamin's attendance at Movie night on 2022/10/21
[query_llm] Historical Tour loc

Processing examples:   2%|▏         | 1/60 [00:53<52:46, 53.67s/it]

Thomas did not have any scheduled events from 8:00 PM to 10:00 PM on 2022/10/03. | Star-gazing => 0.0
The Chess club meeting that Victoria attended on 2022/06/13 took place at The Chess Forum. | The Chess Forum => 1.0
There is no information available for what Grace did from 8:00 PM to 9:30 PM on 2022/08/01 in the provided data. | Stand-up comedy show => 0.0
Emily attended the Jazz concert on June 15, 2022, at The Jazz Lounge from 8:00 PM to 10:00 PM. | 8:00 PM => 1.0
Willow attended the Intermediate Spanish conversation class on 2022/09/26 from 11:00 AM to 1:00 PM. | 11:00 AM => 1.0
There is no record of a Wine Tasting event at City Winery on 2022/05/23 between 5:00 PM and 7:00 PM. | Penelope => 0.0
The available data does not mention any specific attendees for the Wine and Paint Event on April 10th, 2022, at the Art Studio Downtown. | Benjamin => 0.0
[query_llm] Daniel's Dance Class attendance duration on 2022/06/24
[query_llm] Charles' attendance duration for 'Watch a baseball game'

Processing examples:  10%|█         | 6/60 [01:12<08:51,  9.85s/it]

Georgia attended Move-in day on 2022/06/22 from 9:00 AM to 5:00 PM. | 9:00 AM => 0.5
[query_llm] Emily's activities on February 7th, 2022 between 7:00 PM and 9:30 PM
Phillip attended the Creative writing workshop on June 28th, 2022, from 6:00 PM to 8:00 PM. | 6:00 PM => 1.0
The Fun Run Trail | The Fun Run Trail => 1.0


Processing examples:  18%|█▊        | 11/60 [01:14<03:47,  4.64s/it]

There is no information available about Isabella's activities from 2:00 PM to 3:00 PM on 2022/12/03. | Ballet class => 0.0
The Martial Arts class that Lily attended on 2022/10/11 took place at The Martial Arts Academy. | The Martial Arts Academy => 1.0


Processing examples: 100%|██████████| 60/60 [01:15<00:00,  1.26s/it]

Emily's activities from 7:00 PM to 9:30 PM on 2022/02/07 are not available in the provided data. | Board game night => 0.0





0.6

## ReAct

In [22]:
from langchain_core.tools import Tool
from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper


tools = [
    Tool(
        func=StructuredTool.from_function(RETRIEVE),
        name="RETRIEVE",
        description="If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."
    ),
    Tool(
        func=GoogleSerperAPIWrapper().run,
        name="WEB_SEARCH",
        description="If you have a question, you can use this tool to search the web for the answer."
    )
]

In [20]:
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent, load_tools
from langchain_openai import ChatOpenAI
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_core.tools import Tool

llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0.0
)
prompt = hub.pull("hwchase17/react")

agent = create_react_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False, handle_parsing_errors=True)

def metric(example, prediction, trace=None):
    return int(
        evaluator(
            question=example.question,
            answer=prediction["output"],
            reference_answer=example.answer
        ).is_correct
    )

def get_metric_value(example):
    prediction = agent_executor.invoke(
        {
            "input": example.question
        }
    )
    return metric(example, prediction)

def process_example(example):
    try:
        return get_metric_value(example)
    except Exception as e:
        print(e)
        return 0

def multi_thread_executor(test_set, num_threads=100):
    total_score = 0
    total_examples = len(test_set)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_example, example) for example in test_set]

        for future in tqdm.tqdm(futures, total=total_examples, desc="Processing examples"):
            total_score += future.result()

    avg_metric = total_score / total_examples
    return avg_metric

In [21]:
multi_thread_executor(toolqa_test)

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] When did Benjamin attend Movie night on 2022/10/21?
[query_llm] What is Emily's agenda on February 7th, 2022?
[query_llm] What is George's agenda on June 22, 2022?
[query_llm] What is Georgina's agenda on April 30th, 2022?
[query_llm] Who attended Poetry reading between 7:00 PM and 9:00 PM on 2022/04/07 in Beyond Baroque?
[query_llm] What is Alice's agenda on July 28th, 2022, from 6:00 PM to 9:00 PM?
[query_llm] What is Millie's agenda on December 7th, 2022?
[query_llm] What is Hector's agenda on June 3rd, 2022?
[query_llm] What is Isabella's agenda on December 3rd, 2022?
[query_llm] What is Georgia's agenda on June 22, 2022?
[query_llm] What is Georgina's agenda on April 18th, 2022?
[query_llm] When did Brian attend Car maintenance check on 2022/06/30?
[query_llm] What is Patrick's agenda on February 22, 2022?
[query_llm] What is Aurora's agenda on January 24th, 2022?
[query_llm] What is George's agenda on June 22, 2022?
[query_llm] What is Elizabeth's agenda on June 29th,

Processing examples:   2%|▏         | 1/60 [01:59<1:57:38, 119.64s/it]

[query_llm] What is Henry's agenda on March 21st, 2022, specifically regarding the Chili cook-off?
[query_llm] What is Esme's agenda on September 13th, 2022, specifically regarding the Robotics seminar?
[query_llm] What is Charlotte's agenda on October 31, 2022, from 4:00 PM to 5:30 PM?
[query_llm] What is Daisy's agenda on February 1st, 2022, from 11:00 PM to 1:00 AM?
[query_llm] What did Matilda do from 7:30 PM to 9:30 PM on March 25th, 2022?
[query_llm] What did Anthony do from 8:00 AM to 10:00 AM on September 4th, 2022?
[query_llm] What did Grace do from 8:00 PM to 9:30 PM on August 1st, 2022?
[query_llm] What is Charles's agenda for attending a baseball game on May 21, 2022?
[query_llm] What is Brian's agenda on June 20th, 2022?
[query_llm] What is Matilda's agenda on December 13th, 2022?
[query_llm] What is Daisy's agenda on February 5th, 2022?
[query_llm] What is Albert's agenda on April 24th, 2022?
[query_llm] What did Adam do from 7:00 PM to 9:00 PM on October 27, 2022?
[query

Processing examples:  38%|███▊      | 23/60 [02:11<01:40,  2.73s/it]  

[query_llm] What is Albert's agenda on April 24th, 2022?


Processing examples:  48%|████▊     | 29/60 [02:27<01:04,  2.09s/it]

[query_llm] What did Andrew do on August 23, 2022, from 6:00 PM to 8:00 PM?


Processing examples: 100%|██████████| 60/60 [02:56<00:00,  2.93s/it]


0.38333333333333336