In [2]:
HF_USR_NAME = 'shirwu'
TOOL_QA_ROOT = '/dfs/project/kgrlm/shirwu/msr_intern/home/t-yingxinwu/msr_intern/ToolQA-rebuttal'

### Upload to Huggingface

In [2]:
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict

level = 'hard'
dataset = 'agenda'

dataset_dir = f'{dataset}-{level}.jsonl'
hf_dataset_name = f'toolqa_{dataset}_{level}'

df = pd.read_json(dataset_dir, lines=True)
df.head()

df['answer'] = df['answer'].apply(lambda x: str(x))
dataset = Dataset.from_pandas(df)

In [3]:
dataset_dict = DatasetDict({'train': dataset})
# push to hf for the ease for using dspy
dataset_dict.push_to_hub(repo_id=hf_dataset_name, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/shirwu/toolqa_agenda_hard/commit/7fda19a60f9bb7375bd2d417cdb1742d2af28869', commit_message='Upload dataset', commit_description='', oid='7fda19a60f9bb7375bd2d417cdb1742d2af28869', pr_url=None, pr_revision=None, pr_num=None)

## Setting Up

* ToolQA

Before loading our datasets and going to the execution part, we'll need to configure the `lm` in `dspy.settings`. For the purpose of this notebook we'll be using `gpt-4o`.

In [4]:
import os
import dspy
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)


dspy.settings.configure(
    lm=dspy.OpenAI(
        model="gpt-4o",
        api_key=os.getenv("OPENAI_API_KEY"),
        max_tokens=4000,
        temperature=0
    )
)

## Defining Signature

In [5]:
class ToolQASignature(dspy.Signature):
    """You will be given a question. Your task is to answer the question with a short response. For example, (1) Question: \'How many dates in the agenda table have Alexander scheduled?\' => Answer: \'73\' (2) Question: \'What events does Jade have on 2022/01/25 in the agenda table?\' => Answer: \'Kids concert\'
    """
    
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
        format=lambda x: x.strip(),
    )
    answer: str = dspy.OutputField(
        prefix="Answer:",
        desc="answer to the question",
    )


## Loading Datasets

In [6]:
from random import sample
from dspy.datasets import DataLoader

dl = DataLoader()

In [7]:
tool_qa = dl.from_huggingface(
    f'{HF_USR_NAME}/' + hf_dataset_name,
    split="train",
    input_keys=("question", "answer"),
)

Downloading readme:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.25k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
len(tool_qa)

100

In [9]:
import random
# set seed
random.seed(42)

train_idx = random.sample(range(len(tool_qa)), 40)
remaining_idx = list(set(range(len(tool_qa))) - set(train_idx))
test_idx = random.sample(remaining_idx, 60)

toolqa_train = [
    dspy.Example(question=example.question, answer=example.answer).with_inputs("question", "paper_id")
    for example in [tool_qa[i] for i in train_idx]
]
toolqa_test = [
    dspy.Example(question=example.question, answer=example.answer).with_inputs("question", "paper_id")
    for example in [tool_qa[i] for i in test_idx]
]

## Setting Up Tools

We'll setup `Avatar` modules for both signatures and all the `tools` can be used by each of the dataset. `Tool` is a pydantic model that Avatar expects the `tools` to be composed as more specifically it have 4 fields:

* `name` : Name of the tool
* `input_type` : Type of input the tool accepts
* `output_type` : Type of output the tool returns
* `tool` : The actual tool object

In [10]:
import os
import time
import uuid
import numpy as np
import jsonlines
from concurrent.futures import ProcessPoolExecutor
import sentence_transformers
import chromadb
import os.path as osp
from chromadb.config import Settings

EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
CHROMA_PERSIST_DIRECTORY = osp.join(TOOL_QA_ROOT, "data/chroma_db/agenda")
CHROMA_COLLECTION_NAME = "all"
CHROMA_SERVER_HOST = "localhost"
CHROMA_SERVER_HTTP_PORT = "8000"
FILE_PATH = osp.join(TOOL_QA_ROOT, "data/external_corpus/agenda/agenda_descriptions_merged.jsonl")

def sentence_embedding(model, texts):
    embeddings = model.encode(texts)
    return embeddings

def create_chroma_db(chroma_server_host, chroma_server_http_port, collection_name):
    chroma_client = chromadb.Client(Settings(
        chroma_api_impl="rest",
        chroma_server_host=chroma_server_host,
        chroma_server_http_port=chroma_server_http_port,
    ))
    collection = chroma_client.get_or_create_collection(name=collection_name)
    return collection

def create_chroma_db_local(persist_directory, collection_name):
    
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    collection = chroma_client.get_or_create_collection(name=collection_name)
    return collection

def insert_to_db(texts, model_name, cuda_idx, db):
    model = sentence_transformers.SentenceTransformer(model_name, device=f"cuda:{cuda_idx}")

    batch_embeddings = []
    batch_texts = []
    start_time = time.time()
    print(f"Total Articles to process: {len(texts)}, Current Thread: {cuda_idx}.")
    for i, text in enumerate(texts):
        # 2. generate embedding
        embeddings = sentence_embedding(model, text).tolist()

        batch_embeddings.append(embeddings)
        batch_texts.append(text)
        # 3. add to vectorstore per 500 articles or last article
        if i % 100 == 0 or i == len(texts)-1:
            batch_ids = [str(uuid.uuid1()) for _ in batch_texts]
            db.add(
                embeddings=batch_embeddings,
                documents=batch_texts,
                ids = batch_ids
            )
            batch_embeddings = []
            batch_texts = []
            print(f"Completed Processing article count: {i}, Current Thread: {cuda_idx}, Time took: {time.time() - start_time}.")
    print(f"Thread {cuda_idx} Completed. Total time took for thread: {time.time() - start_time}.")


# Multi-processing
def query_llm(query, is_local=True, start=None, end=None):
    EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
    CHROMA_PERSIST_DIRECTORY = osp.join(TOOL_QA_ROOT, "data/chroma_db/agenda")
    os.makedirs(CHROMA_PERSIST_DIRECTORY, exist_ok=True)
    CHROMA_COLLECTION_NAME = "all"
    CHROMA_SERVER_HOST = "localhost"
    CHROMA_SERVER_HTTP_PORT = "8000"
    FILE_PATH = osp.join(TOOL_QA_ROOT, "data/external_corpus/agenda/agenda_descriptions_merged.jsonl")

    cuda_idxes = [0]
    number_of_processes = len(cuda_idxes)
    input_texts = []
    db = create_chroma_db_local(CHROMA_PERSIST_DIRECTORY, CHROMA_COLLECTION_NAME)
    with open(FILE_PATH, 'r') as f:
        for item in jsonlines.Reader(f):
            input_texts.append(item["event"])
    # input_texts = np.array_split(input_texts, number_of_processes)

    args = ((input_texts[i], EMBED_MODEL_NAME, cuda_idxes[i], is_local) for i in range(number_of_processes))

    # if there is no file under the directory "/localscratch/yzhuang43/ra-llm/retrieval_benchmark/data/chroma_db/agenda", insert the data into the db
    if len(os.listdir(CHROMA_PERSIST_DIRECTORY)) == 0:
        insert_to_db(input_texts, model_name=EMBED_MODEL_NAME, cuda_idx=0, db=db)

    input_paths = np.array_split(input_texts, number_of_processes)
    with ProcessPoolExecutor(number_of_processes) as executor:
        executor.map(insert_to_db, args)
    model = sentence_transformers.SentenceTransformer(EMBED_MODEL_NAME, device=f"cuda:0")
    query_embedding = sentence_embedding(model, query).tolist()
    results = db.query(query_embeddings=query_embedding, n_results=5)
    retrieval_content = [result for result in results['documents'][0]]
    # print(retrieval_content)
    retrieval_content = '\n\n'.join(retrieval_content)
    print('[query_llm]', query)
    return retrieval_content

query = "What is the Jessica's genda on March 7th, 2023?"
print(query_llm(query))

[query_llm] What is the Jessica's genda on March 7th, 2023?
Jessica has a job interview scheduled for July 26th, 2022 at 1 PM. The interview will take place at the Corporate Office Building, and is scheduled to end at 2 PM.

Jessica has a business meeting on January 22nd, 2022 at The Pacific Design Center. The meeting is scheduled to start at 3:00 PM and end at 4:00 PM.

Jessica will be attending a Late Night Movie on January 13, 2022, at a movie theater. The movie will start at 11:00 PM and end at 1:00 AM. She is looking forward to enjoying some popcorn and her favorite movie as she winds down her day.

Jessica is participating in a charity walk happening on April 22nd, 2022. The event is scheduled to start at 9:00 AM and will end at 11:00 AM. The walk will take place in Lincoln Park, and it's an opportunity for participants to raise money for a charitable organization while enjoying a morning walk. Jessica is sure to have a great time giving back to her community and supporting a goo

In [11]:
from dspy.predict.avatar import Tool, Avatar
from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper
from langchain.tools import BaseTool, StructuredTool, tool

def RETRIEVE(query: str) -> str:
    """If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."""
    return query_llm(query)

tools = [
    Tool(
        tool=StructuredTool.from_function(RETRIEVE),
        name="RETRIEVE",
        desc="If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."
    )
]

Once we have defined our `tools`, we can now create an `Avatar` object by passing the `tools` and `signature`. It takes 2 more optional parameters `verbose` and `max_iters`. `verbose` is used to display the logs and `max_iters` is used to control the number of iterations in multi step execution. 

An avatar agent stops the tool usage iteration once it reaches `max_iters` or when it prompts `Finish`. You can also create custom tools too, all you need to make sure is:

* You pass is a class object.
* Implements `__init__` and `run` method.
* Must take 1 string a input and returns 1 string as output.

If your tool doesn't return or takes input a string then you can make a custom wrapper to take care of that for now. In future we'll try to enable a diverse tool usage.

In [12]:
actor_agent = Avatar(
    tools=tools,
    signature=ToolQASignature,
    verbose=False,
    max_iters=20
)

Please use standard predictors, e.g. dspy.Predict and dspy.ChainOfThought.
They now support type annotations and other features of TypedPredictors and tend to work much better out of the box.
Please let us know if you face any issues: https://github.com/stanfordnlp/dspy/issues


## Evaluation

Open enden QA tasks are hard to evaluate on rigid metrics like exact match. So, we'll be using an improvised LLM as Judge for the evaluation of our model on test set.

In [13]:
class Evaluator(dspy.Signature):
    """Please act as an impartial judge to evaluate whether the answer is correct based on the ground truth answer"""
    
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
    )
    reference_answer: str = dspy.InputField(
        prefix="Ground Truth Answer:",
        desc="Ground truth answer to the question.",
    )
    answer: str = dspy.InputField(
        prefix="Answer:",
        desc="Answer to the question given by the model.",
    )
    rationale: str = dspy.OutputField(
        prefix="Rationale:",
        desc="Explanation of why the answer is correct or incorrect.",
    )
    is_correct: float = dspy.OutputField(
        prefix="Correct:",
        desc="Whether the answer is correct. Give 0 if incorrect, 1 if correct, (0, 1) if partially correct.",
    )


evaluator = dspy.TypedPredictor(Evaluator)


def metric(example, prediction, trace=None):
    acc = float(
        evaluator(
            question=example.question,
            answer=prediction.answer,
            reference_answer=example.answer
        ).is_correct
    ) 
    print(prediction.answer, '|', example.answer, '=>', acc)
    return acc

In [14]:
print(toolqa_train[0])
metric(toolqa_train[0], prediction=dspy.Example(answer='73 days'))

 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb


Example({'question': 'How many dates in the agenda table have Alexander scheduled?', 'answer': '73'}) (input_keys={'question', 'paper_id'})
73 days | 73 => 1.0


1.0

For evaluation we can't use `dspy.Evaluate`, reason being that `Avatar` changes it's signature per iteration by adding the actions and it's results to it as fields. So we can create our own hacky thread safe evaluator for it.

In [15]:
import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "False"

from concurrent.futures import ThreadPoolExecutor

def process_example(example, signature):
    try:
        avatar = Avatar(
            signature,
            tools=tools,
            verbose=False,
            max_iters=20
        )
        prediction = avatar(**example.inputs().toDict())

        return metric(example, prediction)
    except Exception as e:
        print(e)
        return 0

# process_example(tool_qa[0], ToolQASignature)
def multi_thread_executor(test_set, signature, num_threads=60):
    total_score = 0
    total_examples = len(test_set)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_example, example, signature) for example in test_set]

        for future in tqdm.tqdm(futures, total=total_examples, desc="Processing examples"):
            total_score += future.result()

    avg_metric = total_score / total_examples
    return avg_metric

def single_thread_executor(test_set, signature):
    total_score = 0
    total_examples = len(test_set)

    for example in tqdm.tqdm(test_set, desc="Processing examples"):
        total_score += process_example(example, signature)

    avg_metric = total_score / total_examples
    return avg_metric

## Optimization

For the optimization of the `Actor` we'll be using `AvatarOptimizer`. It's a DSPy implementation of the [Avatar](https://github.com/zou-group/avatar/) method that optimizes the `Actor` for the given `tools` using a comparator module that optimizes Actor instruction. Note, that Actor is the Module that directs tool execution and flow, it's not the signature that we are passing. It doesn't optimize the instruction of the signature we pass. It takes the following parameters:

* `metric`: Metric that we'll be optimizing for
* `max_iters`: Maximum number of iterations for the optimizer
* `lower_bound`: Lower bound for the metric to classify example as negative
* `upper_bound`: Upper bound for the metric to classify example as positive
* `max_positive_inputs`: Maximum number of positive inputs sampled for comparator
* `max_negative_inputs`: Maximum number of negative inputs sampled for comparator
* `optimize_for`: Whether we want to maximize the metric or minimize it during optimization

Once the optimizer is done we can get the optimized actor and use it for the evaluation.

In [16]:
from dspy.teleprompt import AvatarOptimizer

teleprompter = AvatarOptimizer(
    metric=metric,
    max_iters=2,
    max_negative_inputs=20,
    max_positive_inputs=20,
    upper_bound=0,
    lower_bound=0 # special case when we need some perturbation if no pos/neg
)

In [17]:
optimized_actor_agent = teleprompter.compile(
    student=actor_agent,
    trainset=toolqa_train
)

Iteration 1/2


Processing examples:   0%|          | 0/40 [00:00<?, ?it/s]

[query_llm] How many people are between 2:30 PM and 3:30 PM on 2022/04/10 in the agenda table?
0 | 9 => 0.0
[query_llm] What events does Jade have on 2022/01/25 in the agenda table?
Jade has no events on 2022/01/25 in the agenda table. | Kids concert => 0.0
[query_llm] Check Layla's availability on 2022/11/26 from 9:00 AM to 6:00 PM in the agenda table.
Layla is available for a meeting on 2022/11/26 from 9:00 AM to 6:00 PM as there are no scheduled events on that day in the agenda table. | 9:00 AM-2:00 PM, 5:00 PM-6:00 PM => 0.0
[query_llm] Count the number of events on 2022/03/09 in the agenda table.
0 | 24 => 0.0
[query_llm] How many dates in the agenda table have Faith scheduled?
5 | 85 => 0.0
[query_llm] How many dates in the agenda table have Layla scheduled?
Action 1: {"tool_name":"Finish","tool_input_query":"5"}

Result 1: Gathered all information needed to finish the task.

Answer: 5 | 74 => 0.0
[query_llm] Count the number of events on 2022/05/22 in the agenda table.
0 | 18 =>

Processing examples:   2%|▎         | 1/40 [00:15<10:01, 15.42s/it]

[query_llm] How many dates in the agenda table have Alexander scheduled?
5 | 73 => 0.0
[query_llm] How many dates in the agenda table have Jasmine scheduled?
4 | 86 => 0.0
[query_llm] What events does Stephen have on 2022/05/22 in the agenda table?
Stephen has no events scheduled on 2022/05/22 in the agenda table. | Employee team building event => 0.0
[query_llm] How many dates in the agenda table have Andrew scheduled?
5 | 88 => 0.0
[query_llm] Number of people between 3:00 PM and 4:00 PM on 2022/01/19 in the agenda table
[query_llm] Count the number of events on 2022/05/30 in the agenda table.
0 | 3 => 0.0
0 | 33 => 0.0
[query_llm] Number of people between 8:00 PM and 9:00 PM on 2022/09/15 in the agenda table
0 | 8 => 0.0
[query_llm] Check Emma's availability on 2022/06/14 between 9:00 AM and 6:00 PM in the agenda table.
Emma is available for a meeting on 2022/06/14 between 10:00 AM and 6:00 PM. | 9:00 AM-6:30 PM, 7:30 PM-6:00 PM => 0.5
[query_llm] What events does Elsie have on 2022

Processing examples:  15%|█▌        | 6/40 [00:46<04:11,  7.39s/it]

[query_llm] Count the number of people scheduled between 12:00 PM and 2:00 PM on 2022/09/13 in the agenda table.
0 | 11 => 0.0
[query_llm] What events does Millie have on 2022/12/07 in the agenda table?
[query_llm] How many people are between 2:00 PM and 4:00 PM on 2022/03/17 in the agenda table?
Millie does not have any events on 2022/12/07 in the agenda table. | Theatre play => 0.0
[query_llm] How many dates in the agenda table have Hector scheduled?


Processing examples:  22%|██▎       | 9/40 [00:51<02:33,  4.95s/it]

There are no events scheduled between 2:00 PM and 4:00 PM on 2022/03/17 in the agenda table, so the number of people is 0. | 10 => 0.0
4 | 92 => 0.0
[query_llm] Count the number of events on 2022/10/11 in the agenda table.
0 | 23 => 0.0
[query_llm] How many people are unavailable between 1:00 PM and 3:00 PM on 2022/06/17 in the agenda table?
I'm sorry, but the retrieved data does not provide information about the number of people unavailable between 1:00 PM and 3:00 PM on 2022/06/17. Please check the agenda table for the specific date and time. | 6 => 0.0
[query_llm] What events does David have on 2022/05/28 in the agenda table?
David has no events scheduled on 2022/05/28 in the agenda table. | Art Exhibition => 0.0
[query_llm] Count the number of events on 2022/09/25 in the agenda table.
0 | 24 => 0.0
[query_llm] How many dates in the agenda table have Chloe scheduled?
[query_llm] Count the number of events on 2022/04/27 in the agenda table.
3 | 88 => 0.0
0 | 20 => 0.0
[query_llm] Wha

Processing examples: 100%|██████████| 40/40 [00:53<00:00,  1.34s/it]

There are no events listed on 2022/12/01 in the agenda table. | 29 => 0.0
5 | 95 => 0.0
[query_llm] How many dates in the agenda table have Sophie scheduled?
3 | 83 => 0.0
Average Score: 0.0125
Positive examples: 40
Negative examples: 39
Sampling 20 positive examples and 20 negative examples





Generated new instruction: New Instruction: To effectively accomplish the `Goal` using the provided `Tools`, begin by carefully analyzing the user query to determine the most appropriate tool and input values. Retain the general guideline of outputting an `Action` that specifies the tool to use and the input query to pass to the tool. You may choose to use no tools and provide the final answer directly if the query is straightforward. Additionally, you can use a tool multiple times with different input queries if necessary.

To improve performance on negative inputs, enhance the specificity and clarity of your tool input queries. Implement a more sophisticated natural language processing (NLP) system to better parse and understand user queries, ensuring that the input queries are accurately formed and relevant to the task. This will help in retrieving precise information, especially for queries that are broad or not clearly defined. Additionally, ensure that the agenda data is well-str

Processing examples:   0%|          | 0/40 [00:00<?, ?it/s]

[query_llm] Count the number of events on 2022/12/01 in the agenda table.
[query_llm] Retrieve events for Millie on 2022/12/07 from the agenda table.
[query_llm] Check Patrick's availability on 2022/07/26 between 9:00 AM and 6:00 PM in the agenda table.
[query_llm] Retrieve the number of people scheduled between 8:00 PM and 9:00 PM on 2022/09/15 from the agenda table.
[query_llm] Count the number of dates in the agenda table where Sophie has scheduled events.
[query_llm] Count the number of dates Alexander has scheduled in the agenda table.
[query_llm] Count the number of events on 2022/03/09 in the agenda table.
[query_llm] Check Emma's availability on 2022/06/14 between 9:00 AM and 6:00 PM in the agenda table.
[query_llm] Count the number of unique dates in the agenda table where Henry has scheduled events.
There are no events listed on 2022/12/01 in the agenda table. | 29 => 0.0
[query_llm] Count the number of dates Jasmine has scheduled in the agenda table.
[query_llm] Count the nu

Processing examples:   2%|▎         | 1/40 [00:35<22:54, 35.25s/it]

4 | 95 => 0.0
Sophie has scheduled events on 3 dates. | 83 => 0.0
Alexander has 0 dates scheduled in the agenda table. | 73 => 0.0
There are no events listed on 2022/01/10 in the agenda table. | 24 => 0.0
Jasmine has 4 dates scheduled in the agenda table. | 86 => 0.0
Patrick is available for a meeting on 2022/07/26 between 9:00 AM and 6:00 PM as there are no scheduled events for that day in the agenda table. | 9:00 AM-2:00 PM, 4:00 PM-6:00 PM => 0.0
There are no events listed on 2022/03/09 in the agenda table. | 24 => 0.0
[query_llm] Count the number of events on 2022/04/27 in the agenda table.
[query_llm] Retrieve events for David on 2022/05/28 from the agenda table.
[query_llm] Find the number of people scheduled between 2:30 PM and 3:30 PM on 2022/04/10 in the agenda table.
David has no events scheduled on 2022/05/28 in the agenda table. | Art Exhibition => 0.0
There are no events listed on 2022/04/27 in the agenda table. | 20 => 0.0
There is no information available for events sche

Processing examples:   5%|▌         | 2/40 [00:55<16:39, 26.31s/it]

There are no events listed on 2022/05/22 in the agenda table. | 18 => 0.0
No events found for Elsie on 2022/04/17 in the agenda table. | Casino night => 0.0
There are no events listed on 2022/05/06 in the agenda table. | 21 => 0.0
Millie is available for a meeting on 2022/01/01 between 9:00 AM and 6:00 PM as there are no scheduled events for her on that date. | 9:00 AM-10:00 AM, 1:00 PM-6:00 PM => 0.0
[query_llm] List all events and people scheduled between 10:00 AM and 12:00 PM on 2022/06/23 in the agenda table.
0 | 3 => 0.0
There is no information available for people unavailable between 1:00 PM and 3:00 PM on 2022/06/17 in the agenda table. | 6 => 0.0
There is no information available for events between 8:00 PM and 9:00 PM on 2022/09/15 in the agenda table. | 8 => 0.0
There is no information available for the specified date and time range in the agenda table. | 10 => 0.0
[query_llm] Check the agenda table for any events scheduled between 10:00 AM and 12:00 PM on 2022/06/23 and list 

Processing examples: 100%|██████████| 40/40 [01:13<00:00,  1.83s/it]

There is no information available for events scheduled between 10:00 AM and 12:00 PM on 2022/06/23 in the agenda table. | 7 => 0.0
Average Score: 0.0125
Positive examples: 40
Negative examples: 39
Sampling 20 positive examples and 20 negative examples





Generated new instruction: New Instruction: To effectively accomplish the `Goal` using the provided `Tools`, begin by carefully analyzing the user query to determine the most appropriate tool and input values. Retain the general guideline of outputting an `Action` that specifies the tool to use and the input query to pass to the tool. You may choose to use no tools and provide the final answer directly if the query is straightforward. Additionally, you can use a tool multiple times with different input queries if necessary. 

To improve performance on negative inputs, focus on enhancing the specificity and clarity of your tool input queries. Implement a more sophisticated natural language processing (NLP) system to better parse and understand user queries, ensuring that the input queries are accurately formed and relevant to the task. This will help in retrieving precise information, especially for queries that are broad or not clearly defined. Encourage the use of more specific and de

Now we can evaluate our actor module, for this we've provided an implementation of thread safe evaluator that we above as part of class method of `AvatarOptimizer`.

In [18]:
teleprompter.thread_safe_evaluator(toolqa_test, optimized_actor_agent)

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm][query_llm] Count the number of unique dates in the agenda table where Christopher has scheduled events.
 Check Madison's availability on 2022/12/13 between 9:00 AM and 6:00 PM in the agenda table.
[query_llm] Retrieve events for Andrew on 2022/06/12 from the agenda table.
[query_llm] Check Amelia's availability on 2022/10/18 between 9:00 AM and 6:00 PM in the agenda table.
[query_llm] Count the number of dates in the agenda table where Imogen has scheduled events.
[query_llm] Count the number of events on 2022/01/10 in the agenda table.
There are no events listed on 2022/01/10 in the agenda table. | 24 => 0.0
[query_llm] Count the number of events scheduled on 2022/01/31 in the agenda table.
[query_llm] Find events for James on 2022/10/23 in the agenda table.
[query_llm] Find events for George on 2022/07/03 in the agenda table.
[query_llm] Retrieve events for Lily on 2022/04/07 from the agenda table.
[query_llm] Count the number of events on 2022/01/06 in the agenda table.


Processing examples:   2%|▏         | 1/60 [00:47<46:29, 47.27s/it]

Christopher has scheduled events on 3 unique dates. | 86 => 0.0
There are no events listed on 2022/01/06 in the agenda table. | 30 => 0.0
Andrew has no events scheduled on 2022/06/12 in the agenda table. | Artisanal candy making class => 0.0
Amelia is available all day on 2022/10/18 between 9:00 AM and 6:00 PM as there are no events listed for that date. | 1:00 PM-6:00 PM, 8:00 PM-6:00 PM => 0.0
Hector is available all day on 2022/01/12 between 9:00 AM and 6:00 PM. | 9:00 AM-9:00 PM, 11:00 PM-6:00 PM => 0.0
Madison is available for a meeting on 2022/12/13 between 9:00 AM and 6:00 PM as there are no scheduled events for that day in the agenda table. | 9:00 AM-2:00 PM, 4:00 PM-6:00 PM => 0.0
Lucy is available all day on 2022/10/09 between 9:00 AM and 6:00 PM as there are no conflicting events in the agenda table. | 9:00 AM-11:00 AM, 12:00 PM-6:00 PM => 0.0
[query_llm] Check Lily's availability on 2022/02/17 between 9:00 AM and 6:00 PM in the agenda table.
[query_llm] Check Daisy's availa

Processing examples:   3%|▎         | 2/60 [01:35<46:18, 47.90s/it]

Alice is available all day on 2022/01/06 between 9:00 AM and 6:00 PM. | 9:00 AM-1:00 PM, 3:00 PM-6:00 PM => 0.0
[query_llm] Retrieve the number of people scheduled for any events between 9:00 AM and 1:00 PM on 2022/10/06 from the agenda table.
[query_llm] Retrieve the number of people scheduled between 7:00 PM and 9:00 PM on 2022/10/01 from the agenda table.
'3' | 83 => 0.0
Amelia has no events on 2022/09/18 in the agenda table. | Yoga and meditation => 0.0
[query_llm] Search for any scheduled events for Florence in the agenda table and count the number of unique dates.
[query_llm] Retrieve events for Benjamin on 2022/06/10 from the agenda table, ensuring the date and person match exactly.
[query_llm] List all events and people scheduled between 8:00 AM and 12:00 PM on 2022/12/26 in the agenda table.
There are no events listed for 2022/07/15 in the agenda table. | 27 => 0.0
[query_llm] Retrieve the number of people scheduled between 3:00 PM and 4:00 PM on 2022/05/27 from the agenda tab

Processing examples:  22%|██▏       | 13/60 [01:37<03:46,  4.82s/it]

Faith is available for a meeting on 2022/04/01 between 9:00 AM and 6:00 PM as there are no conflicting events listed in the agenda table. | 9:00 AM-8:00 PM, 10:00 PM-6:00 PM => 1.0
[query_llm] List all events and people scheduled between 9:00 AM and 3:00 PM on 2022/06/20 in the agenda table.


Processing examples:  30%|███       | 18/60 [01:40<02:16,  3.24s/it]

There is no information available for events scheduled between 7:00 PM and 9:00 PM on 2022/10/01 in the agenda table. | 9 => 0.0
Benjamin has no events on 2022/06/10 in the agenda table. | Join a fitness bootcamp => 0.0
There is no data available for events scheduled between 3:00 PM and 4:00 PM on 2022/05/27 in the agenda table. | 4 => 0.0
Florence has events scheduled on 5 unique dates in the agenda table. | 69 => 0.0
[query_llm] Check the agenda table for any events scheduled between 8:00 AM and 12:00 PM on 2022/12/26 and list the people involved.
There is no information available for events scheduled between 9:00 AM and 1:00 PM on 2022/10/06 in the agenda table. | 7 => 0.0


Processing examples:  35%|███▌      | 21/60 [01:43<01:45,  2.69s/it]

No events found for 2022/06/20 between 9:00 AM and 3:00 PM in the agenda table. | 14 => 0.0


Processing examples: 100%|██████████| 60/60 [01:45<00:00,  1.75s/it]

There are no people scheduled between 8:00 AM and 12:00 PM on 2022/12/26 in the agenda table. | 9 => 0.0





0.041666666666666664

## ReAct

In [19]:
from langchain_core.tools import Tool
from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper


tools = [
    Tool(
        func=StructuredTool.from_function(RETRIEVE),
        name="RETRIEVE",
        description="If you want to search for some personal agenda information, you can use this tool and input a natural language query. For example, RETRIEVE('What is the Jessica's genda on March 7th, 2023?') returns 'Jessica has a job interview scheduled for July 26th, 2022 at 1 PM...'."
    ),
    Tool(
        func=GoogleSerperAPIWrapper().run,
        name="WEB_SEARCH",
        description="If you have a question, you can use this tool to search the web for the answer."
    )
]

In [20]:
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent, load_tools
from langchain_openai import ChatOpenAI
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_core.tools import Tool

llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0.0
)
prompt = hub.pull("hwchase17/react")

agent = create_react_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False, handle_parsing_errors=True)

def metric(example, prediction, trace=None):
    return int(
        evaluator(
            question=example.question,
            answer=prediction["output"],
            reference_answer=example.answer
        ).is_correct
    )

def get_metric_value(example):
    prediction = agent_executor.invoke(
        {
            "input": example.question
        }
    )
    return metric(example, prediction)

def process_example(example):
    try:
        return get_metric_value(example)
    except Exception as e:
        print(e)
        return 0

def multi_thread_executor(test_set, num_threads=100):
    total_score = 0
    total_examples = len(test_set)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_example, example) for example in test_set]

        for future in tqdm.tqdm(futures, total=total_examples, desc="Processing examples"):
            total_score += future.result()

    avg_metric = total_score / total_examples
    return avg_metric

In [21]:
multi_thread_executor(toolqa_test)

Processing examples:   0%|          | 0/60 [00:00<?, ?it/s]

[query_llm] How many dates in the agenda table have Elsie scheduled?
[query_llm] How many dates in the agenda table have Florence scheduled?
[query_llm] What is the agenda on March 13th, 2022, between 12:00 PM and 3:00 PM?
[query_llm] What events are scheduled on 2022/01/06 in the agenda table?
[query_llm] How many dates in the agenda table have Christopher scheduled?
[query_llm] What events does Brian have on 2022/03/17 in the agenda table?
[query_llm] What is Madison's agenda on December 13th, 2022?
[query_llm] How many dates in the agenda table have Charles scheduled?
[query_llm] How many dates in the agenda table have Summer scheduled?
[query_llm] What is the agenda on October 6th, 2022, between 9:00 AM and 1:00 PM?
[query_llm] How many dates in the agenda table have Hannah scheduled?
[query_llm] How many dates in the agenda table have Harper scheduled?
[query_llm] What is Alice's agenda on January 6th, 2022?
[query_llm] What is Victoria's agenda on August 5th, 2022?
[query_llm] Wh

Processing examples:   2%|▏         | 1/60 [01:04<1:03:22, 64.45s/it]

[query_llm][query_llm] What is the agenda on June 7th, 2022?
 What events does Benjamin have on 2022/06/10 in the agenda table?
[query_llm] What events are scheduled on 2022/01/10 in the agenda table?
[query_llm] What is Andrew's agenda on June 12th, 2022?
[query_llm] What events are scheduled on 2022/12/18 in the agenda table?
[query_llm] What is Joshua's agenda on December 2nd, 2022?
[query_llm] What is the agenda on March 13th, 2022?
[query_llm] What events are scheduled on September 20th, 2022 in the agenda table?
[query_llm] What events does Brian have on March 17th, 2022 in the agenda table?
[query_llm] What is Summer's agenda on April 7th, 2022?
[query_llm] What is the agenda on October 6th, 2022?
[query_llm] What events does James have on 2022/11/13 in the agenda table?
[query_llm] What events are scheduled on January 31st, 2022 in the agenda table?
[query_llm] What is Amelia's agenda on September 18th, 2022?
[query_llm] What events are scheduled on July 15th, 2022 in the agend

Processing examples:   3%|▎         | 2/60 [02:16<1:06:40, 68.97s/it]

[query_llm] What is Joshua's agenda on December 2nd, 2022?
[query_llm] What events are scheduled on March 13th, 2022, between 12:00 PM and 3:00 PM?
[query_llm] What events are scheduled on October 6th, 2022, between 9:00 AM and 1:00 PM?
[query_llm] What events are scheduled on January 31st, 2022 in the agenda table?
[query_llm] What events are scheduled on July 15th, 2022 in the agenda table?
[query_llm] What is Jessica's agenda on November 28th, 2022?
[query_llm] What is the agenda on December 19th, 2022?
[query_llm] What is Amelia's agenda on October 18th, 2022?
[query_llm] What is Elizabeth's agenda on January 10th, 2022?
[query_llm] Agenda details for February 10th, 2022, between 6:30 PM and 8:00 PM
[query_llm] What is Georgina's agenda on March 20th, 2022?
[query_llm] What is Hector's agenda on January 12th, 2022?
[query_llm] What is the agenda on June 21st, 2022?
[query_llm] What is Lucy's agenda on October 9th, 2022?
[query_llm] What is Lily's agenda on April 7th, 2022?
[query_l

Processing examples:  27%|██▋       | 16/60 [02:55<04:34,  6.23s/it] 

[query_llm] What events are scheduled on June 20th, 2022, between 9:00 AM and 3:00 PM?


Processing examples: 100%|██████████| 60/60 [03:00<00:00,  3.00s/it]


0.03333333333333333