### load data

In [2]:
import joblib
# test dataset used by retroformer and expel
hotpot_test = joblib.load('hotpot-qa-distractor-sample.joblib').reset_index(drop = True)

In [3]:
import dspy
import os
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# colbert = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

dspy.settings.configure(
    lm=dspy.OpenAI(
        model="gpt-4o", # "gpt-4o"
        api_key=os.getenv("OPENAI_API_KEY"),
        max_tokens=2048,
        temperature=0
    )
)

dataset = HotPotQA(train_seed=1, train_size=100, dev_size=0, test_size=0)
trainset = [x.with_inputs('question') for x in dataset.train]
test_set = [
    dspy.Example(question=hotpot_test.iloc[i]['question'], answer=hotpot_test.iloc[i]['answer']).with_inputs("question", "paper_id")
    for i in range(len(hotpot_test))
]

# show an example datapoint; it's just a question-answer pair
trainset[0], test_set[0]

(Example({'question': 'At My Window was released by which American singer-songwriter?', 'answer': 'John Townes Van Zandt'}) (input_keys={'question'}),
 Example({'question': "VIVA Media AG changed it's name in 2004. What does their new acronym stand for?", 'answer': 'Gesellschaft mit beschränkter Haftung'}) (input_keys={'paper_id', 'question'}))

### ReAct

In [4]:
# from langchain_core.tools import Tool
# from dspy.predict.parameter import Parameter
# from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper


# class WebSearch(Parameter):
#     name = "WEB_SEARCH"
#     input_variable = "query"
#     desc = "If you have a question, you can use this tool to search the web for the answer."

#     def forward(
#         self,
#         query_or_queries):
#         return GoogleSerperAPIWrapper().run(query_or_queries)

# # set attribute input_variable to google_tool
# agent = dspy.ReAct("question -> answer", tools=[dspy.Retrieve(k=3), WebSearch()])

In [5]:
# config = dict(num_threads=8, display_progress=True, display_table=5)
# evaluate = Evaluate(test_set=test_set, metric=dspy.evaluate.answer_exact_match, **config)

# evaluate(agent)

### AvaTaR

In [4]:
class HotPotQASignature(dspy.Signature):
    """You will be given a question. Your task is to answer the question."""
    
    question: str = dspy.InputField(
        prefix="[[Question]]: ",
        desc="question to ask",
        format=lambda x: x.strip(),
    )
    rationale: str = dspy.OutputField(
        prefix="[[Rationale]]: ",
        desc="Explanation or supporting evidence for the answer",
    )
    answer: str = dspy.OutputField(
        prefix="[[Answer]]: ",
        desc="A short and direct answer (less than 10 words) to the question",
    )


In [5]:
from dspy.predict.avatar import Tool, Avatar
from langchain_community.utilities import GoogleSerperAPIWrapper, ArxivAPIWrapper, WikipediaAPIWrapper
from langchain.tools import BaseTool, StructuredTool, tool
from langchain_community.tools import WikipediaQueryRun


def retrieve_from_wiki(query: str) -> str:
    """If you have a question or name to lookup, this tool uses wikipedia search for the answer."""
    return WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()).run(query)


tools = [
    Tool(
        tool=StructuredTool.from_function(retrieve_from_wiki),
        name="Wiki_SEARCH",
        desc="If you have a question or name to lookup, this tool uses wikipedia search for the answer."
    ),
    Tool(
        tool=GoogleSerperAPIWrapper(),
        name="WEB_SEARCH",
        desc="If you have a question, this tool uses google search for the answer."
    )
]


In [6]:
avatar_agent = Avatar(
    tools=tools,
    signature=HotPotQASignature,
    verbose=False,
    max_iters=10
)

Please use standard predictors, e.g. dspy.Predict and dspy.ChainOfThought.
They now support type annotations and other features of TypedPredictors and tend to work much better out of the box.
Please let us know if you face any issues: https://github.com/stanfordnlp/dspy/issues


## Evaluation

Open enden QA tasks are hard to evaluate on rigid metrics like exact match. So, we'll be using an improvised LLM as Judge for the evaluation of our model on test set.

In [7]:
import re, string

class Evaluator(dspy.Signature):
    """Please act as an impartial judge to evaluate whether the answer is correct based on the ground truth answer"""
    
    question: str = dspy.InputField(
        prefix="Question:",
        desc="question to ask",
    )
    reference_answer: str = dspy.InputField(
        prefix="Ground truth answer:",
        desc="Ground truth answer to the question.",
    )
    answer: str = dspy.InputField(
        prefix="Answer:",
        desc="Answer to the question given by the model.",
    )
    rationale: str = dspy.OutputField(
        prefix="Rationale:",
        desc="Explanation of why the model's answer is correct or incorrect according to the ground truth answer.",
    )

evaluator = dspy.TypedPredictor(Evaluator)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)
    
    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def EM(answer, key) -> bool:
    return normalize_answer(answer) == normalize_answer(key)

def metric(example, prediction, trace=None):
    result = EM(prediction.answer.lower().strip(), example.answer.lower().strip())
    print(prediction.answer.lower().strip(), '|', example.answer.lower().strip(), '=> ', result)
    return result


For evaluation we can't use `dspy.Evaluate`, reason being that `Avatar` changes it's signature per iteration by adding the actions and it's results to it as fields. So we can create our own hacky thread safe evaluator for it.

In [8]:
import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "False"

from concurrent.futures import ThreadPoolExecutor

def process_example(example, signature):
    try:
        avatar = Avatar(
            signature,
            tools=tools,
            verbose=False,
            max_iters=10
        )
        prediction = avatar(**example.inputs().toDict())

        return metric(example, prediction)
    except Exception as e:
        print(e)
        return 0

# process_example(tool_qa[0], ToolQASignature)
def multi_thread_executor(test_set, signature, num_threads=100):
    total_score = 0
    total_examples = len(test_set)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_example, example, signature) for example in test_set]

        for future in tqdm.tqdm(futures, total=total_examples, desc="Processing examples"):
            total_score += future.result()

    avg_metric = total_score / total_examples
    return avg_metric


In [29]:
score = multi_thread_executor(test_set, HotPotQASignature)

 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb
 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb
Processing examples:   0%|          | 0/100 [00:00<?, ?it/s]

alexander mcnair was from missouri. | bath, maine =>  False
stedelijk museum | stedelijk museum amsterdam =>  False
aloe vera of america | aloe vera of america =>  True
california | california =>  True
read it and weep | "read it and weep" (2006) =>  False
enigma with the song "amen". | enigma =>  False
"shukratara mand vara", "shapat tula aahe" | shukratara =>  False
lester | lester =>  True
minnesota | minnesota =>  True
german culture | german =>  False
2016 cannes film festival | 69th cannes film festival =>  False
sarah kerrigan | sarah kerrigan =>  True
creature comforts | creature comforts =>  True
baltimore, maryland, u.s. | the port of baltimore west to sandy hook =>  False
bhaktivedanta manor. | in the village of aldenham =>  False
engineering | engineering =>  True


Processing examples:   1%|          | 1/100 [00:17<28:27, 17.25s/it]

albany | new york =>  False
1978 | 1978 =>  True
oklahoma sooners | oklahoma sooners =>  True
1946 | 1927 =>  False
gmbh | gesellschaft mit beschränkter haftung =>  False
teen titans go! | teen titans go! =>  True
duck dance | the twist =>  False
m. night shyamalan | m. night shyamalan =>  True
savannah, georgia is not in europe. | lacoste, france =>  False
allegiant stadium | sam boyd stadium =>  False
the dark tower series by stephen king | the dark tower =>  False
2004 | 2004 =>  True


Processing examples:   3%|▎         | 3/100 [00:18<06:47,  4.20s/it]

marktown | marktown =>  True
jonny craig | jonny" craig =>  True
anne perry | anne perry =>  True
shakespeare's works include 39 plays, 154 sonnets, and poems. | chronological collection of critical quotations =>  False
captain hans geering | captain hans geering =>  True
sir patrick vallance | frederick alexander =>  False
bluegrass airport | blue grass airport =>  False
oneida limited | oneida limited =>  True
cannot determine without specific information on rock nominees ltd. | cleaning, catering and security =>  False
2 march 1972 | 2 march 1972 =>  True
loughborough university | loughborough university =>  True
"you're next" was filmed first. | you're next =>  False
discovery zone | discovery zone =>  True
2011 pulitzer prize in nonfiction. | pulitzer prize =>  False
adam levine | adam levine =>  True
juliet capulet | tybalt =>  False
the battle of white plains | new york and new jersey campaign =>  False
papa gino's | papa gino's =>  True
ricky rubio | ricard rubio i vives =>  Fa

Processing examples:   5%|▌         | 5/100 [00:26<06:33,  4.14s/it]

swoop the eagle and dooley the skeleton. | the bears =>  False
love and theft | love and theft =>  True
musicians and songwriters. | singer, songwriter =>  False
1981 | 1983 =>  False
chinese coffee | chinese coffee =>  True
empire distribution is based in the united states. | san francisco, california =>  False
women's interest magazines. | fortnightly women interest magazine =>  False
the bad hemingway contest | the bad hemingway contest =>  True
indiana university | indiana university =>  True
seven schools. | six =>  False
roman empire | roman =>  False
muhammad ali fought jimmy ellis. | jimmy ellis =>  False
matt kemp | matthew ryan kemp =>  False
authors | novelist =>  False
cannot determine from provided data. | matt groening =>  False
molly hatchet. | molly hatchet =>  True
no, only maxillaria is a genus of orchids. | no =>  False
square enix | crystal dynamics =>  False
steve perry | ronald shusett =>  False
no, one is american, one is french. | no =>  False
daredevil | darede

Processing examples:   6%|▌         | 6/100 [00:34<08:24,  5.37s/it]

velvetpark magazine. | velvetpark =>  False
austrian chancellor engelbert dollfuss | a failed coup attempt =>  False
j.r.r. tolkien | j. r. r. tolkien =>  False
magical english nanny | fictional character =>  False


Processing examples:  42%|████▏     | 42/100 [00:38<00:24,  2.35it/s]

stone brewing | stone brewing =>  True
1998 | 1998 =>  True
grafton monster | dewey lake monster =>  False


Processing examples:  47%|████▋     | 47/100 [00:43<00:27,  1.90it/s]

no, they are not both based in massachusetts. | no =>  False
novelists | author =>  False


Processing examples: 100%|██████████| 100/100 [01:07<00:00,  1.48it/s]

to squarepants or not to squarepants | to squarepants or not to squarepants =>  True





In [30]:
print(f"Average Score before optimization: {score:.2f}")

Average Score before optimization: 0.40


## Optimization


In [9]:
from dspy.teleprompt import AvatarOptimizer
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

teleprompter = AvatarOptimizer(
    metric=metric,
    max_iters=1,
    max_negative_inputs=20,
    max_positive_inputs=20,
)

In [10]:
avatar_agent = teleprompter.compile(
    student=avatar_agent,
    trainset=trainset
)

 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb
 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb


Iteration 1/1


Processing examples:   0%|          | 0/100 [00:00<?, ?it/s]

belgium, europe | brussels =>  False
jeremy paxman was born in 1950. | 1950 =>  False
rosario dawson | rosario dawson =>  True
kerry condon | kerry condon =>  True
1979 | 1979 =>  True
bruno senna | bruno senna lalli =>  False
tae kwon do times | tae kwon do times =>  True
the shadows | the shadows =>  True
21 people in 2003. | 7 =>  False
clydesdale horse | clydesdales =>  False
douglas douglas-hamilton, 14th duke of hamilton | douglas douglas-hamilton, 14th duke of hamilton =>  True
tea-chest bass | tea-chest bass =>  True
jungle jim | jungle jim =>  True
big machine records | big machine records =>  True
james belushi, danny devito, charlie day, hugh laurie, billy bob thornton. | bill murray =>  False
the last shadow puppets had a longer hiatus. | the last shadow puppets =>  False
joe torre was born in 1940. | 1940 =>  False
50 million years old. | 50-million-year-old fossil record =>  False
space. | space =>  True
gustavo kuerten | gustavo kuerten =>  True
operation citadel (untern

Processing examples:   1%|          | 1/100 [00:08<14:14,  8.63s/it]

2005 | 2005 =>  True
pond hockey was created first. | pond hockey =>  False
townes van zandt | john townes van zandt =>  False
1882 | 1874 =>  False
scott hayden | scott hayden =>  True
moravian and slavic, including eastern european folk music. | moravian and other slavic folk music =>  False
fred hoiberg | fred hoiberg =>  True
enoch "nucky" johnson | enoch lewis "nucky" johnson =>  False
hunting purposes. | hunting =>  False
december 2009 for ios and maemo devices. | new zealand =>  False
aleem dar | aleem sarwar dar =>  False
17 january 1991 | on 17 january 1991 =>  False
atlantic ocean | atlantic =>  False
quatre polichinelles semblables | commedia dell'arte =>  False
boston bruins | providence bruins =>  False
variable oystercatcher, red-breasted dotterel, silver gull. | bryde's whales =>  False
bryan lee vera is older. | bryan lee vera =>  False
2010 | 2010 =>  True
baseball, football, and basketball. | rugby =>  False
waltz king | the waltz king =>  True
einstein–rosen bridge (

Processing examples:   3%|▎         | 3/100 [00:16<08:30,  5.26s/it]

georgia | georgia =>  True
selma lagerlöf | selma ottilia lovisa lagerlöf =>  False
self was most recently published. | self =>  False
aleksandr danilovich aleksandrov is older. | aleksandr danilovich aleksandrov =>  False
asmodee was founded in 1995. | 1995 =>  False
colorado springs airport | colorado springs municipal airport =>  False
averroes lived longer than al-ghazali. | ibn rushd =>  False
mixed martial artists. | mixed martial arts fighter =>  False
margot robbie | margot elise robbie =>  False


Processing examples:  12%|█▏        | 12/100 [00:18<01:36,  1.10s/it]

nick at nite, hollywood heights. | nick at nite =>  False
taste (umami flavor) | taste =>  False
buena vista distribution | buena vista distribution =>  True


Processing examples:  13%|█▎        | 13/100 [00:18<01:25,  1.02it/s]

sports specialist school. | science, technology and teaching =>  False
sidney lumet was nominated for more. | 14 of his films were nominated =>  False
battle of milk creek, 1879. | battle of milk creek =>  False
film directors | film director =>  False


Processing examples:  14%|█▍        | 14/100 [00:20<01:26,  1.00s/it]

american | american =>  True
boy george, lead singer of culture club. | george alan o'dowd =>  False
skilled and exciting outfield for the dodgers. | "outfield of dreams" =>  False
no movie is based on both books. | darby o'gill and the little people =>  False
august 23, 1988 | august 23, 1988 =>  True
no, they play different types of music. | no =>  False
no campus city matches population 50,046. | roskilde =>  False
no actor stars in both. | noah taylor =>  False
the world is not enough artist released first. | madonna =>  False
information not found in available sources. | river thames =>  False
metal | metal =>  True
don williams | don williams =>  True
no known association with an alternative rock band. | push th' little daisies =>  False
pixar animation studios | pixar animation studios =>  True
the devil and max devlin was released first. | the devil and max devlin =>  False
the ocean blue was formed farther east. | the ocean blue =>  False


Processing examples:  16%|█▌        | 16/100 [00:22<01:34,  1.12s/it]

1895 | 2010 =>  False
screwball comedy and literary drama. | anti-mccarthyism =>  False
joakim noah tied with tyson chandler. | tyson chandler =>  False
general justo josé de urquiza airport (pra / saap) | aerolíneas argentinas =>  False
approximately 22,673. | 23,674 =>  False


Processing examples:  43%|████▎     | 43/100 [00:27<00:17,  3.30it/s]

bringing out the dead | bringing out the dead =>  True
iron and steel. | iron and steel =>  True
lubbock, texas | lubbock, texas =>  True
yes, both are prestigious. | yes =>  False
kapilvastu, a significant buddhist site. | lumbini =>  False
information not clearly available. | tim kennedy =>  False
purchase cost details are not publicly available. | $250,000 =>  False


Processing examples:  44%|████▍     | 44/100 [00:46<01:03,  1.13s/it]

information not found; further research needed. | 18 days =>  False


Processing examples: 100%|██████████| 100/100 [01:58<00:00,  1.19s/it]

information not found. | 1996 =>  False
Average Score: 0.32
Positive examples: 32
Negative examples: 68
Sampling 20 positive examples and 20 negative examples





Generated new instruction: New Instruction: You will be given `Tools`, which will be a list of resources to use to accomplish the `Goal`. Your task is to evaluate the nature of the user query and decide which tool or combination of tools to use, along with the input values to provide. For straightforward, well-defined questions, continue using Wiki_SEARCH and WEB_SEARCH effectively, ensuring that if one tool does not yield results, the other is utilized. For more complex or ambiguous queries, consider using WEB_SEARCH more frequently, as it may provide more current or comprehensive information. Additionally, incorporate analytical tools for queries that require comparison or synthesis of information from multiple sources.

Develop a decision-making process that includes a fallback mechanism. If the initial tool does not provide satisfactory results, automatically switch to an alternative tool. Implement a system to cross-verify information from multiple sources to ensure accuracy and c

In [11]:
score = multi_thread_executor(test_set, HotPotQASignature)

m. night shyamalan | m. night shyamalan =>  True


Processing examples:   0%|          | 0/100 [00:00<?, ?it/s]

no, they are not both american. | no =>  False
march 2, 1972 | 2 march 1972 =>  False
baltimore and ohio railroad, baltimore, maryland. | the port of baltimore west to sandy hook =>  False
minnesota | minnesota =>  True
no, neither is based in massachusetts. | no =>  False
no, they are from different countries. | no =>  False
1927 | 1927 =>  True
1983 | 1983 =>  True
stedelijk museum in amsterdam. | stedelijk museum amsterdam =>  False
"you're next" was filmed first. | you're next =>  False
raffaella reggi was born first. | raffaella reggi =>  False
sarah kerrigan in "starcraft 2" trilogy. | sarah kerrigan =>  False
hawaii county | hawaii county =>  True
marktown | marktown =>  True
creature comforts | creature comforts =>  True
oneida limited | oneida limited =>  True
shakespeare's works are highly influential and widely performed. | chronological collection of critical quotations =>  False
london | london =>  True
super-regional shopping mall in nashville, tennessee. | super-regional

Processing examples:   1%|          | 1/100 [00:19<32:59, 19.99s/it]

only maxillaria is a genus of orchids. | no =>  False
anne perry | anne perry =>  True
2011 pulitzer prize in general nonfiction finalist. | pulitzer prize =>  False
viva is not an acronym. | gesellschaft mit beschränkter haftung =>  False
german culture | german =>  False
jonny craig has been in more bands. | jonny" craig =>  False
san francisco | san francisco, california =>  False
no common services identified. | cleaning, catering and security =>  False
virginia | virginia =>  True
world war i. | the cold war (1947–91) =>  False
1998 | 1998 =>  True
yes, both are film directors. | yes =>  False
kxii serves pontotoc county, oklahoma. | kxii =>  False
matthew abraham groening | matt groening =>  False
j. r. r. tolkien | j. r. r. tolkien =>  True
both are novelists. | novelist =>  False
oklahoma sooners | oklahoma sooners =>  True
no documented fight in houston after frazier. | jimmy ellis =>  False


Processing examples:   7%|▋         | 7/100 [00:45<09:06,  5.87s/it]

creed disbanded in june 2004. | 2004 =>  False


Processing examples: 100%|██████████| 100/100 [00:47<00:00,  2.11it/s]

supply chain management | engineering =>  False





Now we can evaluate our actor module, for this we've provided an implementation of thread safe evaluator that we above as part of class method of `AvatarOptimizer`.

In [13]:
print(f"Average Score: {score:.2f}")

Average Score: 0.31
