In [34]:
!pip install beautifulsoup4 -q

In [35]:
!pip install pyserini==0.21.0 -q
!pip install faiss-cpu==1.7.2 -q

In [36]:
!pip install accelerate -q

In [37]:
!pip install bitsandbytes -q

# IIRC

In [38]:
!wget https://iirc-dataset.s3.us-west-2.amazonaws.com/context_articles.tar.gz -q
!wget https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_test.json -q
!tar -xf context_articles.tar.gz

## Data Preprocessing

In [39]:
import json

dev_data = json.load(open("/content/iirc_test.json", "r"))
context_articles = json.load(open("/content/context_articles.json", "r"))

In [40]:
from bs4 import BeautifulSoup
import json

# Load the JSON data
dev_data = json.load(open("/content/iirc_test.json", "r"))
context_articles = json.load(open("/content/context_articles.json", "r"))

documents = []
all_titles = []

# Gets content and title of passages
# Filters HTML tags using BeautifulSoup
for item in dev_data:
    if item['title'].lower() not in all_titles:
        documents.append({
                "title": BeautifulSoup(item['title'], 'html.parser').get_text().strip().lower(),
                "contents": BeautifulSoup(item["text"], 'html.parser').get_text().strip().lower()
            }
        )
        all_titles.append(BeautifulSoup(item["title"], 'html.parser').get_text().strip().lower())

    # Get content from question links in each question
    for question in item["questions"]:
        for link in question["question_links"]:
            if link.lower() in context_articles and link.lower() not in all_titles:
                documents.append({
                    "title": BeautifulSoup(link, 'html.parser').get_text().strip().lower(),
                    "contents": BeautifulSoup(context_articles[link.lower()], 'html.parser').get_text().strip().lower()
                })
                all_titles.append(BeautifulSoup(link, 'html.parser').get_text().strip().lower())

  "title": BeautifulSoup(link, 'html.parser').get_text().strip().lower(),
  all_titles.append(BeautifulSoup(link, 'html.parser').get_text().strip().lower())


In [41]:
print(documents[0])
print(documents[1])
print(documents[2])
print(documents[3])
print(documents[4])
print(documents[5])

{'title': 'palici', 'contents': "the palici (παλικοί in greek), or palaci, were a pair of indigenous sicilian chthonic deities in roman mythology, and to a lesser extent in greek mythology. they are mentioned in ovid's metamorphoses v, 406, and in virgil's aeneid ix, 585. their cult centered on three small lakes that emitted sulphurous vapors in the palagonia plain, and as a result these twin brothers were associated with geysers and the underworld. there was also a shrine to the palaci in palacia, where people could subject themselves or others to tests of reliability through divine judgement; passing meant that an oath could be trusted. the mythological lineage of the palici is uncertain; one legend made the palici the sons of zeus, or possibly hephaestus, by aetna or thalia, but another claimed that the palici were the sons of the sicilian deity adranus."}
{'title': 'zeus', 'contents': 'zeus (british english , north american english ; , zeús ) is the sky and thunder god in ancient g

In [42]:
def get_sliding_window(doc: dict, words_per_window: int, words_overlap: int) -> list:
    """
    Returns a list of texts with 'words_per_window' words overlapping 'words_overlap' words.
    Args:
        doc {title, contents}: the doc to be split
        words_per_window: number of words per window
        words_overlap: number of words to overlap
    Returns:
        A list of contents (string)
    """
    assert words_per_window > words_overlap, "words_per_window should be greater than words_overlap"
    words = doc["contents"].split(" ")
    windows = []
    for i in range(0, len(words), words_per_window - words_overlap):
        if i + words_per_window >= len(words):
            windows.append(" ".join(words[i:]))
            break
        window = " ".join(words[i:i+words_per_window])
        windows.append(window)
    return windows

In [43]:
from tqdm.auto import tqdm
import os
from uuid import uuid4

os.makedirs("iirc_docs", exist_ok=True)

# Creates a jsonl file with all contents to be indexed using the Pyserini's library
with open("iirc_docs/iirc_docs.jsonl", "w") as fout:
    count = 0
    for doc in tqdm(documents, desc="processing docs", total=len(documents)):
        for idx, window in enumerate(get_sliding_window(
            doc=doc,
            words_per_window=150,
            words_overlap=75
        )):
            doc_dict = {"id": count, "contents": window, "title": doc["title"]}
            count += 1
            fout.write(json.dumps(doc_dict) + "\n")

processing docs:   0%|          | 0/2028 [00:00<?, ?it/s]

## Questions and Gold data

In [44]:
import re

def clean_string(input_string):
    # Convert the string to lowercase
    lowercase_string = input_string.lower()
    # Remove multiple spaces with a single space
    cleaned_string = re.sub(r'\s+', ' ', lowercase_string)
    # Remove punctuation marks using regular expressions
    cleaned_string = re.sub(r'[^\w\s]', '', cleaned_string)

    return cleaned_string

def get_answer(answer):
    if answer["type"] == "span":
        return clean_string(answer["answer_spans"][0]["text"])
    elif answer["type"] == "value":
        return clean_string(f"{answer['answer_value']} {answer['answer_unit']}")
    elif answer["type"] == None:
        return "not enough information"
    elif answer["type"] == "binary":
        return clean_string(answer["answer_value"])

In [45]:
# Contains tuples with (question, gold_answer)
evaluation_dataset = []

for passage in dev_data:
    for question in passage["questions"]:
        query = question["question"]
        answer = get_answer(question["answer"])
        if answer != "not enough information" and answer is not None:
            evaluation_dataset.append((query, answer, passage["text"]))

In [46]:
with open("evaluation_dataset.json", "w") as fout:
    json.dump(evaluation_dataset, fout)

In [47]:
# Expand bm25 jsonl with main texts
from uuid import uuid4

known_texts = set()
with open("iirc_docs/iirc_docs.jsonl", "a") as fout:
    for _, _, passage in evaluation_dataset:
        hashed_text = hash(passage)
        if hashed_text not in known_texts:
            known_texts.add(hashed_text)
            fout.write(json.dumps({"id": uuid4().hex, "contents": passage, "title": "Main Passage"}) + "\n")

In [48]:
evaluation_dataset = evaluation_dataset[:50] # First 50 examples with answer

## BM25 Index

In [49]:
# Creates the BM25 index with Pyserini
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input iirc_docs \
  --index iirc \
  --language en\
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors --storeRaw

2024-06-20 02:18:05,562 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2024-06-20 02:18:05,564 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2024-06-20 02:18:05,565 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: iirc_docs
2024-06-20 02:18:05,565 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2024-06-20 02:18:05,567 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2024-06-20 02:18:05,568 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 1
2024-06-20 02:18:05,568 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2024-06-20 02:18:05,568 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2024-06-20 02:18:05,571 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Keep stopword

# Evaluation pipeline

## LLM

In [50]:
from google.colab import userdata

api_key = userdata.get("OPENAI_KEY")
groq_key = userdata.get("GROQ_KEY")

In [51]:
import requests
import time

def get_llm_response(prompt: str, system_prompt: str = None, use_openai: bool = False):
    """
    Send a prompt to ChatGPT and get its answer.
    Args:
        prompt (str): a string containing the prompt
        system_prompt (str): a string containing the system prompt
        use_openai (bool): whether to use openai or groq
    Returns:
        The answer and the request cost
    """
    for _ in range(15):
        try:
            data = {
                "model": "gpt-3.5-turbo" if use_openai else "llama3-70b-8192",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "top_p": 1
            }
            if system_prompt:
                data["messages"].insert(0, {"role": "system", "content": system_prompt})
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key if use_openai else groq_key}"
            }
            if use_openai:
                response = requests.post(
                    "https://api.openai.com/v1/chat/completions",
                    json=data,
                    headers=headers
                )
            else:
                response = requests.post(
                    "https://api.groq.com/openai/v1/chat/completions",
                    json=data,
                    headers=headers
                )
            if not response.ok:
                time.sleep(3)
                continue
            response = response.json()
            if use_openai:
                cost = 0.5e-6 * response["usage"]["prompt_tokens"]
                cost += 1.5e-6 * response["usage"]["completion_tokens"]
            else:
                cost = 0
            return response["choices"][0]["message"]["content"].strip().lower(), cost
        except Exception as e:
            raise("Error processing llm request:", e)

## Reranker - MonoT5

In [52]:
from math import exp
from typing import List

import torch
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BatchEncoding,
    AutoModelForSeq2SeqLM
)

class T5Ranker():
    def __init__(self, model_name_or_path: str, fp8: bool = False):
        """
        Loads the T5 model from the given path.
        Args:
            model_name_or_path: path to the model
            fp8: whether the model should be loaded using FP8
        """
        self.name = model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        # The training was carried out using two specific tokens for relevant and non-relevant passages
        self.token_false_id = self.tokenizer.get_vocab()['▁false']
        self.token_true_id  = self.tokenizer.get_vocab()['▁true']

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Loads the model with model_args
        model_args = {}
        if fp8:
            model_args["torch_dtype"] = torch.float16
            model_args["load_in_8bit"] = True

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, **model_args).to(self.device)

    @torch.no_grad()
    def rescore(self, query: str, batch: List[str], batch_size: int = 32):
        """
        Adapted from Pygaggle's repo with added batch processing.
        Rescore all documents for the given query using smaller batches to save CUDA memory.

        Args:
            query: the query for ranking
            batch: list of passages for ranking
            batch_size: maximum size of each sub-batch to be processed

        Returns:
            List of scores for each document in the batch.
        """
        scores = []
        # Process the batch in chunks
        for i in range(0, len(batch), batch_size):
            sub_batch = batch[i:i + batch_size]
            queries_documents = [f"Query: {query} Document: {doc} Relevant:" for doc in sub_batch]
            tokenized = self.tokenizer(
                queries_documents,
                padding=True,
                truncation="longest_first",
                return_tensors="pt",
                max_length=512,
            ).to(self.device)

            input_ids = tokenized["input_ids"]
            attention_mask = tokenized["attention_mask"]

            # Assuming `greedy_decode` is a method that handles the model prediction
            _, batch_scores = self.greedy_decode(
                model=self.model,
                input_ids=input_ids,
                length=1,
                attention_mask=attention_mask,
                return_last_logits=True
            )

            # Extract logits for the relevant class
            batch_scores = batch_scores[:, [self.token_false_id, self.token_true_id]]
            batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
            batch_log_probs = batch_scores[:, 1].tolist()

            # Convert log probabilities to probabilities
            batch_probs = [torch.exp(torch.tensor(log_prob)).item() for log_prob in batch_log_probs]
            scores.extend(batch_probs)

        return scores

    @torch.no_grad()
    def greedy_decode(
        self,
        model,
        input_ids: torch.Tensor,
        length: int,
        attention_mask: torch.Tensor = None,
        return_last_logits: bool = True
    ):
        """
        Adapted from Pygaggle's repo.
        Performs the greedy_decode on t5's output.
        """
        decode_ids = torch.full((input_ids.size(0), 1),
                                model.config.decoder_start_token_id,
                                dtype=torch.long).to(input_ids.device)
        encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask)
        next_token_logits = None
        for _ in range(length):
            model_inputs = model.prepare_inputs_for_generation(
                decode_ids,
                encoder_outputs=encoder_outputs,
                past=None,
                attention_mask=attention_mask,
                use_cache=True)
            outputs = model(**model_inputs)  # (batch_size, cur_len, vocab_size)
            next_token_logits = outputs[0][:, -1, :]  # (batch_size, vocab_size)
            decode_ids = torch.cat([decode_ids,
                                    next_token_logits.max(1)[1].unsqueeze(-1)],
                                dim=-1)
        if return_last_logits:
            return decode_ids, next_token_logits
        return decode_ids

In [53]:
def get_searcher(model: str):
    if model == "monoT5_large":
        searcher = T5Ranker('castorini/monot5-large-msmarco-10k')
    elif model == "monoT5_base":
        searcher = T5Ranker('castorini/monot5-base-msmarco')
    elif model == "monoT5_small":
        searcher = T5Ranker('castorini/monot5-small-msmarco-10k')
    elif model == "inranker_base":
        searcher = T5Ranker('unicamp-dl/InRanker-base')
    elif model == "inranker_small":
        searcher = T5Ranker('unicamp-dl/InRanker-small')
    else:
        raise ValueError(f"Invalid model name: {model}")
    return searcher

## BM25 Search

In [54]:
from pyserini.search.lucene import LuceneSearcher

# Performs the BM25 search (initial retrieval)
bm25_searcher = LuceneSearcher('/content/iirc')

def get_passages(query, top_k):
    hits = searcher.search(query, top_k)
    return hits

## ReAct

In [55]:
system_prompt = """
You are designed to help with a variety of tasks, from answering questions
to providing summaries to other types of analyses.

## Tools
You have access to a wide variety of tools. You are responsible for using
the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools
to complete each subtask.

You have access to the following tools:
search_system: this is a useful tool you must use when looking to answer questions.

## Output Format
To answer the question, please use the following format.

```
Thought: I need to use a tool to help me answer the question.
Action: tool name (one of {tool_names}) if using a tool.
Action Input: the input to the search tool (the question to be searched)
```

Please ALWAYS start with a Thought.

If this format is used, the user will respond in the following format:

```
Observation: tool response
```

You should keep repeating the above format until you have enough information
to answer the question without using any more tools. At that point, you MUST respond
in the one of the following two formats:

```
Thought: I can answer without using any more tools.
Answer: [your answer here]
```

 Here I present you with an example of conversation of a human and your messages:
User: Is it legal for a licensed child driving Mercedes-Benz to be employed in US?

Assistant:
Thought: I need to use my search_system to look for two answers: "What is the minimum driving age in the US?"

Action Input: What is the minimum driving age in the US?

Observation: the minimum age for driving is 18 years old.

Thought: I have the answer for one of the question, therefore I don't need to rephrase it. Let's look to the other one:

Action: search_system

Action Input: What is the minimum age for someone to be employed in the US?

Observation: the minimum age to have a job is 18 years.

Thought: I have both answers, let's answer the original question:

Action: answer

Answer: Yes, it is possible since they occur at the same age.

## Current Conversation
Below is the current conversation consisting of interleaving human and assistant messages.
Please respond with just one action per time: (thought, action, action input, answer)
""".strip()

In [56]:
search_prompt = r"""
Given a list of documents, try to answer a question exclusively based on the documents.
You CANNOT use your own memory, you should use the available tools, You should always use different questions each time
Query: {query}
Documents:
{documents}
Answer:
""".strip()

In [58]:
import re

def get_search_answer(
    query: str,
    searcher,
    use_openai=True,
    top_k_bm25: int = 50,
    top_k_ranker: int = 1,
) -> tuple[str, float]:
    """
    Returns the answer to a question
    Args:
        query (str): The question to answer
        searcher (Searcher): The searcher to use
        use_openai (bool): Whether to use OpenAI or Llama3-70b
        top_k_bm25 (int): The number of passages to retrieve from BM25
        top_k_ranker (int): The number of passages to retrieve from the ranker
    Returns:
        tuple[str, float]: The answer and the cost of the query
    """
    hits = bm25_searcher.search(query, top_k_bm25)
    paragraphs = [json.loads(hit.raw) for hit in hits]
    if searcher is not None:
        scores = searcher.rescore(query, [p["contents"] for p in paragraphs])
        scores_paragraphs = list(zip(paragraphs, scores))
        sorted_paragraphs = sorted(scores_paragraphs, key=lambda x: x[1], reverse=True)
    else:
        sorted_paragraphs = [(p, None) for p in paragraphs]
    documents_string = ""
    for idx, (doc, _) in enumerate(sorted_paragraphs[:top_k_ranker]):
        documents_string += f"Document {idx} - {doc['title']}:\n{doc['contents']}\n\n"
    documents_string = documents_string.replace("{", "{{").replace("}", "}}")
    prompt = search_prompt.format(query=query, documents=documents_string)
    answer, cost = get_llm_response(prompt, use_openai=True)
    return answer, cost

def parse_llm_answer(input_string):
    result = {}
    tags = ["thought", "action input", "answer", "action"]
    for tag in tags:
        pattern = rf'{tag}: (.*?)(?=\n|$)'
        match = re.search(pattern, input_string, re.DOTALL)
        if match:
            result[tag] = match.group(1).strip()
    if "answer" in result:
        return {"answer": result["answer"]}
    elif "action input" in result:
        return {"action_input": result["action input"]}
    else:
        print("NONE:", input_string)
        return None

def get_react_answer(query, searcher, use_openai=True, max_react_attempts: int = 7):
    """
    Returns the answer to a question using a ReAct agent.
    Args:
        query (str): The question to answer
        use_openai (bool): Whether to use OpenAI or Llama3-70b
        max_react_attempts (int): The maximum number of attempts to answer the question
    Returns:
        tuple[str, float]: The answer and the cost of the query
    """
    react_attempts = 0
    total_cost = 0
    search_logs = []
    prompt = "Query: " + query
    while react_attempts < max_react_attempts:
        llm_answer, cost = get_llm_response(prompt, system_prompt, use_openai=True)
        total_cost += cost
        parsed_answer = parse_llm_answer(llm_answer)
        if parsed_answer is not None:
            key, value = list(parsed_answer.items())[0]
            if key == "answer":
                return value, total_cost, search_logs
            elif key == "action_input":
                # if type(value) != dict:
                #     prompt += f"Thought: {value}\nAnswer:"
                #     continue
                search_answer, cost = get_search_answer(value, use_openai=True, searcher=searcher)
                search_logs.append({
                    "query": value,
                    "context": search_answer
                })
                total_cost += cost
                prompt += f"Observation: {search_answer}"
            else:
                prompt += f"Thought:"
        react_attempts += 1
        if react_attempts == max_react_attempts:
            prompt += "thought: I MUST answer the question with the informations I have so far.\n answer:"
            llm_answer, cost = get_llm_response(prompt, system_prompt, use_openai=True)
            total_cost += cost
            parsed_answer = parse_llm_answer(llm_answer)
            if parsed_answer is not None:
                print("MAX REACHED", parsed_answer)
                return parsed_answer.get("answer"), total_cost, search_logs
            return parsed_answer, total_cost, search_logs
    return None, total_cost, search_logs

## Evaluation + metrics

In [59]:
from tqdm.auto import tqdm

# evaluation_dataset: (tuple) => (query, gold_answer, text)
searcher_list = [
    #"bm25",
    #"monoT5_small",
    #"inranker_small",
    #"monoT5_base",
    #"inranker_base",
    "monoT5_large",
]

total_cost = 0
for searcher_name in searcher_list:
    if searcher_name == "bm25":
        searcher = None
    else:
        searcher = get_searcher(searcher_name)
    print(f"Using {searcher_name}..")
    scores = []
    with tqdm(evaluation_dataset[:50], desc="Processing") as pbar:
        with open(f"output_{searcher_name}.jsonl", "w", encoding="utf8") as fout:
            for query, gold, _ in pbar:
                pred, cost, search_logs = get_react_answer(query, searcher, use_openai=True)
                total_cost += cost
                pbar.set_postfix({"Total Cost": total_cost})
                print(f"Query: {query}")
                print(f"Gold: {gold}")
                print(f"Pred: {pred}")
                data = {
                    "query": query,
                    "gold_answer": gold,
                    "pred_answer": pred,
                    "rounds": len(search_logs),
                    "search_logs": search_logs
                }
                fout.write(json.dumps(data) + "\n")

print(f"Processing total cost: ${total_cost}")

Using monoT5_large..


Processing:   0%|          | 0/50 [00:00<?, ?it/s]

Query: What is Zeus know for in Greek mythology?
Gold: sky and thunder god
Pred: zeus is known for being the sky and thunder god in greek mythology, ruling as the king of the gods on mount olympus.
NONE: thought: i need to calculate the time difference between the end of the first world war and when messe was named aide-de-camp in 1923.

action: calculate the time difference
Query: How long had the First World War been over when Messe was named aide-de-camp?
Gold: 5 years
Pred: giovanni messe became aide-de-camp to king victor emmanuel iii in 1923, which was 5 years after the end of the first world war in 1918.
Query: How old was Messe when the First World War started?
Gold: 30 years
Pred: messe was around 25 years old when the first world war started in 1914.
Query: How long had Angela Scoular been acting professionally when she appeared in the movie "On Her Majesty's Secret Service"?
Gold: 2 years
Pred: angela scoular had been acting professionally for 2 years when she appeared in th

# Evaluation

In [60]:
pred_eval_prompt = """
Given a question, the correct answer and the predicted answer, verify whether the predicted answer is correct.
question: {question}
correct_answer: {correct_answer}
predicted_answer: {predicted_answer}
Answer (true/false):
"""

system_context_relevance = """
You are a helpful assistant. You will be presented with a text and a question. Your role is
to extract a Python list with the sentences that should be used to answer the question.
"""

prompt_context_relevance = """
Text: {{text}}
Question: {{question}}
Sentences (answer with just a Python list):
""".strip()

In [61]:
import spacy

def get_pred_score(question, correct_answer, predicted_answer):
    response, _ = get_llm_response(pred_eval_prompt.format(question=question, correct_answer=correct_answer, predicted_answer=predicted_answer))
    if response.strip().lower() not in ["true", "false"]:
        return False
    return response.strip().lower() == "true"

def break_text_into_sentences(text: str):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

def get_context_relevance(context, question):
    num_sents = len(break_text_into_sentences(context))
    prompt = prompt_context_relevance.replace("{{text}}", context).replace("{{question}}", question)
    response, _ = get_llm_response(prompt, system_context_relevance, use_openai=True)
    try:
        sents = eval(response)
    except:
        try:
            new_prompt = f"Answer with only the sentences from the text: {response}"
            response, _ = get_llm_response(new_prompt, use_openai=True)
            print(response)
            sents = eval(response)
        except Exception as e:
            sents = []
            print("error getting context relevance", e)
    return len(sents)/num_sents

In [62]:
searcher_list = [
    #"bm25",
    #"monoT5_small",
    #"inranker_small",
    #"monoT5_base",
    #"inranker_base",
    "monoT5_large",
]

metrics = {}
for searcher_name in searcher_list:
    with open(f"output_{searcher_name}.jsonl", "r") as fin:
        for line in tqdm(fin, desc="evaluating"):
            data = json.loads(line)
            query = data["query"]
            gold = data["gold_answer"]
            pred = data["pred_answer"]
            for item in data["search_logs"]:
                if not metrics.get(query):
                    metrics[query] = {}
                if metrics[query].get("context_relevance"):
                    metrics[query]["context_relevance"] += get_context_relevance(item["context"], query)/len(data["search_logs"])
                else:
                    metrics[query]["context_relevance"] = get_context_relevance(item["context"], query)/len(data["search_logs"])
            if not metrics.get(query):
                metrics[query] = {}
            metrics[query]["correct"] = get_pred_score(query, gold, pred)
            metrics[query]["num_rounds"] = len(data["search_logs"])
    # Calculate average values
    num_rounds = 0
    correct = 0
    total = 0
    total_context_relevance = 0
    context_relevance = 0
    for value in metrics.values():
        num_rounds += int(value["num_rounds"])
        correct += int(value["correct"])
        total += 1
        if value.get("context_relevance"):
            total_context_relevance += 1
            context_relevance += float(value["context_relevance"])
    print("RESULTS", searcher_name)
    print("Average number of rounds:", num_rounds/total)
    print("Average correct:", correct/total)
    print("Average context relevance:", context_relevance/total_context_relevance)
    print()

evaluating: 0it [00:00, ?it/s]

"brunt returned to action after a torn acl in front of 26,688 at the hawthorns."
["donald jerome branby was selected as a first-team player on the 1952 college football all-america teams by the associated press."]
["suh yun-bok won the boston marathon in 1947."]
RESULTS monoT5_large
Average number of rounds: 2.18
Average correct: 0.54
Average context relevance: 1.8284742468415935

