In [None]:
!pip install beautifulsoup4 -q

In [None]:
!pip install pyserini==0.21.0 -q
!pip install faiss-cpu==1.7.2 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for nmslib (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install accelerate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install bitsandbytes -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

# IIRC

In [None]:
# Download the IIRC context-articles archive and the test split (quietly),
# then unpack the archive; presumably it yields the context_articles.json
# that is loaded in the next cell (confirm against the archive contents).
!wget https://iirc-dataset.s3.us-west-2.amazonaws.com/context_articles.tar.gz -q
!wget https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_test.json -q
!tar -xf context_articles.tar.gz

## Data Preprocessing

In [None]:
import json

# Load the IIRC split and the linked context articles. Context managers make
# sure both file handles are closed as soon as parsing finishes.
# Note: the variables are named dev_* but the file read here is iirc_test.json.
with open("/content/iirc_test.json", "r") as fin:
    dev_data = json.load(fin)
dev_sample = dev_data[:150] # get 150 examples (we'll only use 50 later)
with open("/content/context_articles.json", "r") as fin:
    context_articles = json.load(fin)

In [None]:
from bs4 import BeautifulSoup

# Adapted from Visconde: https://github.com/neuralmind-ai/visconde
def _html_to_text(raw_html):
    """Strip HTML tags with BeautifulSoup and return trimmed, lower-cased text."""
    return BeautifulSoup(raw_html, 'html.parser').get_text().strip().lower()


documents = []
all_titles = []
# Set mirror of all_titles: O(1) membership tests instead of scanning the list.
_seen_titles = set()

# Gets content and title of passages
# Filters HTML tags using beautifulSoup
for item in dev_sample:
    # Deduplicate on the *cleaned* title, i.e. the same form that is stored.
    main_title = _html_to_text(item['title'])
    if main_title not in _seen_titles:
        documents.append({
            "title": main_title,
            "contents": _html_to_text(item["text"])
        })
        all_titles.append(main_title)
        _seen_titles.add(main_title)
    # Get content from related links
    for link in item["links"]:
        # context_articles is looked up with the raw lower-cased link target.
        article_key = link['target'].lower()
        if article_key not in context_articles:
            continue
        link_title = _html_to_text(link['target'])
        if link_title in _seen_titles:
            continue
        documents.append({
            "title": link_title,
            "contents": _html_to_text(context_articles[article_key])
        })
        all_titles.append(link_title)
        _seen_titles.add(link_title)

  "title": BeautifulSoup(link['target'], 'html.parser').get_text().strip().lower(),
  all_titles.append(BeautifulSoup(link['target'], 'html.parser').get_text().strip().lower())


In [None]:
def get_sliding_window(doc: dict, words_per_window: int, words_overlap: int) -> list:
    """
    Split a document's contents into overlapping windows of words.

    Words are obtained by splitting on single spaces. Consecutive windows
    start `words_per_window - words_overlap` words apart; the last window
    takes whatever words remain.

    Args:
        doc {title, contents}: the doc to be split
        words_per_window: number of words per window
        words_overlap: number of words shared by consecutive windows
    Returns:
        A list of window texts (strings)
    """
    assert words_per_window > words_overlap, "words_per_window should be greater than words_overlap"
    tokens = doc["contents"].split(" ")
    total = len(tokens)
    stride = words_per_window - words_overlap
    chunks = []
    start = 0
    while start < total:
        end = start + words_per_window
        if end >= total:
            # Final window: keep the whole tail and stop.
            chunks.append(" ".join(tokens[start:]))
            break
        chunks.append(" ".join(tokens[start:end]))
        start += stride
    return chunks

In [None]:
from tqdm.auto import tqdm
import os
from uuid import uuid4

os.makedirs("iirc_docs", exist_ok=True)

# Dump every sliding window as one JSON line so Pyserini can index the folder.
# `count` is a running integer used as the id of each window.
count = 0
with open("iirc_docs/iirc_docs.jsonl", "w") as fout:
    for doc in tqdm(documents, desc="processing docs", total=len(documents)):
        doc_windows = get_sliding_window(doc=doc, words_per_window=150, words_overlap=75)
        for window in doc_windows:
            doc_dict = {"id": count, "contents": window, "title": doc["title"]}
            fout.write(json.dumps(doc_dict) + "\n")
            count += 1

processing docs:   0%|          | 0/2164 [00:00<?, ?it/s]

## BM25 Index

In [None]:
# Creates the BM25 index with Pyserini
# Reads the JSONL collection from ./iirc_docs and writes the Lucene index to ./iirc.
# --storeRaw keeps the raw JSON of each window; the search code later reads it
# back through hit.raw (presumably this flag is what makes that available).
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input iirc_docs \
  --index iirc \
  --language en\
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors --storeRaw

2024-06-12 22:00:43,607 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2024-06-12 22:00:43,615 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2024-06-12 22:00:43,621 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: iirc_docs
2024-06-12 22:00:43,622 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2024-06-12 22:00:43,622 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2024-06-12 22:00:43,623 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 1
2024-06-12 22:00:43,624 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2024-06-12 22:00:43,624 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2024-06-12 22:00:43,625 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Keep stopword

## Questions and Gold data

In [None]:
import re

def clean_string(input_string):
    """
    Normalize a string for answer comparison.

    Lower-cases the text, removes punctuation (anything that is neither a
    word character nor whitespace), then collapses whitespace runs into a
    single space and trims both ends.

    Punctuation is removed *before* whitespace is collapsed: doing it the
    other way round leaves double spaces behind (e.g. "a - b" -> "a  b").

    Args:
        input_string (str): the text to normalize
    Returns:
        str: the normalized text
    """
    lowercase_string = input_string.lower()
    # Remove punctuation marks using regular expressions
    without_punctuation = re.sub(r'[^\w\s]', '', lowercase_string)
    # Replace whitespace runs with a single space and drop leading/trailing blanks
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def get_answer(answer):
    """
    Turn an answer record into a normalized gold-answer string.

    Args:
        answer (dict): answer record with a "type" key and type-specific fields
            ("answer_spans" for spans, "answer_value"/"answer_unit" for values,
            "answer_value" for binary answers)
    Returns:
        str | None: the cleaned answer text, "not enough information" for
        unanswerable questions, or None for an unrecognized type.
    """
    answer_type = answer["type"]
    # Unanswerable questions: accept a missing type as well as the literal
    # string "none" (presumably how the dataset marks them - TODO confirm).
    if answer_type is None or answer_type == "none":
        return "not enough information"
    if answer_type == "span":
        # Only the first annotated span is used as gold answer.
        return clean_string(answer["answer_spans"][0]["text"])
    if answer_type == "value":
        return clean_string(f"{answer['answer_value']} {answer['answer_unit']}")
    if answer_type == "binary":
        return clean_string(answer["answer_value"])
    return None

In [None]:
# Contains tuples with (question, gold_answer)
evaluation_dataset = []

for passage in dev_sample:
    for question in passage["questions"]:
        query = question["question"]
        answer = get_answer(question["answer"])
        # Keep only answerable questions. Question records have no "text" key
        # (reading it raised KeyError), and the evaluation loop unpacks
        # (query, gold) pairs, so only those two fields are stored.
        if answer != "not enough information" and answer is not None:
            evaluation_dataset.append((query, answer))

evaluation_dataset = evaluation_dataset[:50] # First 50 examples with answer

KeyError: 'text'

In [None]:
# Persist the selected evaluation examples to disk as a single JSON document.
with open("evaluation_dataset.json", "w") as fout:
    json.dump(evaluation_dataset, fout)

# Evaluation pipeline

## LLM

In [None]:
from google.colab import userdata

# API keys are read from Colab's secret store instead of being hard-coded.
# get_llm_response sends them as Bearer tokens (api_key -> OpenAI, groq_key -> Groq).
api_key = userdata.get("OPENAI_KEY")
groq_key = userdata.get("GROQ_KEY")

In [None]:
import requests
import time

def get_llm_response(prompt: str, system_prompt: str = None, use_openai: bool = False):
    """
    Send a prompt to a chat-completions endpoint (OpenAI or Groq) and get its answer.

    Non-OK HTTP responses are retried (up to 15 attempts, 3 seconds apart).

    Args:
        prompt (str): a string containing the prompt
        system_prompt (str): a string containing the system prompt
        use_openai (bool): whether to use openai or groq
    Returns:
        tuple: (answer, cost) where answer is the stripped, lower-cased reply
        and cost is the estimated request cost (always 0 for groq)
    Raises:
        RuntimeError: if sending the request or reading the response fails,
            or if every attempt returned a non-OK status.
    """
    max_attempts = 15

    messages = [{"role": "user", "content": prompt}]
    if system_prompt:
        messages.insert(0, {"role": "system", "content": system_prompt})
    data = {
        "model": "gpt-4-turbo" if use_openai else "llama3-70b-8192",
        "messages": messages,
        "temperature": 0,
        "top_p": 1
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key if use_openai else groq_key}"
    }
    url = (
        "https://api.openai.com/v1/chat/completions"
        if use_openai
        else "https://api.groq.com/openai/v1/chat/completions"
    )

    last_status = None
    for _ in range(max_attempts):
        try:
            response = requests.post(url, json=data, headers=headers)
            if not response.ok:
                # Typically rate limiting or a transient server error: back off and retry.
                last_status = response.status_code
                time.sleep(3)
                continue
            payload = response.json()
            if use_openai:
                # Hard-coded per-token rates.
                # NOTE(review): confirm these match the pricing of the model configured above.
                cost = 0.5e-6 * payload["usage"]["prompt_tokens"]
                cost += 1.5e-6 * payload["usage"]["completion_tokens"]
            else:
                cost = 0
            return payload["choices"][0]["message"]["content"].strip().lower(), cost
        except Exception as e:
            # Raise a real exception (raising a tuple is itself a TypeError)
            # and keep the original error chained for debugging.
            raise RuntimeError(f"Error processing llm request: {e}") from e

    # Every attempt came back non-OK: fail loudly instead of returning None,
    # which callers would then fail to unpack.
    raise RuntimeError(
        f"LLM request failed after {max_attempts} attempts (last HTTP status: {last_status})"
    )

## Reranker - MonoT5

In [None]:
from math import exp
from typing import List

import torch
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BatchEncoding,
    AutoModelForSeq2SeqLM
)

class MonoT5():
    """
    MonoT5 reranker: scores (query, passage) pairs with a T5 seq2seq model by
    comparing the logits of the '▁true' and '▁false' tokens at the first
    decoding step.
    """

    def __init__(self, model_name_or_path: str, fp8: bool = False):
        """
        Loads the T5 model from the given path.
        Args:
            model_name_or_path: path to the model
            fp8: when True, load the weights with 8-bit quantization and
                 float16 as torch dtype (the parameter name is kept for
                 backward compatibility)
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        # The training was carried out using two specific tokens for relevant and non-relevant passages
        vocab = self.tokenizer.get_vocab()
        self.token_false_id = vocab['▁false']
        self.token_true_id  = vocab['▁true']

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Loads the model with model_args
        model_args = {}
        if fp8:
            # `load_in_8bit=True` as a direct argument is deprecated; the
            # supported way is a BitsAndBytesConfig passed as quantization_config.
            from transformers import BitsAndBytesConfig
            model_args["torch_dtype"] = torch.float16
            model_args["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, **model_args)
        if not fp8:
            # Inputs are moved to self.device in rescore(), so the model has to
            # live there too. Quantized models are left where the loader put them.
            self.model.to(self.device)
        self.model.eval()

    @torch.no_grad()
    def rescore(self, query: str, batch: List[str], batch_size: int = 32):
        """
        Adapted from Pygaggle's repo with added batch processing.
        Rescore all documents for the given query using smaller batches to save CUDA memory.

        Args:
            query: the query for ranking
            batch: list of passages for ranking
            batch_size: maximum size of each sub-batch to be processed

        Returns:
            List of scores for each document in the batch: the probability of
            the 'true' token after a softmax over the (false, true) logits.
            Empty when `batch` is empty.
        """
        scores = []
        # Process the batch in chunks
        for i in range(0, len(batch), batch_size):
            sub_batch = batch[i:i + batch_size]
            queries_documents = [f"Query: {query} Document: {doc} Relevant:" for doc in sub_batch]
            tokenized = self.tokenizer(
                queries_documents,
                padding=True,
                truncation="longest_first",
                return_tensors="pt",
                max_length=512,
            ).to(self.device)

            # A single decoding step is enough: only the logits of the first
            # generated token are needed.
            _, last_logits = self.greedy_decode(
                model=self.model,
                input_ids=tokenized["input_ids"],
                length=1,
                attention_mask=tokenized["attention_mask"],
                return_last_logits=True
            )

            # Keep only the logits of the two label tokens and normalize over them
            pair_logits = last_logits[:, [self.token_false_id, self.token_true_id]]
            log_probs = torch.nn.functional.log_softmax(pair_logits, dim=1)

            # Column 1 is the 'true' token; convert log-probabilities to
            # probabilities in one vectorized step (float32).
            scores.extend(log_probs[:, 1].float().exp().tolist())

        return scores

    @torch.no_grad()
    def greedy_decode(
        self,
        model,
        input_ids: torch.Tensor,
        length: int,
        attention_mask: torch.Tensor = None,
        return_last_logits: bool = True
    ):
        """
        Adapted from Pygaggle's repo.
        Performs the greedy_decode on t5's output.

        Args:
            model: seq2seq model to decode with
            input_ids: encoder input ids, one row per example
            length: number of decoding steps to run
            attention_mask: encoder attention mask
            return_last_logits: whether to also return the logits of the last step

        Returns:
            decode_ids, or (decode_ids, last_step_logits) when
            return_last_logits is True. last_step_logits is None if length is 0.
        """
        # Every sequence starts with the decoder start token.
        decode_ids = torch.full((input_ids.size(0), 1),
                                model.config.decoder_start_token_id,
                                dtype=torch.long).to(input_ids.device)
        # The encoder runs once; its outputs are reused at every decoding step.
        encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask)
        next_token_logits = None
        for _ in range(length):
            model_inputs = model.prepare_inputs_for_generation(
                decode_ids,
                encoder_outputs=encoder_outputs,
                past=None,
                attention_mask=attention_mask,
                use_cache=True)
            outputs = model(**model_inputs)  # (batch_size, cur_len, vocab_size)
            next_token_logits = outputs[0][:, -1, :]  # (batch_size, vocab_size)
            # Greedy choice: append the arg-max token of this step.
            decode_ids = torch.cat([decode_ids,
                                    next_token_logits.max(1)[1].unsqueeze(-1)],
                                dim=-1)
        if return_last_logits:
            return decode_ids, next_token_logits
        return decode_ids

In [None]:
# Instantiate the MonoT5 reranker once; fp8=True selects the 8-bit loading
# path in MonoT5.__init__. The search functions below use this global.
t5_ranker = MonoT5('castorini/monot5-large-msmarco-10k', fp8=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

## BM25 Search

In [None]:
from pyserini.search.lucene import LuceneSearcher

# BM25 first-stage retriever (initial retrieval) over the Lucene index
searcher = LuceneSearcher('/content/iirc')

def get_passages(query, top_k):
    """Return the `top_k` hits that the Lucene searcher finds for `query`."""
    return searcher.search(query, top_k)

## ReAct

In [None]:
# System prompt for the ReAct agent. It is sent to the LLM verbatim (never
# passed through str.format), so it must not contain template placeholders or
# escaped braces. The worked example follows the output format described in
# the prompt itself.
system_prompt = """
You are designed to help with a variety of tasks, from answering questions \
    to providing summaries to other types of analyses.

## Tools
You have access to a wide variety of tools. You are responsible for using
the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools
to complete each subtask.

You have access to the following tools:
search_system: this is a useful tool you must use when looking to answer questions.

## Output Format
To answer the question, please use the following format.

```
Thought: I need to use a tool to help me answer the question.
Action: tool name (one of: search_system) if using a tool.
Action Input: the input to the search tool (the question to be searched)
```

Please ALWAYS start with a Thought.

Please use a valid JSON format for the Action Input. Do NOT do this {'input': 'hello world'}.

If this format is used, the user will respond in the following format:

```
Observation: tool response
```

You should keep repeating the above format until you have enough information
to answer the question without using any more tools. At that point, you MUST respond
in the one of the following two formats:

```
Thought: I can answer without using any more tools.
Answer: [your answer here]
```

```
Thought: I cannot answer the question with the provided tools.
Answer: Sorry, I cannot answer your query.
```
 Here I present you with an example of conversation of a human and your messages:
User: Is it legal for a licensed child driving Mercedes-Benz to be employed in US?

A:
Thought: I need to use my search_system to look for two answers: "What is the minimum driving age in the US?"

Action: search_system

Action Input: {"query": "What is the minimum driving age in the US?"}

Observation: the minimum age for driving is 18 years old.

Thought: I have the answer for one of the question, therefore I don't need to rephrase it. Let's look to the other one:

Action: search_system

Action Input: {"query": "What is the minimum age for someone to be employed in the US?"}

Observation: the minimum age to have a job is 18 years.

Thought: I have both answers, let's answer the original question:

Answer: Yes, it is possible since they occur at the same age.

## Current Conversation
Below is the current conversation consisting of interleaving human and assistant messages.
""".strip()

In [None]:
# Prompt template for answering a query from retrieved passages.
# get_search_answer fills {query} and {documents} with str.format.
search_prompt = r"""
Given a list of documents, try to answer a question exclusively based on the documents.
Query: {query}
Documents:
{documents}
Answer:
""".strip()

In [None]:
def get_search_answer(
    query: str,
    use_openai=False,
    top_k_bm25: int = 50,
    top_k_ranker: int = 5
) -> tuple[str, float]:
    """
    Answers a question with a retrieve -> rerank -> read pipeline.

    BM25 retrieves `top_k_bm25` passages, MonoT5 rescores them, and the
    `top_k_ranker` best ones are given to the LLM together with the question.

    Args:
        query (str): The question to answer
        use_openai (bool): Whether to use OpenAI or Llama3-70b
        top_k_bm25 (int): The number of passages to retrieve from BM25
        top_k_ranker (int): The number of passages to retrieve from the ranker
    Returns:
        tuple[str, float]: The answer and the cost of the query
    """
    hits = searcher.search(query, top_k_bm25)
    # The index stores the raw JSON of each window.
    paragraphs = [json.loads(hit.raw) for hit in hits]
    scores = t5_ranker.rescore(query, [p["contents"] for p in paragraphs])
    ranked = sorted(zip(paragraphs, scores), key=lambda pair: pair[1], reverse=True)
    documents_string = "".join(
        f"Document {idx} - {doc['title']}:\n{doc['contents']}\n\n"
        for idx, (doc, _) in enumerate(ranked[:top_k_ranker])
    )
    # No brace escaping here: str.format does not re-parse substituted values,
    # so doubling "{"/"}" would send literal "{{"/"}}" to the LLM.
    prompt = search_prompt.format(query=query, documents=documents_string)
    answer, cost = get_llm_response(prompt, use_openai=use_openai)
    return answer, cost

def parse_llm_answer(answer: str) -> dict:
    """
    Parses the answer from the LLM.

    Looks for an "action input:" marker first and an "answer:" marker second
    (case-insensitive). The text between the first occurrence of the marker
    and the next occurrence (or the end of the text) is the value.

    Args:
        answer (str): The answer from the LLM
    Returns:
        dict: The parsed answer with the corresponding key, i.e.
        {"action_input": <parsed JSON, or the raw text if it is not JSON>} or
        {"answer": <text>}; None if neither marker (with its colon) is present.
    """
    import re  # local import keeps this function self-contained

    def _load_json(json_str: str):
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        # The model often keeps writing after the JSON object (e.g. an invented
        # "Observation:"). Decode just the first JSON object in that case.
        start = json_str.find("{")
        if start != -1:
            try:
                parsed, _ = json.JSONDecoder().raw_decode(json_str[start:])
                return parsed
            except json.JSONDecodeError:
                pass
        return json_str

    keywords = (
        ("action input", "action_input"),
        ("answer", "answer"),
    )
    for key, label in keywords:
        # Require the colon and ignore case, so a bare mention of the keyword
        # (e.g. "I cannot answer this") is not mistaken for the marker.
        parts = re.split(re.escape(f"{key}:"), answer, maxsplit=2, flags=re.IGNORECASE)
        if len(parts) < 2:
            continue
        payload = parts[1].strip()
        return {label: _load_json(payload) if label == "action_input" else payload}
    return None

def get_react_answer(query, use_openai=False, max_react_attempts: int = 3):
    """
    Returns the answer to a question using a ReAct agent.

    Each attempt asks the LLM for its next step. An "answer" ends the loop; an
    "action input" with a "query" triggers a search whose result is appended
    to the prompt as an observation. When the attempts are used up, the LLM is
    asked one last time to answer with what it has.

    Args:
        query (str): The question to answer
        use_openai (bool): Whether to use OpenAI or Llama3-70b
        max_react_attempts (int): The maximum number of attempts to answer the question
    Returns:
        tuple[str, float]: The answer (None if no answer could be parsed) and the cost of the query
    """
    total_cost = 0
    prompt = "Query: " + query

    # A for-loop guarantees that every LLM call consumes one attempt, including
    # malformed action inputs (which previously could loop forever).
    for _ in range(max_react_attempts):
        llm_answer, cost = get_llm_response(prompt, system_prompt, use_openai=use_openai)
        total_cost += cost
        parsed_answer = parse_llm_answer(llm_answer)
        print(parsed_answer)
        if parsed_answer is None:
            # Nothing usable in the reply; try again on the next attempt.
            continue

        key, value = next(iter(parsed_answer.items()))
        if key == "answer":
            return value, total_cost

        # key == "action_input"
        if not isinstance(value, dict) or "query" not in value:
            # Not a usable search request: feed it back and ask for an answer.
            prompt += f"\nThought: {value}\nAnswer:"
            continue
        search_answer, cost = get_search_answer(value["query"], use_openai=use_openai)
        total_cost += cost
        prompt += f"\nObservation: {search_answer}"
        print(search_answer)

    if max_react_attempts > 0:
        # Attempts exhausted: force a final answer from what was gathered so far.
        prompt += "\nI MUST answer the question with the informations I have so far. If it is not enough say: I cannot answer your question with the provided information.\nAnswer:"
        llm_answer, cost = get_llm_response(prompt, system_prompt, use_openai=use_openai)
        total_cost += cost
        parsed_answer = parse_llm_answer(llm_answer)
        if parsed_answer is not None:
            return parsed_answer.get("answer"), total_cost
    return None, total_cost

## Evaluation + metrics

In [None]:
from tqdm.auto import tqdm

# evaluation_dataset: (tuple) => (query, gold_answer[, extra fields...])
# NOTE(review): get_score is not defined anywhere in the visible notebook; it
# has to be defined (or imported) before this cell is run.
scores = []
total_cost = 0
with tqdm(evaluation_dataset, desc="Processing") as pbar:
    with open("output.jsonl", "w", encoding="utf8") as fout:
        # `*_` tolerates tuples that carry more than (query, gold).
        for query, gold, *_ in pbar:
            pred, cost = get_react_answer(query, use_openai=True)
            print(f"Query: {query}")
            print(f"Gold: {gold}")
            print(f"Pred: {pred}")
            total_cost += cost
            score = get_score(gold, pred)
            scores.append(score)
            # Exact match after lower-casing; stored with the record so it is
            # not computed and then thrown away.
            em = int(str(pred).lower() == str(gold).lower())
            pbar.set_postfix({"Total Cost": total_cost})
            data = {
                "query": query,
                "gold": gold,
                "pred": pred,
                "score": score,
                "em": em
            }
            fout.write(json.dumps(data) + "\n")

print(f"Processing total cost: ${total_cost}")

NameError: name 'evaluation_dataset' is not defined

# ReAct Answers from Gold Documents - Baseline Values




In [None]:
from tqdm.auto import tqdm
import json
from bs4 import BeautifulSoup

# Note: obtain_search_answer differs from the get_search_answer implementation above. For the
# baseline values it only uses the T5 ranker over the gold (linked) documents.
# It keeps the unused BM25 parameters so the two functions remain easy to swap.

def extract_passages_from_links(links, context_articles, main_text):
    """
    Build the list of gold documents for a question: the main passage plus
    every linked article that exists in `context_articles`.

    Args:
        links (list): Link titles; each one is looked up lower-cased.
        context_articles (dict): Article texts keyed by lower-cased title.
        main_text (str): The main text of the document (skipped when empty).

    Returns:
        list: Dictionaries with "title" and "contents" (HTML stripped,
        trimmed and lower-cased), main document first.
    """
    def _plain(html):
        # HTML -> trimmed, lower-cased plain text
        return BeautifulSoup(html, 'html.parser').get_text().strip().lower()

    passages = []
    if main_text:
        passages.append({"title": "Main Document", "contents": _plain(main_text)})

    passages.extend(
        {"title": name, "contents": _plain(context_articles[name.lower()])}
        for name in links
        if name.lower() in context_articles
    )
    return passages

def identify_best_passage(query, linked_documents, top_k_ranker=5):
    """
    Pick the single window, among all linked documents, that the T5 ranker
    scores highest for the query.

    Args:
        query (str): The query to be answered.
        linked_documents (list): Documents ({"title", "contents"}) to search in.
        top_k_ranker (int): Accepted for interface compatibility; not used here.

    Returns:
        tuple: (best window text, its score); ("", 0.0) when there are no documents.
    """
    if not linked_documents:
        return "", 0.0

    # Same windowing as the indexed collection: 150 words, 75 overlapping.
    candidate_windows = [
        window
        for doc in linked_documents
        for window in get_sliding_window(doc, words_per_window=150, words_overlap=75)
    ]

    relevance = t5_ranker.rescore(query, candidate_windows)
    top_index = relevance.index(max(relevance))
    return candidate_windows[top_index], relevance[top_index]

def obtain_search_answer(query, linked_documents, use_openai=False, top_k_bm25=50, top_k_ranker=5):
    """
    Baseline replacement for the search tool: returns the best gold passage
    for the query using only the T5 ranker (no BM25 retrieval, no LLM call).

    Args:
        query (str): The question to be answered.
        linked_documents (list): List of linked documents retrieved.
        use_openai (bool): Unused; kept so the signature mirrors the regular search function.
        top_k_bm25 (int): Number of passages to retrieve using BM25 (not used in this version).
        top_k_ranker (int): Forwarded to identify_best_passage.

    Returns:
        tuple: The best passage and its ranker score. Note that the second
        element is a relevance score, not a monetary cost.
    """
    if not linked_documents:
        return "", 0.0
    # Same logic as identify_best_passage: delegate instead of duplicating it.
    return identify_best_passage(query, linked_documents, top_k_ranker=top_k_ranker)

def parse_llm_response(answer):
    """
    Parses the response from the language model.

    Looks for an "action input:" marker first and an "answer:" marker second
    (case-insensitive). The text between the first occurrence of the marker
    and the next occurrence (or the end of the text) is the value.

    Args:
        answer (str): The response from the language model.

    Returns:
        dict: {"action_input": <parsed JSON, or the raw text if it is not JSON>}
        or {"answer": <text>}; None if neither marker (with its colon) is present.
    """
    import re  # local import keeps this function self-contained

    def _load_json(json_str):
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        # Tolerate trailing text after the JSON object by decoding only the
        # first object found.
        start = json_str.find("{")
        if start != -1:
            try:
                parsed, _ = json.JSONDecoder().raw_decode(json_str[start:])
                return parsed
            except json.JSONDecodeError:
                pass
        return json_str

    keywords = (
        ("action input", "action_input"),
        ("answer", "answer"),
    )
    for key, label in keywords:
        # Require the colon and ignore case, so a bare mention of the keyword
        # does not raise an IndexError or get mistaken for the marker.
        parts = re.split(re.escape(f"{key}:"), answer, maxsplit=2, flags=re.IGNORECASE)
        if len(parts) < 2:
            continue
        payload = parts[1].strip()
        return {label: _load_json(payload) if label == "action_input" else payload}
    return None

def retrieve_react_answer(query, linked_documents, use_openai=False, max_react_attempts=3):
    """
    Retrieves the answer to a question using a ReAct agent whose search tool
    only looks inside the given gold documents.

    Args:
        query (str): The question to be answered.
        linked_documents (list): Gold documents the search tool may use.
        use_openai (bool): Flag to indicate usage of OpenAI (default is False).
        max_react_attempts (int): Maximum number of attempts to answer the question.

    Returns:
        tuple: (answer, total LLM cost, number of completed attempts, search log).
        The answer is None when no answer could be parsed.
    """
    react_attempts = 0
    total_cost = 0
    prompt = "Query: " + query
    search_log = []

    while react_attempts < max_react_attempts:
        llm_answer, cost = get_llm_response(prompt, system_prompt, use_openai=use_openai)
        total_cost += cost
        parsed_answer = parse_llm_response(llm_answer)

        if parsed_answer is not None:
            key, value = next(iter(parsed_answer.items()))
            if key == "answer":
                return value, total_cost, react_attempts, search_log
            if isinstance(value, dict) and "query" in value:
                # The second return value is the ranker score, not a cost, so
                # it must not be added to total_cost.
                search_answer, _score = obtain_search_answer(value["query"], linked_documents)
                prompt += f"\nObservation: {search_answer}"
                search_log.append({
                    "query": value["query"],
                    "context": search_answer
                })
            else:
                # Unusable action input: feed it back and ask for an answer.
                prompt += f"\nThought: {value}\nAnswer:"
        # Every LLM call consumes an attempt, so the loop always terminates.
        react_attempts += 1

    if max_react_attempts > 0:
        # Attempts exhausted: force a final answer from what was gathered so far.
        prompt += ("\nI MUST answer the question with the information I have so far. "
                   "If it is not enough say: I cannot answer your question with the provided information.\nAnswer:")
        llm_answer, cost = get_llm_response(prompt, system_prompt, use_openai=use_openai)
        total_cost += cost
        parsed_answer = parse_llm_response(llm_answer)
        if parsed_answer is not None:
            return parsed_answer.get("answer"), total_cost, react_attempts, search_log

    return None, total_cost, react_attempts, search_log

def generate_json_output(evaluation_dataset, context_articles, output_file="react_on_gold_50_Questions.json"):
    """
    Runs the gold-document ReAct baseline over the evaluation dataset and
    writes one JSON object per line to `output_file`.

    Each line holds: "query", "gold_answer", "pred_answer", "rounds" and
    "search_log".

    Args:
        evaluation_dataset (list): List of evaluation data items (dicts with
            "question", "answer" and optionally "question_links" and "text").
        context_articles (dict): Dictionary containing context articles.
        output_file (str): Path to the output file (default is "react_on_gold_50_Questions.json").
    """
    with open(output_file, "w") as f:
        for item in tqdm(evaluation_dataset, desc="Constructing Answers"):
            query = item['question']
            gold_answer = item['answer']
            question_links = item.get('question_links', [])
            main_text = item.get('text', '')

            linked_documents = extract_passages_from_links(question_links, context_articles, main_text)

            # The agent reranks the gold documents itself for every search it
            # issues, so no separate ranking pass is needed here.
            predicted_answer, cost, rounds, search_log = retrieve_react_answer(query, linked_documents)

            result = {
                "query": query,
                "gold_answer": gold_answer,
                "pred_answer": predicted_answer,
                "rounds": rounds,
                "search_log": search_log
            }
            json.dump(result, f)
            f.write('\n')

# Assuming dev_sample is loaded and structured correctly.
# Keep the first 50 questions that have a usable gold answer, together with
# their links and the text of the passage they belong to.
evaluation_dataset = [
    {
        "question": q["question"],
        "answer": q["answer"],
        "question_links": q.get("question_links", []),
        "text": passage["text"]
    }
    for passage in dev_sample
    for q in passage["questions"]
    if get_answer(q["answer"]) not in ("not enough information", None)
][:50]

# Call the function to create the JSON output
generate_json_output(evaluation_dataset, context_articles)


Constructing Answers:   0%|          | 0/50 [00:00<?, ?it/s]

MAIN TEXT!!!! The Palici (Παλικοί in Greek), or Palaci, were a pair of indigenous Sicilian chthonic deities in Roman mythology, and to a lesser extent in Greek mythology. They are mentioned in Ovid's Metamorphoses V, 406, and in Virgil's Aeneid IX, 585. Their cult centered on three small lakes that emitted sulphurous vapors in the Palagonia plain, and as a result these twin brothers were associated with geysers and the underworld. There was also a shrine to the Palaci in Palacia, where people could subject themselves or others to tests of reliability through divine judgement; passing meant that an oath could be trusted. The mythological lineage of the Palici is uncertain; one legend made the Palici the sons of Zeus, or possibly Hephaestus, by Aetna or Thalia, but another claimed that the Palici were the sons of the Sicilian deity Adranus.

Debug: Processing query: What is Zeus know for in Greek mythology?
Debug: Retrieving passages from links...
Debug: Retrieved document: {'title': 'Ze

# Evaluation

In [None]:
# Prompt asking the LLM to judge a prediction against the gold answer.
# get_pred_score fills the placeholders with str.format and expects a
# "true"/"false" reply.
pred_eval_prompt = """
Given a question, the correct answer and the predicted answer, verify whether the predicted answer is correct.
question: {question}
correct_answer: {correct_answer}
predicted_answer: {predicted_answer}
Answer (true/false):
"""

# System prompt for the context-relevance metric: the LLM has to list the
# sentences of a text that are needed to answer a question.
system_context_relevance = """
You are a helpful assistant. You will be presented with a text and a question. Your role is
to extract a Python list with the sentences that should be used to answer the question.
"""

# User prompt for the context-relevance metric. The literal {{text}} and
# {{question}} markers are substituted with str.replace (not str.format) in
# get_context_relevance.
prompt_context_relevance = """
Text: {{text}}
Question: {{question}}
Sentences (answer with just a Python list):
""".strip()

In [None]:
import spacy

def get_pred_score(question, correct_answer, predicted_answer):
    """
    Ask the LLM whether the predicted answer matches the correct answer.

    Args:
        question (str): the question that was asked
        correct_answer (str): the gold answer
        predicted_answer (str): the answer produced by the system
    Returns:
        bool: True if the LLM judged the prediction correct, False otherwise
    Raises:
        ValueError: if the reply does not start with "true" or "false"
    """
    prompt = pred_eval_prompt.format(
        question=question,
        correct_answer=correct_answer,
        predicted_answer=predicted_answer,
    )
    response, _ = get_llm_response(prompt)
    # Tolerate surrounding quotes, markdown markers and trailing punctuation
    # ("True.", "**false**") instead of aborting the whole evaluation.
    verdict = response.strip().lower().strip(" \t\r\n.!:\"'`*")
    if verdict.startswith("true"):
        return True
    if verdict.startswith("false"):
        return False
    raise ValueError(f"Expected a true/false verdict from the LLM, got: {response!r}")

# Lazily loaded spaCy pipeline, shared by all calls to break_text_into_sentences.
_SENTENCE_PIPELINE = None

def break_text_into_sentences(text: str):
    """
    Split a text into sentences with spaCy's 'en_core_web_sm' pipeline.

    The pipeline is loaded on first use and cached, because loading it on
    every call is slow and this function is called once per search log entry.

    Args:
        text (str): the text to split
    Returns:
        list[str]: the sentence texts, in order
    """
    global _SENTENCE_PIPELINE
    if _SENTENCE_PIPELINE is None:
        _SENTENCE_PIPELINE = spacy.load('en_core_web_sm')
    return [sent.text for sent in _SENTENCE_PIPELINE(text).sents]

def get_context_relevance(context, question):
    """
    Context relevance: the fraction of the context's sentences that the LLM
    selects as needed to answer the question.

    Args:
        context (str): the retrieved context
        question (str): the question the context should help answer
    Returns:
        float: number of selected sentences divided by the number of sentences
        in the context; 0.0 when the context has no sentences
    Raises:
        ValueError: if the LLM reply does not contain a parsable Python list
    """
    import ast  # local import: only needed to parse the LLM reply safely

    num_sents = len(break_text_into_sentences(context))
    if num_sents == 0:
        # Nothing to select from (and nothing to divide by).
        return 0.0

    prompt = prompt_context_relevance.replace("{{text}}", context).replace("{{question}}", question)
    response, _ = get_llm_response(prompt, system_context_relevance, use_openai=False)

    # The reply is untrusted model output: parse it as a literal instead of
    # eval()-ing it, and ignore anything around the list (code fences, prose).
    start, end = response.find("["), response.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"No Python list found in LLM response: {response!r}")
    try:
        sents = ast.literal_eval(response[start:end + 1])
    except (ValueError, SyntaxError) as err:
        raise ValueError(f"Could not parse LLM response as a Python list: {response!r}") from err
    return len(sents)/num_sents

In [None]:
metrics = {}

In [None]:
# Compute per-query metrics from the baseline output (one JSON object per line).
with open("/content/react_on_gold_output_debug.jsonl", "r") as fin:
    for line in fin:
        if not line.strip():
            continue  # skip blank lines
        data = json.loads(line)
        query = data["query"]
        gold = get_answer(data["gold_answer"])
        # Accept both key spellings: the debug file uses "predicted_answer" /
        # "search_logs", while generate_json_output writes "pred_answer" / "search_log".
        pred = data["predicted_answer"] if "predicted_answer" in data else data["pred_answer"]
        search_logs = data["search_logs"] if "search_logs" in data else data["search_log"]

        relevances = [get_context_relevance(item["context"], query) for item in search_logs]
        # Build the entry from scratch so that re-running the cell does not
        # accumulate on old values, and so that queries without any search
        # still get a complete entry.
        metrics[query] = {
            "context_relevance": sum(relevances)/len(relevances) if relevances else 0.0,
            "correct": get_pred_score(query, gold, pred),
            "num_rounds": len(search_logs),
        }



In [None]:
# Calculate average values
num_rounds = 0
correct = 0
total = 0
context_relevance = 0
for value in metrics.values():
    num_rounds += int(value["num_rounds"])
    correct += int(value["correct"])
    total += 1
    # Entries without any search have no context relevance; count them as 0.
    context_relevance += float(value.get("context_relevance", 0.0))

if total == 0:
    # Avoid a ZeroDivisionError when no metrics were collected.
    print("No metrics collected; nothing to average.")
else:
    print("Average number of rounds:", num_rounds/total)
    print("Average correct:", correct/total)
    print("Average context relevance:", context_relevance/total)

{'context_relevance': 0.5, 'correct': True, 'num_rounds': 2}
Average number of rounds: 2.0
Average correct: 1.0
Average context relevance: 0.5
