<a href="https://colab.research.google.com/github/thiagolaitz/IA368-search-engines/blob/main/Project%2009/multidocqa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction - Multi-document QA

This Colab notebook presents a comprehensive approach to building a multi-document question answering (QA) pipeline and evaluating its performance on IIRC. The pipeline encompasses various natural language processing techniques, including document retrieval (using BM25 + Monot5 ranking), question decomposition, evidence reasoning and answer aggregation. By leveraging the power of deep learning models and state-of-the-art language understanding, this notebook aims to provide a practical and robust solution for extracting relevant information from multiple sources and generating concise answers to user queries in real-world scenarios.

In [1]:
!pip install beautifulsoup4 -q

In [2]:
!pip install pyserini -q
!pip install faiss-cpu==1.7.2 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m123.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.3 MB/s[0m e

In [None]:
!pip install accelerate

# IIRC

In [3]:
!wget https://iirc-dataset.s3.us-west-2.amazonaws.com/context_articles.tar.gz -q 
!wget https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_test.json -q

In [4]:
!tar -xf context_articles.tar.gz

## Data Preprocessing

In [5]:
import json

dev_data = json.load(open("/content/iirc_test.json", "r"))
context_articles = json.load(open("/content/context_articles.json", "r"))

In [None]:
from bs4 import BeautifulSoup

# Adapted from Visconde: https://github.com/neuralmind-ai/visconde
documents = []
all_titles = []

# Gets content and title of passages
# Filters HTML tags using beautifulSoup
for item in dev_data:
    if item['title'].lower() not in all_titles:
        documents.append({
                "title": BeautifulSoup(item['title'], 'html.parser').get_text().strip().lower(),
                "contents": BeautifulSoup(item["text"], 'html.parser').get_text().strip().lower()
            }
        )
        all_titles.append(BeautifulSoup(item["title"], 'html.parser').get_text().strip().lower())
    # Get content from related links
    for link in item["links"]:
        if link['target'].lower() in context_articles and link['target'].lower() not in all_titles:
            documents.append({
                "title": BeautifulSoup(link['target'], 'html.parser').get_text().strip().lower(),
                "contents": BeautifulSoup(context_articles[link['target'].lower()], 'html.parser').get_text().strip().lower()
            })
            all_titles.append(BeautifulSoup(link['target'], 'html.parser').get_text().strip().lower())

In [7]:
import os

os.makedirs("iirc_docs", exist_ok=True)

# Creates a jsonl file with all contents to be indexed using the Pyserini's library
with open("iirc_docs/iirc_docs.jsonl", "w") as fout:
    for idx, doc in enumerate(documents):
        doc_dict = {"id": idx, **doc}
        fout.write(json.dumps(doc_dict) + "\n")

## BM25 Index

In [8]:
# Creates the BM25 index with Pyserini
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input iirc_docs \
  --index iirc \
  --language en\
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors --storeRaw 

2023-05-17 19:05:27,106 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-05-17 19:05:27,108 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-05-17 19:05:27,109 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: iirc_docs
2023-05-17 19:05:27,109 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-05-17 19:05:27,109 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-05-17 19:05:27,109 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 1
2023-05-17 19:05:27,110 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-05-17 19:05:27,110 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-05-17 19:05:27,110 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Keep stopword

## Questions and Gold data

In [79]:
import random

dev_sample = random.sample(dev_data, 50)

In [80]:
import re

def clean_string(input_string):
    # Convert the string to lowercase
    lowercase_string = input_string.lower()
    # Remove multiple spaces with a single space
    cleaned_string = re.sub(r'\s+', ' ', lowercase_string)
    # Remove punctuation marks using regular expressions
    cleaned_string = re.sub(r'[^\w\s]', '', cleaned_string)
    
    return cleaned_string

def get_answer(answer):
    if answer["type"] == "span":
        return clean_string(answer["answer_spans"][0]["text"])
    elif answer["type"] == "value":
        return clean_string(f"{answer['answer_value']} {answer['answer_unit']}")
    elif answer["type"] == None:
        return "not enough information"
    elif answer["type"] == "binary":
        return clean_string(answer["answer_value"])

In [81]:
# Contains tuples with (question, gold_answer)
evaluation_dataset = []

for passage in dev_sample:
    for question in passage["questions"]:
        query = question["question"]
        answer = get_answer(question["answer"])
        evaluation_dataset.append((query, answer))

# Selects only a subset of all dataset
evaluation_dataset = evaluation_dataset[:20]

# Evaluation pipeline

## OpenAI

In [12]:
!pip install openai -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
api_key = "YOUR_API_KEY"

In [41]:
import requests
import time

def get_llm_response(prompt):
    """
    Send a prompt to ChatGPT and get its answer.
    Args:
        prompt (str): a string containing the prompt
    Returns:
        The answer and the request cost
    """
    for _ in range(5):
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0,
            "top_p": 1
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            json=data,
            headers=headers
        )
        if not response.ok:
            time.sleep(5)
            continue
        response = response.json()
        cost = 0.000002 * response["usage"]["total_tokens"]
        
        return response["choices"][0]["message"]["content"].strip().lower(), cost

## Reranker - MonoT5

In [15]:
from math import exp
from typing import List

import torch
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BatchEncoding,
    AutoModelForSeq2SeqLM
)

class MonoT5():
    def __init__(self, model_name_or_path: str = 'castorini/monot5-base-msmarco-10k', fp16: bool = False):
        """
        Loads the T5 model from the given path.
        Args:
            model_name_or_path: path to the model
            fp16: whether the model should be loaded using FP16
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        # The training was carried out using two specific tokens for relevant and non-relevant passages
        self.token_false_id = self.tokenizer.get_vocab()['▁false']
        self.token_true_id  = self.tokenizer.get_vocab()['▁true']

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Loads the model with model_args
        model_args = {}
        if fp16:
            model_args["torch_dtype"] = torch.float16

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, **model_args).to(self.device)

    @torch.no_grad()
    def rescore(self, query: str, batch: List[str]):
        """
        Adapted from Pygaggle's repo.
        Rescore all documents for the given query.
        Args:
            query: the query for ranking
            batch: list of passages for ranking
        """
        scores = []
        # Creates the inputs to the model
        queries_documents = [f"Query: {query} Document: {text} Relevant:" for text in batch]
        tokenized = self.tokenizer(
            queries_documents,
            padding=True,
            truncation="longest_first",
            return_tensors="pt",
            max_length=256,
        ).to(self.device)
        input_ids = tokenized["input_ids"].to(self.device)
        attention_mask = tokenized["attention_mask"].to(self.device)
        _ , batch_scores = self.greedy_decode(model=self.model,
                                            input_ids=input_ids,
                                            length=1,
                                            attention_mask=attention_mask,
                                            return_last_logits=True)
        batch_scores = batch_scores[:, [self.token_false_id, self.token_true_id]]
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        batch_log_probs = batch_scores[:, 1].tolist()
        batch_probs = [exp(log_prob) for log_prob in batch_log_probs]
        scores.extend(batch_probs)
        return scores

    @torch.no_grad()
    def greedy_decode(
        self,
        model,
        input_ids: torch.Tensor,
        length: int,
        attention_mask: torch.Tensor = None,
        return_last_logits: bool = True
    ):
        """
        Adapted from Pygaggle's repo.
        Performs the greedy_decode on t5's output.
        """
        decode_ids = torch.full((input_ids.size(0), 1),
                                model.config.decoder_start_token_id,
                                dtype=torch.long).to(input_ids.device)
        encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask)
        next_token_logits = None
        for _ in range(length):
            model_inputs = model.prepare_inputs_for_generation(
                decode_ids,
                encoder_outputs=encoder_outputs,
                past=None,
                attention_mask=attention_mask,
                use_cache=True)
            outputs = model(**model_inputs)  # (batch_size, cur_len, vocab_size)
            next_token_logits = outputs[0][:, -1, :]  # (batch_size, vocab_size)
            decode_ids = torch.cat([decode_ids,
                                    next_token_logits.max(1)[1].unsqueeze(-1)],
                                dim=-1)
        if return_last_logits:
            return decode_ids, next_token_logits
        return decode_ids

In [17]:
t5_3b = MonoT5('castorini/monot5-large-msmarco-10k', fp16=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

For stability purposes, it is recommended to have accelerate installed when using this model in torch.float16, please install it with `pip install accelerate`


## Query decomposition + search

The idea is to break the query into multiple sub-questions, allowing the aggregation step to effectively answer the original query.

In [16]:
import spacy

def break_text_into_sentences(text: str):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

In [18]:
import json

def decompose_query(query):
    # Prompt adapted from Visconde
    prompt = f"Decompose a question in sub-questions using a python list. Use \"The question needs no decomposition\" when no decomposition is needed.\n\nExample 1:\n\nQuestion: Is Hamlet more common on IMDB than Comedy of Errors?\n\nDecompositions: \n[\"How many listings of Hamlet are there on IMDB?\", \"How many listing of Comedy of Errors is there on IMDB?\"]\n\nExample 2:\n\nQuestion: Are birds important to badminton?\n\nDecompositions:\nThe question needs no decomposition\n\nExample 3:\n\nQuestion: Is it legal for a licensed child driving Mercedes-Benz to be employed in US?\n\nDecompositions:\n[\"What is the minimum driving age in the US?\", \"What is the minimum age for someone to be employed in the US?\"]\n\nExample 4:\n\nQuestion: Are all cucumbers the same texture?\n\nDecompositions:\nThe question needs no decomposition\n\nExample 5:\n\nQuestion: Hydrogen's atomic number squared exceeds number of Spice Girls?\n\nDecompositions:\n[\"What is the atomic number of hydrogen?\", \"How many Spice Girls are there?\"]\n\nExample 6:\n\nQuestion: {query}\n\nDecompositions:"
    answer, _ = get_llm_response(prompt)
    try:
        return json.loads(answer)
    except:
        return []

In [19]:
from pyserini.search.lucene import LuceneSearcher

# Performs the BM25 search (initial retrieval)
searcher = LuceneSearcher('/content/iirc')

def get_passages(query, top_k):
    hits = searcher.search(query, top_k)
    return hits

## Aggregation

The full pipeline consists of four steps: (1) Question decomposition, (2) Document retrieval for each sub-question, (3) QA for each sub-question, and (4) aggregation of answers.

In [107]:
def get_aggregated_answer(question):
    # [STEP 1] - get sub-questions
    decomposed_questions = decompose_query(question)
    if len(decomposed_questions) == 0:
        decomposed_questions = [question]

    aggregated_prompt = f"Provide an answer to the question \"{question}\" given the answers of the questions below:\n"
    for q_idx, query in enumerate(decomposed_questions):
        # [STEP 2] - document retrieval
        hits = get_passages(question, top_k=3)
        # [STEP 3] - QA on sub-questions - Adapted from Visconde
        prompt = f"For each example, use the documents to provide an answer to the question and cite evidence from the documents to support the answer. If there is not enough information in the documents to answer the question, then state \"not enough information\".\n\nExample 1:\n\nDocument 1: Title: Fred J. Shields. Content: Ollie Murphy\'s first-half goal gave \'the Royals\' a huge boost at half-time.\n\nDocument 2: Title: Ollie Murphy. Content: He plays club football for Carnaross\n\nDocument 3: Title: Ollie Murphy. Content: He came to national prominence in 1999 when he was one of Meath\'s best player\'s\n\nQuestion: Based on the above documents, Did Ollie Murphy play for any teams other than \'the Royals\'?\n\nEvidence: Document 1 says that Ollie Murphy\'s first-half goal gave \'the Royals\' a huge boost at half-time. However, this does not necessarily mean that Ollie only played for \'the Royals\'. Document 2 states that Ollie plays club football for Carnaross. This suggests that Ollie may have played for other teams in addition to \'the Royals\'. Document 3 says that Ollie came to national prominence in 1999 when he was one of Meath\'s best players. This also suggests that Ollie has played for other teams. Therefore, it is likely that Ollie has played for teams other than \'the Royals\'.\n\nAnswer: yes.\n\nExample 2:\n\nDocument 1: Title: Don Rendell. Content: The club played in the Brunei Premier League in the early 2000s, winning the league title in 2002 and 2004.\n\nQuestion: Based on the above documents, What club came in second at the 2004 Brunei Premier League?\n\nEvidence:  There is not enough information in the documents to answer the question.\n\nAnswer: not enough information.\n\nExample 3:\n\nDocument 1: Title: Stacy Compton. Content: Despicable Me, the first film in the series, and the first film from Illumination, was released on July 9, 2010.\n\nDocument 2: Title: Miranda Cosgrove. Content: Cosgrove\'s first television appearance (aside from commercials) was in 2001 as the voice of young Lana Lang in the pilot episode of Smallville.\n\nDocument 3: Title: Miranda Cosgrove. Content: In 2004, Cosgrove soon landed her first major role in a television series when she was awarded a main role in the Nickelodeon series Drake & Josh\n\nDocument 4: Title: Miranda Cosgrove. Content: Also in 2004, Cosgrove guest-starred in a special episode of the animated series What\'s New, Scooby-Doo?, as well as guest-starring in a season five episode of Grounded For Life\n\nDocument 5: Title: Miranda Cosgrove. Content: The television series, which aired on Disney, is a spin-off of the original film, Lilo & Stitch\n\nDocument 6: Title: Miranda Cosgrove. Content: The first of these appearances was in Zoey 101. Cosgrove later guest starred on an episode of Unfabulous,\n\nDocument 7: Title: Miranda Cosgrove. Content: However, Cosgrove was already in the works of starring in her own sitcom, titled iCarly, released on September 8, 2007.\n\nQuestion: Based on the above documents, How many TV shows had Miranda Cosgrove been featured in by the year Despicable Me was released?\n\nEvidence: According to document 1, Despicable Me was released on July 9, 2010.Document 2 states that Cosgrove\'s first television appearance was in 2001 as the voice of young Lana Lang in the pilot episode of Smallville.Document 3 says that, in 2004, Cosgrove landed her first major role in a television series when she was awarded a main role in the Nickelodeon series Drake & Josh.Document 4 states that, also in 2004, Cosgrove guest-starred in a special episode of the animated series What\'s New, Scooby-Doo?, as well as guest-starring in a season five episode of Grounded For Life.Document 5 says that the television series, which aired on Disney, is a spin-off of the original film, Lilo & Stitch.Document 6 states that the first of these appearances was in Zoey 101. Cosgrove later guest starred on an episode of Unfabulous.Document 7 says that, however, Cosgrove was already in the works of starring in her own sitcom, titled iCarly, released on September 8, 2007.Therefore, Miranda Cosgrove had been featured in 8 TV shows by the year Despicable Me was released.\n\nAnswer: 8 TV shows.\n\nExample 4:\n\n"
        for idx, hit in enumerate(hits):
            raw_content = json.loads(hit.raw)
            paragraphs = break_text_into_sentences(raw_content["contents"])[:5]
            # Rerank the sentences of documents, since they are too long
            scores = t5_3b.rescore(query, paragraphs)
            sorted_paragraphs = sorted(paragraphs, key=lambda x: scores[paragraphs.index(x)], reverse=True)
            content = '...'.join(sorted_paragraphs[:2])
            prompt += f"Document {idx}: Title: {raw_content['title']}. Content: {content}\n\n"
        prompt += f"Question: {query}\n\nEvidence:"
        # Uses chain-of-thought to improve results
        evidence, _ = get_llm_response(prompt)
        prompt += f"{evidence}\n\nAnswer:"
        query_answer, _ = get_llm_response(prompt)
        # The final output is constructed based on previous answers
        aggregated_prompt += f"Question {q_idx}: {query}\nAnswer {q_idx}: {query_answer}\n"
    
    aggregated_prompt += "If it is impossible to answer, you should write \"None\".\nAnswer:"
    final_answer, _ = get_llm_response(aggregated_prompt)
    return final_answer.strip()

## Evaluation + metrics

In [None]:
def calculate_f1_bow(pred, gold):
    """
    Computes the f1-bow (bag of words) for the results.
    This is done by comparing the f1-score for the tokens in the predicted answer
    """
    if pred.lower().rstrip(".") == "none" and gold is None:
        return 1
    elif gold is None and pred.lower().rstrip(".") != "none":
        return 0
    else:
        tokens_pred = pred.split(" ")
        tokens_gold = gold.split(" ")
        precision = sum([1 for token in tokens_pred if token in tokens_gold]) / len(tokens_pred)
        recall = sum([1 for token in tokens_gold if token in tokens_pred]) / len(tokens_gold)
        if precision + recall != 0:
            return 2 * ((precision * recall) / (precision + recall))
        else:
            return 0

In [None]:
# evaluation_dataset: (tuple) => (query, gold_answer)
f1_list = []
for query, gold in evaluation_dataset:
    pred = get_aggregated_answer(query)
    f1 = calculate_f1_bow(pred, gold)
    f1_list.append(f1)

In [113]:
print("F1-BOW score:", (sum(f1_list) / len(f1_list)))

F1-BOW score: 0.3064516129032258
