## Exercicío 10

Implementar um pipeline multidoc QA: 

* Dado uma pergunta do usuário, buscamos em uma grande coleção as passagens mais relevantes e as enviamos para um sistema agregador, que irá gerar uma resposta final.
* Avaliar no dataset do IIRC
* Métrica principal: F1
* Limitar dataset de teste para 50 exemplos para economizar.
* Usar o gpt-3.5-turbo como modelo agregador. Usar vicuna-13B como alternativa open-source:
    * https://huggingface.co/helloollel/vicuna-13b 
    * https://chat.lmsys.org/ 
* Dicas:
    * Se inspirar no pipeline do Visconde: https://github.com/neuralmind-ai/visconde 

In [20]:
import json
from openai import InvalidRequestError
from pyserini.search import LuceneSearcher
from sentence_transformers import CrossEncoder
from tqdm.auto import tqdm
import re
import openai
import time
import json
import argparse
import collections
from collections import Counter
import numpy as np
import os
import string
import sys
import unicodedata

In [3]:
base_path = "."

### Indexing BM25

In [None]:
test_set = json.load(open(f"{base_path}/data/iirc_test.json", "r"))
context_articles = json.load(open(f"{base_path}/data/context_articles.json", "r"))

In [None]:
documents = []
all_titles = []

for item in tqdm(test_set):
    if item['title'].lower() not in all_titles:
        documents.append({
                "title": item['title'],
                "content": item["text"]
            }
        )
        all_titles.append(item['title'].lower())
        
    for link in item["links"]:
        if link['target'].lower() in context_articles and link['target'].lower() not in all_titles:
            documents.append({
                "title": link['target'],
                "content": context_articles[link['target'].lower()]
            })
            all_titles.append(link['target'].lower())

In [None]:
nlp = spacy.blank("en")
nlp.create_pipe("sentencizer")
nlp.add_pipe("sentencizer")

stride = 2
max_length = 3

def window(documents, stride=2, max_length=3):
    treated_documents = []

    for j,document in enumerate(tqdm(documents)):
        doc_text = document['content']
        doc = nlp(doc_text[:10000])
        sentences = [sent.text.strip() for sent in doc.sents]
        for i in range(0, len(sentences), stride):
            segment = ' '.join(sentences[i:i + max_length])
            treated_documents.append({
                "title": document['title'],
                "contents": document['title']+". "+segment,
                "segment": segment
            })
            if i + max_length >= len(sentences):
                break
    return treated_documents

treated_documents = window(documents)

In [None]:
f = open(f"{base_path}/data/iirc_indices/contents.jsonl",'w')

for i, doc in enumerate(treated_documents):
    doc['id'] = i
    if doc['segment'] != "":
        f.write(json.dumps(doc)+"\n")

In [None]:
!python3 -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator -threads 1 -input data/iirc_indices -index data/iirc_index -storeRaw

### Generate Evidences

In [4]:
test_set = json.load(open(f"{base_path}/data/samples_test.json"))

In [5]:
searcher = LuceneSearcher(f"{base_path}/data/iirc_index")
model_id = "cross-encoder/ms-marco-MiniLM-L-6-v2"
model = CrossEncoder(model_id, max_length=512, num_labels=1)

In [6]:
def generate_test_samples():
    test = json.load(open(f"{base_path}/data/iirc_test.json"))
    random_list = random.sample(range(0, len(test)), 50)
    samples = []
    all_q = []

    for item in test:
        for q in item['questions']:
            q['text'] = item['text']
            q['title'] = item['title']
            q['links'] = item['links']
            all_q.append(q)
        
    for i in random_list:
        samples.append(all_q[i])
    
    with open(f"{base_path}/data/samples_test.json", "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=4)
        
    return samples

In [7]:
def search_with_bm25(query, k=1000):
    return searcher.search(query, k)

In [8]:
def reranking_cross_encoder(docs, max=1000, batch_size=500):
    for i in tqdm(range(0, len(docs), batch_size), leave=False):  # tqdm(docs):
        i_end = i + batch_size
        i_end = len(docs) if i_end > len(docs) else i_end

        batch = docs[i:i_end]

        text_pairs = [(sample['question'], sample["title"] + " " + sample['text']) for sample in batch]
        predictions = model.predict(text_pairs)

        results = []

        for score, result in zip(predictions, batch):
            results.append((result, score))

    sorted_list = sorted(results, key=lambda x: x[1], reverse=True)

    return sorted_list[:max]

In [13]:
def generate_answer(item):
    temperature = 0
    attempts = 1
    pattern = "(?<=Answer:)(.*)$"

    item['responses'] = []
    item['completions'] = []
    for i in range(attempts):
        try:
            res = generate(item['system_prompt'], item['prompts'], temperature=temperature)
        except InvalidRequestError:
            # Reduces the number of prompts by removing the largest one
            print("Current number of prompts = ", len(item['prompts']))
            max_prompt = item['prompts'][0]
            max_len = len(item['prompts'][0])
            for i, prompt in enumerate(item['prompts']):
                if i != 0 and len(prompt) > max_len:
                    max_len = len(prompt)
                    max_prompt = prompt
            item['prompts'].remove(max_prompt)
            # Try again
            res = generate(item['system_prompt'], item['prompts'], temperature=temperature)

        if "Answer" not in res:
            item['results'] = res
            item['responses'].append(res)
            item['completions'].append(res)
            item["asked_twice"] = True
        else:
            matches = re.findall(pattern, res)
            if len(matches) > 0:
                response = matches[0]
                item['responses'].append(response)
            item['results'] = res
            item['completions'].append(res)
            item["asked_twice"] = False

    return item

In [10]:
def run_pipeline():
    test = []
    for q in tqdm(test_set):
        item_passage = ""
        for i, c in enumerate(q['context']):
            item_passage += "Document {0}: {1}\n\n".format(i + 1, c['text'])
        item_passage += "{0}".format(q['question'])

        # Fisrt stage
        hits = search_with_bm25(item_passage, 100)
        docs = []

        for hit in hits:
            hit = json.loads(hit.raw)
            doc = {
                "passage_id": hit["id"],
                "question": q["question"],
                "text": hit["contents"],
                "context": q["context"],
                "title": hit["title"],
                "text": hit["contents"],
                "answer": q["answer"]
            }
            docs.append(doc)

        # Second stage
        docs_reranking = reranking_cross_encoder(docs, 4, 100)

        system_prompt = "For each example, use the documents to create an \"Answer\" and an \"Evidence\" to the \"Question\". Answer \"not enough information\" when not enough information is provided in the documents.\n\n"
        prompts = []
        for i, hit_score in enumerate(docs_reranking):
            hit = hit_score[0]
            prompt = "Example {0}:\n\n".format(i + 1)
            for j, c in enumerate(hit['context']):
                if c['passage'] == "main":
                    text = "Title: {0}. Content: {1}".format(hit['title'], c['text'])
                else:
                    text = "Title: {0}. Content: {1}".format(c['passage'], c['text'])
                prompt += "Document {0}: {1}\n\n".format(j + 1, text)
            answer = hit['answer']

            prompt += "Question: Based on the above documents, {0}\n\nEvidence: {1}\n\nAnswer: {2}.\n\n".format(
                hit['question'], 'Not found.', answer)

            prompts.append(prompt)

        prompt = "Example {0}:\n\n".format(i + 2)

        text = "Title: {0}. Content: {1}".format(q['title'], q['text'])
        prompt += "Document {0}: {1}\n\n".format(i + 1, text)

        prompt += "Question: Based on the above documents, {0}\n\nEvidence:".format(q['question'])
        prompts.append(prompt)

        q['prompts'] = prompts
        q['system_prompt'] = system_prompt
        answers = []
        if q['answer']['type'] == "span":
            at = ", ".join([a['text'] for a in q['answer']["answer_spans"]])
            answers.append(at)
        elif q['answer']['type'] == "value":
            at = "{0} {1}".format(q['answer']['answer_value'], q['answer']['answer_unit'])
            answers.append(at)
        elif q['answer']['type'] == "binary":
            answers.append(q['answer']['answer_value'])
        elif q['answer']['type'] == "none":
            answers.append("Not enough information")
        q['clean_answers'] = answers

        q = generate_answer(q)

        test.append(q)

    json.dump(test, open(f"{base_path}/data/iirc.json", 'w'))

In [11]:
def generate(system_prompt, prompts, temperature=0):
    messages = [{"role": "system", "content": system_prompt}] + [{"role": "user", "content": prompt} for prompt in prompts]

    response = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo",
        messages = messages,
        temperature=temperature
    )

    return response["choices"][0]['message']['content']

In [17]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    def remove_accents(input_str):
        nfkd_form = unicodedata.normalize('NFKD', input_str)
        only_ascii = nfkd_form.encode('ASCII', 'ignore')
        return only_ascii.decode("utf-8")

    return white_space_fix(remove_articles(remove_punc(lower(remove_accents(s)))))

In [18]:
def get_tokens(s):
    if not s: return []
    return normalize_answer(s).split()

In [19]:
def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

In [21]:
def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [26]:
def evaluate():
    test = json.load(open(f"{base_path}/data/iirc.json"))
    f1s = []
    ems = []

    for item in tqdm(test):
        normalised = [normalize_answer(a.replace('\n','')) for a in item['responses']]
        print(normalised)
        c = Counter(normalised)
        if len(c.most_common(1)) > 0:
            response = c.most_common(1)[0][0]
            if "Not enough information provided in the documents." == item['clean_answers'][0]:
                item['clean_answers'][0] = "Not enough information"
            f1 = compute_f1(item['clean_answers'][0], response)
        else:
            f1 = 0
            
        f1s.append(f1)
        ems.append(compute_exact(item['clean_answers'][0], response))

    print("F1:",np.mean(f1s))
    print("EM:",np.mean(ems))

In [14]:
run_pipeline()

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [27]:
evaluate()

  0%|          | 0/50 [00:00<?, ?it/s]

['answervalue yes type binary']
['type span answerspans text louis bonaparte passage he rose to prominence under frenchdominated kingdom of holland being appointed minister of justice in 1806 by louis bonaparte type answer start 87 end 103 text napoleon passage and to council of state and as head of court of appeals in hague in 1810 by napoleon type answer start 105 end 114']
['type span answerspans text frank mcphee princetonn bernie flowers purduen eddie bell pennsylvanian tom stolhandske texasn tom scott virginian joe collier northwesternn buck martin georgia techn steve mellinger kentuckyn ed luke michigan staten harry babcock georgian passage 1952 college football allamerica team type answer start 1185 end 1459']
['type span answerspans text liguria passage university of genoa type answer start 247 end 254']
['answervalue 4 type value answerunit years']
['answervalue not enough information type text']
['not enough information']
['type none']
['type span answerspans text 4th panche