## Exercício 9

Trade-offs de eficiência e qualidade

O objetivo do exercício desta semana é construir alguns pipelines de busca e analisá-los em termos das seguintes métricas:
* Qualidade dos resultados: nDCG@10;
* Latência (seg/query);
* USD por query assumindo utilização "perfeita": assim que terminou de processar uma query, já tem outra para ser processada;
* USD/mês para deixar o sistema rodando para poucos usuários (ex: 100 queries/dia);
* Custo de indexação em USD.

Iremos avaliar os pipelines no TREC-COVID.
A latência precisa ser menor que 2 segundos por query.
Não assumir processamento de queries em batch.

Considerar:
* 1,50 USD/hora por A100 ou 0,21 USD/hora por T4 ou 0,50 USD/hora por V100
* 0,03 USD/hora por CPU core
* 0,005 USD/hora por GB de CPU RAM

Dicas:
* Utilizar modelos de busca "SOTA" já treinados no MS MARCO como parte do pipeline, como o SPLADE distil (esparso), contriever (denso), Colbert-v2 (denso), miniLM (reranker), monoT5-3B (reranker), doc2query minus-minus (expansão de documentos + filtragem com reranqueador na etapa de indexação)
* Pode usar APIs como Cohere, OpenAI Embeddings

Variar parâmetros como número de documentos retornados em cada estágio. Por exemplo, BM25 retorna 1000 documentos, um modelo denso ou esparso pode reranqueá-los, e passar os top 50 para o miniLM/monoT5 fazer um ranqueamento final.


In [1]:
# Notebook dependencies, installed through shell commands (-q keeps pip quiet).
!pip install transformers -q
!pip install datasets -q
!pip install pyserini -q
!pip install faiss-gpu -q
!pip install torch -q
!pip install sentence-transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import random
import pandas as pd
import pickle
import os
import json
from pyserini.search.lucene import LuceneSearcher
from time import time
from sentence_transformers import InputExample
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from huggingface_hub import login
import collections

# Seed Python's random module with a fixed value.
random.seed(12)

# Hugging Face hub login is left disabled here.
# login()

In [17]:
### Used only to run on Google Colab
from google.colab import drive
drive.mount('/content/gdrive')

# Change the path to your drive
base_path = "gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio9"

# Timing measurements (in seconds) filled in by the indexing, search and
# reranking functions below.
data = {}

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
def get_passages():
    """Return the TREC-COVID passages keyed by document id.

    The mapping is cached as a pickle at ``{base_path}/data/passages.pickle``.
    The corpus is only loaded from the dataset hub when that cache file does
    not exist yet, so a cache hit costs nothing beyond reading the pickle.

    Returns:
        dict mapping each document ``_id`` to ``{"fulltext": title + " " + text}``.
    """
    pickle_file = f"{base_path}/data/passages.pickle"

    if os.path.isfile(pickle_file):
        # NOTE: this cache is a file the notebook wrote itself; do not point
        # it at a pickle coming from an untrusted source.
        with open(pickle_file, "rb") as f:
            return pickle.load(f)

    # Cache miss: only now pay for loading the corpus.
    passages_dataset = load_dataset("BeIR/trec-covid", "corpus")
    passages = {
        item["_id"]: {"fulltext": item["title"] + " " + item["text"]}
        for item in tqdm(passages_dataset["corpus"])
    }

    with open(pickle_file, "wb") as f:
        pickle.dump(passages, f)

    return passages

In [5]:
def load_corpus_trec_covid():
    """Load the BeIR TREC-COVID corpus into a pandas DataFrame.

    Returns:
        DataFrame with the columns ``_id``, ``title`` and ``text``, in that
        order, copied from the ``corpus`` split of the dataset.
    """
    corpus_split = load_dataset("BeIR/trec-covid", "corpus")['corpus']

    frame = pd.DataFrame()
    for column in ("_id", "title", "text"):
        frame[column] = corpus_split[column]

    return frame

In [6]:
def save_qrels():
    """Write the TREC-COVID relevance judgments as a tab-separated qrels file.

    Each output line is ``query_id<TAB>0<TAB>corpus_id<TAB>score``, taken from
    the ``test`` split of ``BeIR/trec-covid-qrels``.

    The file is written to ``{base_path}/data/qrels.tsv``, which is the path
    the evaluation cell passes to ``pyserini.eval.trec_eval``.
    """
    judgments = load_dataset("BeIR/trec-covid-qrels")['test']
    rows = zip(judgments["query-id"], judgments["corpus-id"], judgments["score"])

    # Written next to (not inside) data/collections: that directory is the
    # --input of the indexer and should hold only the corpus dump.
    with open(f"{base_path}/data/qrels.tsv", "w") as file:
        for query_id, corpus_id, score in rows:
            # The second column is the constant 0 of the qrels layout.
            file.write(f'{query_id}\t0\t{corpus_id.strip()}\t{score}\n')

In [7]:
def ranking_bm25():
    start = time()

    output_json = f"{base_path}/data/collections/corpus.jsonl"

    with open(output_json,'w') as file:
        for index, row in tqdm(df_corpus.iterrows(), total=df_corpus.shape[0]):
            output_dict = {
                'id': row["_id"],
                'contents': row["title"] + row["text"]
            }

            file.write(json.dumps(output_dict) + '\n')

    
    !python -m pyserini.index.lucene --collection JsonCollection --input {base_path}/data/collections --index indexes/lucene-index-trec-covid --generator DefaultLuceneDocumentGenerator --threads 9 --storePositions --storeDocvectors --storeRaw

    end = time()
    elapsed_time = round(end - start, 3)
    data["bm25"] = {
        "indexing_time": elapsed_time
    }

In [8]:
def search_with_bm25(query, k=1000):
    """Search the global Lucene ``searcher`` and return the hits for one query.

    Args:
        query: query text passed to ``searcher.search``.
        k: maximum number of hits requested from the searcher.

    Returns:
        ``collections.OrderedDict`` mapping ``docid`` to ``score``, in the
        order the searcher returned the hits.

    Side effect: adds the elapsed search time (seconds, rounded to 3 decimals)
    to ``data["bm25"]["queries_time"]``.
    """
    start = time()
    hits = collections.OrderedDict(
        (hit.docid, hit.score) for hit in searcher.search(query, k)
    )
    elapsed_time = round(time() - start, 3)

    # `data` may have been reset without re-running the indexing cell, so the
    # "bm25" entry is created on demand instead of assumed to exist.
    bm25_stats = data.setdefault("bm25", {})
    bm25_stats["queries_time"] = bm25_stats.get("queries_time", 0) + elapsed_time

    return hits

In [9]:
def reranking_cross_encoder(docs, max=1000, batch_size=500):
    """Score query/passage pairs with the global ``model`` and keep the best.

    Args:
        docs: list of dicts with the keys ``query_id``, ``passage_id``,
            ``query_text`` and ``passage_text``.
        max: number of top-scoring results to return.
        batch_size: number of pairs sent to ``model.predict`` per call.

    Returns:
        List of ``(query_id, passage_id, score)`` tuples sorted by descending
        score and truncated to ``max`` entries.

    Side effect: adds the elapsed time (seconds, rounded to 3 decimals) to
    ``data["cross_encoder"]["reranking"]``, creating the entry if needed.
    """
    started_at = time()
    ranked = []

    for offset in tqdm(range(0, len(docs), batch_size), leave=False):
        # Slicing past the end simply yields the shorter final chunk.
        chunk = docs[offset:offset + batch_size]

        pairs = [(entry['query_text'], entry['passage_text']) for entry in chunk]
        chunk_scores = model.predict(pairs)

        ranked.extend(
            (entry["query_id"], entry["passage_id"], score)
            for score, entry in zip(chunk_scores, chunk)
        )

    # Highest score first; ties keep their original relative order.
    ranked.sort(key=lambda triple: triple[2], reverse=True)
    elapsed_time = round(time() - started_at, 3)

    if "cross_encoder" not in data:
        data["cross_encoder"] = {"reranking": 0}
    data["cross_encoder"]["reranking"] += elapsed_time

    return ranked[:max]

In [10]:
def run_pipeline(bm25_k=100, rerank_k=100, batch_size=100):
    """Run BM25 retrieval followed by cross-encoder reranking for every topic.

    Topics are read from ``{base_path}/data/topics.tsv`` (tab-separated, no
    header, columns ``id`` and ``text``).

    Args:
        bm25_k: number of hits requested from BM25 in the first stage.
        rerank_k: number of reranked results kept per query.
        batch_size: batch size forwarded to ``reranking_cross_encoder``.

    Returns:
        List of ``(query_id, passage_id, score)`` tuples: the reranked results
        of each query, concatenated in topic order.
    """
    # get_passages() builds the pickle cache when it is missing, so the
    # pipeline does not depend on a previous manual run.
    passages = get_passages()

    docs_reranking = []
    col_names = ["id", "text"]
    df_queries = pd.read_csv(f'{base_path}/data/topics.tsv', sep='\t', names=col_names)

    for index, row in tqdm(df_queries.iterrows(), total=df_queries.shape[0]):
        query_id = row["id"]
        query_text = row["text"]

        # First stage
        hits = search_with_bm25(query_text, bm25_k)
        docs = [
            {
                "query_id": query_id,
                "passage_id": doc_id,
                "query_text": query_text,
                "passage_text": passages[doc_id]["fulltext"],
            }
            for doc_id in hits
        ]

        # Second stage
        docs_reranking.extend(reranking_cross_encoder(docs, rerank_k, batch_size))

    return docs_reranking

In [11]:
# Corpus as a DataFrame; ranking_bm25() reads this global.
df_corpus = load_corpus_trec_covid()

Downloading builder script:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading and preparing dataset trec-covid/corpus to /root/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599...


Downloading data:   0%|          | 0.00/73.5M [00:00<?, ?B/s]

Generating corpus split: 0 examples [00:00, ? examples/s]

Dataset trec-covid downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

## Pipeline Execution

* Indexing corpus with BM25
* Retrieving the top documents for each query with BM25
* Reranking with Cross Encoder

In [12]:
# Export the corpus and build the Lucene index; records data["bm25"]["indexing_time"].
ranking_bm25()

  0%|          | 0/171332 [00:00<?, ?it/s]

2023-05-10 17:36:07,861 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-05-10 17:36:07,863 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-05-10 17:36:07,864 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio9//data/collections
2023-05-10 17:36:07,864 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-05-10 17:36:07,864 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-05-10 17:36:07,865 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 9
2023-05-10 17:36:07,865 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-05-10 17:36:07,865 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-05-10 17:36:07,866 INFO  [mai

In [13]:
# Globals used by search_with_bm25() (searcher) and reranking_cross_encoder() (model).
searcher = LuceneSearcher("indexes/lucene-index-trec-covid")
model_id = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Short name used in the run file name and as the run tag.
model_name = "ms-marco-MiniLM-L-6-v2"
model = CrossEncoder(model_id, max_length=512, num_labels=1)
# Alternative model left disabled:
# model_id = "sentence-transformers/msmarco-bert-base-dot-v5"
# model = SentenceTransformer(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [14]:
# Run both stages for every topic; the result is a flat list of
# (query_id, passage_id, score) tuples.
docs_reranking = run_pipeline()

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
# Show the accumulated timing measurements.
print(json.dumps(data, indent=4))

{
    "bm25": {
        "indexing_time": 57.831,
        "queries_time": 1.1260000000000003
    },
    "cross_encoder": {
        "reranking": 15.955999999999994
    }
}


In [20]:
# Write the reranked results as a run file with one line per result:
#   query_id  Q0  passage_id  rank  score  run_tag
# docs_reranking holds the per-query result lists back to back, so the rank
# is counted per query instead of across the whole list.
query_ranks = collections.Counter()
with open(f"{base_path}/runs/run.trec-covid.bm25.{model_name}_reranked_st_bl.trec", "w") as f:
    for query_id, passage_id, score in docs_reranking:
        query_ranks[query_id] += 1
        f.write(f'{query_id}\tQ0\t{passage_id}\t{query_ranks[query_id]}\t{score}\t{model_name}_reranked\n')

In [28]:
# Evaluate the run file against the qrels with trec_eval (map, recip_rank, ndcg_cut.10).
!python -m pyserini.eval.trec_eval -c -m map -m recip_rank -m ndcg_cut.10 {base_path}/data/qrels.tsv {base_path}/runs/run.trec-covid.bm25.{model_name}_reranked_st_bl.trec

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'map', '-m', 'recip_rank', '-m', 'ndcg_cut.10', 'gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio9/data/qrels.tsv', 'gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio9/runs/run.trec-covid.bm25.ms-marco-MiniLM-L-6-v2_reranked_st_bl.trec']
Results:
map                   	all	0.0841
recip_rank            	all	0.8697
ndcg_cut_10           	all	0.7419
