# Aula 7 - Solução dos exercícios

Implementar a fase de indexação e buscas de um modelo esparso

- Usar este modelo SPLADE já treinado naver/splade_v2_distil (do distilbert) ou splade-cocondenser-selfdistil (do BERT-base 110M params). Mais informações sobre os modelos estão neste artigo: https://arxiv.org/pdf/2205.04733.pdf
- Não é necessário treinar o modelo
- Avaliar nDCG@10 no TREC-COVID e comparar resultados com o BM25 e buscador denso da semana passada
- A dificuldade do exercício está em implementar a função de busca e ranqueamento usada pelo SPLADE. A implementação do índice invertido é apenas um "dicionário python".
- Comparar seus resultados com a busca "original" do SPLADE.
- Medir a latência (s/query).

In [None]:
# Notebook dependencies, installed quietly (-q) through IPython shell magics.
!pip install transformers -q
!pip install datasets -q
!pip install trectools -q
!pip install evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.2/224.2 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
### Used only to run on Google Colab
from google.colab import drive
# Mount Google Drive so the inverted index can be written to it later.
drive.mount('/content/gdrive')

# Change the path to point at your own Drive folder.
# NOTE(review): this path is relative, so it resolves against the current
# working directory (presumably /content on Colab) -- confirm before reuse.
base_path = "gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio7"

Mounted at /content/gdrive


In [None]:
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorWithPadding
from collections import defaultdict
from typing import List
from evaluate import load
import torch
from tqdm.auto import tqdm
import pickle

### SPLADE no TREC-COVID

In [None]:
# Hugging Face checkpoint of the SPLADE model; it is loaded once and used to
# encode both passages and queries.
model_id = "naver/splade-cocondenser-ensembledistil"
# model_id = "naver/splade_v2_distil"
# NOTE(review): max_length is not referenced by any function visible in this
# notebook; the tokenizer call only sets truncation=True.
max_length = 256 
# NOTE(review): the encoding functions below assign their own local
# batch_size, so this module-level value is not what they use.
batch_size = 32
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# trec_eval metric from the `evaluate` package, used at the end to score the run.
trec_eval = load("trec_eval")

Downloading builder script:   0%|          | 0.00/5.51k [00:00<?, ?B/s]

In [None]:
def preprocess(sample):
  """Merge a passage's title and body into one searchable string.

  Args:
    sample: mapping with string values under 'title' and 'text'.

  Returns:
    dict with 'complete_text' (title, one space, text) and 'len' (the
    character count of that merged string).
  """
  complete_text = ' '.join((sample['title'], sample['text']))
  return {
      'complete_text': complete_text,
      'len': len(complete_text),
  }

In [None]:
def load_datasets():
  """Download the BeIR TREC-COVID corpus, queries and relevance judgements.

  The corpus is additionally mapped through `preprocess`, which adds the
  'complete_text' and 'len' columns to every passage.

  Returns:
    Tuple (passages_dataset, queries_dataset, qrels_dataset).
  """
  corpus = load_dataset("BeIR/trec-covid", "corpus")
  queries = load_dataset("BeIR/trec-covid", "queries")
  qrels = load_dataset("BeIR/trec-covid-qrels")

  # `preprocess` already takes a single sample, so no lambda wrapper is needed.
  corpus = corpus.map(preprocess)

  return corpus, queries, qrels

In [None]:
@torch.no_grad()
def get_embeddings(model, tokenizer, texts: List[str]):
  """Encode a batch of texts into SPLADE sparse vectors.

  The texts are tokenized (padded to the longest item, truncated with the
  tokenizer's default limit) and run through the model. The logits are
  transformed with log(1 + relu(logits)), padded positions are zeroed with
  the attention mask, and the result is max-pooled over dim 1.

  Relies on the module-level `collator` and `device`.

  Args:
    model: model whose output exposes `.logits`; the logits are expected to
      be (batch, seq_len, vocab_size).
    tokenizer: tokenizer matching `model`.
    texts: batch of raw strings to encode.

  Returns:
    Tensor with one row per input text. The batch axis is always kept, so a
    single text yields a (1, vocab_size) tensor rather than a 1-D one.
  """
  tokens = tokenizer(texts,
                     return_tensors='pt',
                     padding=True,
                     truncation=True)
  tokens = collator(tokens)
  output = model(**tokens.to(device))

  # Zero out padded positions so they can never win the max-pooling below.
  token_weights = torch.log(
      1 + torch.relu(output.logits)
  ) * tokens.attention_mask.unsqueeze(-1)

  # Max over the sequence axis already yields (batch, vocab). No squeeze here:
  # it would drop the batch axis for a one-text batch and break callers that
  # index rows.
  sparse_vecs = torch.max(token_weights, dim=1).values

  return sparse_vecs

In [None]:
def compress_sparse_embeddings(sparse_embeddings):
  """Convert dense-stored sparse vectors into {index: weight} dicts.

  Args:
    sparse_embeddings: 2-D tensor with one embedding per row. A 1-D tensor
      is accepted as well and treated as a batch holding one embedding.

  Returns:
    List with one dict per row, mapping each non-zero column index to its
    value as a Python float. Keys appear in ascending column order; a row
    with no non-zero entries gives an empty dict.
  """
  # A single embedding may arrive without its batch axis; restore it so the
  # row/column indexing below is always valid.
  if sparse_embeddings.dim() == 1:
    sparse_embeddings = sparse_embeddings.unsqueeze(0)

  output = [{} for _ in range(sparse_embeddings.shape[0])]

  # nonzero() lists coordinates in row-major order, so one pass fills every
  # row's dict in ascending column order with a single device-to-host copy
  # per tensor instead of one per row.
  non_zeros = sparse_embeddings.nonzero()
  rows = non_zeros[:, 0]
  cols = non_zeros[:, 1]
  values = sparse_embeddings[rows, cols]

  for row, col, value in zip(rows.cpu().tolist(),
                             cols.cpu().tolist(),
                             values.cpu().tolist()):
    output[row][col] = value

  return output

In [None]:
def generate_sparse_passage_embeddings():
  """Encode every passage with SPLADE and build the inverted index.

  Reads the module-level `passages_dataset`, `model`, `tokenizer`,
  `idx2token` and `base_path`. Passages are encoded in batches of 32 under
  float16 autocast, and the finished index is pickled to
  `{base_path}/data/inverted_index.pickle`.

  Returns:
    dict mapping a token string to a list of (passage_id, weight) postings,
    in corpus order.
  """
  import os  # local import: only needed to create the output directory

  batch_size = 32
  inverted_index = {}
  corpus = passages_dataset['corpus']
  corpus_size = len(corpus)

  # Kudos to Marcos Piau/Gustavo
  with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=True):
    for start in tqdm(range(0, corpus_size, batch_size)):
      end = min(start + batch_size, corpus_size)

      batch = corpus[start:end]
      ids = batch['_id']
      texts = batch['complete_text']

      sparse_embeddings = get_embeddings(model, tokenizer, texts)
      compressed_embeddings = compress_sparse_embeddings(sparse_embeddings)

      for compressed_embedding, doc_id in zip(compressed_embeddings, ids):
        # Translate vocabulary indices to token strings while appending the
        # postings; no intermediate per-document dict is needed.
        for idx, weight in compressed_embedding.items():
          inverted_index.setdefault(idx2token[idx], []).append((doc_id, weight))

  # Make sure the target folder exists: failing on a missing directory here
  # would throw away the whole (long) encoding pass above.
  os.makedirs(f'{base_path}/data', exist_ok=True)
  with open(f'{base_path}/data/inverted_index.pickle', 'wb') as handle:
    pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

  return inverted_index

In [None]:
def generate_sparse_query_embeddings():
  """Encode every query with SPLADE into a {token: weight} mapping.

  Reads the module-level `queries_dataset`, `model`, `tokenizer` and
  `idx2token`. Queries are encoded in batches of 32 under float16 autocast.

  Returns:
    dict mapping each query id to a dict of token string -> weight.
  """
  batch_size = 32
  queries = queries_dataset['queries']
  total = len(queries)
  queries_compressed = {}

  with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=True):
    for start in tqdm(range(0, total, batch_size)):
      # Clamp the last batch to the end of the query set.
      stop = min(start + batch_size, total)
      batch = queries[start:stop]

      vectors = get_embeddings(model, tokenizer, batch['text'])
      per_query = compress_sparse_embeddings(vectors)

      for query_id, term_weights in zip(batch['_id'], per_query):
        # Swap vocabulary indices for their token strings.
        queries_compressed[query_id] = {
            idx2token[idx]: weight for idx, weight in term_weights.items()
        }

  return queries_compressed

In [None]:
def generate_scores():
  """Score every query against the inverted index.

  Reads the module-level `queries_compressed` and `inverted_index`. For each
  query token found in the index, the product of the query weight and the
  passage weight is added to that passage's running score.

  Returns:
    dict mapping query id -> dict of passage id -> score, with the inner
    dict ordered from highest to lowest score. A query with no token in the
    index has no entry.
  """
  accumulated = {}

  for query_id in tqdm(queries_compressed):
    for token, q_score in queries_compressed[query_id].items():
      postings = inverted_index.get(token)
      if postings is None:
        continue  # token never appears in any passage

      per_doc = accumulated.setdefault(query_id, {})
      for doc_id, d_score in postings:
        per_doc[doc_id] = per_doc.get(doc_id, 0) + q_score * d_score

  # Re-create each inner dict in descending score order; insertion order is
  # what later code relies on when it walks the ranking.
  return {
      query_id: dict(sorted(per_doc.items(), key=lambda pair: pair[1], reverse=True))
      for query_id, per_doc in accumulated.items()
  }

In [None]:
# Re-selects the device with the same expression used in the configuration cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and masked-LM weights for `model_id`; the model is moved
# to `device` once here so the encoding functions can reuse it.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id).to(device)

# Padding collator that get_embeddings applies to each tokenized batch.
collator = DataCollatorWithPadding(tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Reverse of the tokenizer vocabulary (token -> id), so that the non-zero
# dimensions of a sparse vector can be stored under their token string.
idx2token = {
    idx: token for token, idx in tokenizer.get_vocab().items()
}

In [None]:
# Download corpus, queries and qrels; the corpus also gets the 'complete_text' column.
passages_dataset, queries_dataset, qrels_dataset = load_datasets()

Downloading builder script:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading and preparing dataset trec-covid/corpus to /root/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599...


Downloading data:   0%|          | 0.00/73.5M [00:00<?, ?B/s]

Generating corpus split: 0 examples [00:00, ? examples/s]

Dataset trec-covid downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset trec-covid/queries to /root/.cache/huggingface/datasets/BeIR___trec-covid/queries/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599...


Downloading data:   0%|          | 0.00/4.70k [00:00<?, ?B/s]

Generating queries split: 0 examples [00:00, ? examples/s]

Dataset trec-covid downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___trec-covid/queries/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading and preparing dataset csv/BeIR--trec-covid-qrels to /root/.cache/huggingface/datasets/BeIR___csv/BeIR--trec-covid-qrels-1766e3af5b0b856a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/981k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___csv/BeIR--trec-covid-qrels-1766e3af5b0b856a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/171332 [00:00<?, ? examples/s]

In [None]:
# Encode the whole corpus and build the token -> [(passage_id, weight)] index (also pickled to disk).
inverted_index = generate_sparse_passage_embeddings()

  0%|          | 0/5355 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Encode every query into a {token: weight} dict keyed by query id.
queries_compressed = generate_sparse_query_embeddings()

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Rank passages for every query; each inner dict is ordered by descending score.
scores = generate_scores()

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Column-oriented copy of the relevance judgements: one list per column,
# passed as `references` to trec_eval.compute at the end.
qrels_format = defaultdict(list)

for query in qrels_dataset['test']:
  qrels_format['query'].append(query["query-id"])
  qrels_format['q0'].append("q0")  # constant placeholder column
  qrels_format['docid'].append(str(query["corpus-id"]))  # doc ids are stored as strings
  qrels_format['rel'].append(query["score"])

In [None]:
# Column-oriented run: one list per column, passed as `predictions` to
# trec_eval.compute at the end.
run_format = defaultdict(list)

for query_id in tqdm(scores, desc="Query"):
  rank = 1  # ranks are 1-based
  docs = scores[query_id]
  
  # `docs` was sorted by descending score in generate_scores, so iteration
  # order is the ranking order.
  for doc_id in docs:
    score = scores[query_id][doc_id]
    run_format['query'].append(query_id)
    run_format['q0'].append("q0")
    run_format['docid'].append(str(doc_id))
    run_format['rank'].append(rank)
    run_format['score'].append(score)
    run_format['system'].append("SPLADE")

    # Keep at most the top 1000 passages per query.
    if rank == 1000:
      break

    rank += 1

Query:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Score the SPLADE run against the qrels with the `evaluate` trec_eval metric.
results = trec_eval.compute(predictions=[run_format], references=[qrels_format])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topX[label] = topX["rel"] > 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  perfect_ranking["rank"] = 1


In [None]:
# nDCG@10 is the metric the exercise asks to report.
results['NDCG@10']

0.727808488025902