# Aula 6 - Enunciado

Iremos fazer finetuning de um buscador denso

Usar como treino o dataset "tiny" do MS MARCO
https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv

Avaliar o modelo no TREC-COVID, e comparar os resultados com o BM25 e doc2query

Comparar busca "exaustiva" (semelhança do vetor query com todos os vetores do corpus) com a busca aproximada (Approximate Nearest Neighbor - ANN)

Para a busca aproximada, usar os algoritmos existentes na biblioteca sentence-transformers (ex: hnswlib) OU implemente um você mesmo (Bonus!)

Dicas:

- Usar a média dos vetores da última camada (conhecida como mean pooling) do transformer para representar queries e passagens; alternativamente, usar apenas o vetor do [CLS] da última camada.

- Tente inicialmente uma loss fácil de implementar, como a entropia-cruzada

- Começar o treino a partir do microsoft/MiniLM-L12-H384-uncased

- Avaliar o pipeline usando um modelo já bem treinado: sentence-transformers/all-mpnet-base-v2

- Comparar resultados usando semelhança de coseno e produto escalar como funções de similaridade

- Para checar se seu código de avaliação está correto, comparar o seu desempenho com o do modelo já treinado no MS MARCO: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2; O nDCG@10 no TREC-COVID deve ser ~0.47

- Usar a biblioteca do sentence-transformers para avaliar o modelo



In [1]:
!pip install transformers -q
!pip install evaluate -q
!pip install ftfy -q
!pip install sentencepiece -q
!pip install pyserini -q
!pip install faiss-gpu -q

In [2]:
### Used only to run on Google Colab: mounts Google Drive at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

# Change this path to your own Drive folder. It is a relative path, and the
# data/, model/ and runs/ locations used later in the notebook are built
# from it.
base_path = "gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio6"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Checkpoint used to initialise both encoders and to load the tokenizer.
model_id = "microsoft/MiniLM-L12-H384-uncased"
# Truncation limit passed to the tokenizer as `max_length`.
max_length = 256 
# Number of examples per DataLoader batch.
batch_size = 32
# Number of passes over the training loaders in train().
epochs = 20
# Learning rate given to both AdamW optimizers.
lr = 2e-5

In [19]:
from transformers import (
    AutoModel,
    AutoTokenizer,
    BatchEncoding,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    AdamW
)

from torch.utils import data
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import ftfy
import torch
from tqdm.auto import tqdm
import json
import os

In [5]:
class Dataset(data.Dataset):
    """Map-style dataset that tokenizes texts lazily and memoizes the result.

    Each item is the tokenizer output for a single text, truncated to
    ``max_seq_length``.
    """

    def __init__(self, tokenizer, text, max_seq_length):
        """Keep the tokenizer, the indexable collection of texts and the truncation limit."""
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.text = text
        # str(idx) -> tokenizer output; filled the first time an index is read.
        self.cache = {}

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return the tokenized form of ``self.text[idx]``, tokenizing at most once."""
        key = str(idx)
        item = self.cache.get(key)
        # Compare against None rather than relying on truthiness: an empty
        # (falsy) tokenizer result is still a valid cached value and must not
        # trigger a re-tokenization on every access.
        if item is None:
            item = self.tokenizer(
                self.text[idx],
                padding=True,
                truncation=True,
                max_length=self.max_seq_length,
            )
            self.cache[key] = item

        return item

In [6]:
def collate_fn(batch):
    """Pad a list of tokenized examples with the global tokenizer and wrap the result.

    The padded batch is requested as PyTorch tensors (``return_tensors='pt'``)
    and returned as a ``BatchEncoding``.
    """
    padded = tokenizer.pad(batch, return_tensors='pt')
    return BatchEncoding(padded)

In [7]:
def create_dataloaders(tokenizer: AutoTokenizer):
    """Build the train/validation loaders for queries and relevant passages.

    Reads the MS MARCO triples TSV under ``base_path``, repairs the text of the
    ``relevant`` column with ftfy, and splits the rows 90/10 with a fixed seed.
    The ``not_relevant`` column is loaded but not used.

    Returns:
        A 4-tuple ``(queries_train, passages_train, queries_val, passages_val)``
        of DataLoaders. None of them shuffles, so batch i of a query loader and
        batch i of the matching passage loader come from the same rows.
    """
    triples = pd.read_csv(
        f"{base_path}/data/msmarco_triples.train.tiny.tsv",
        sep='\t',
        names=['query', 'relevant', 'not_relevant'],
        header=None,
    )
    triples['relevant'] = triples['relevant'].apply(ftfy.fix_text)

    train_part, val_part = train_test_split(triples, test_size=0.1, random_state=42)

    def build_loader(column):
        # One tokenizing dataset per text column, served in its original order.
        dataset = Dataset(tokenizer, column.tolist(), max_length)
        return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return (
        build_loader(train_part['query']),
        build_loader(train_part['relevant']),
        build_loader(val_part['query']),
        build_loader(val_part['relevant']),
    )

In [8]:
def compute_loss(query_model, tokenized_queries, passage_model, tokenized_passages):
    """Return the in-batch-negatives cross-entropy loss for paired queries and passages.

    Row i of ``tokenized_queries`` is assumed to be paired with row i of
    ``tokenized_passages``; every other passage in the batch acts as a
    negative for that query.

    Args:
        query_model: encoder called with the query batch; its output must
            expose ``last_hidden_state``.
        tokenized_queries: tokenized query batch, moved to the global ``device``.
        passage_model: encoder called with the passage batch.
        tokenized_passages: tokenized passage batch, moved to the global ``device``.

    Returns:
        A scalar tensor: the mean over queries of
        ``-log softmax(sim(q_i, .))[i]``, where ``sim`` is the dot product of
        the first-token vectors.
    """
    outputs_queries = query_model(**tokenized_queries.to(device))
    outputs_docs = passage_model(**tokenized_passages.to(device))

    # First-token ([CLS]) vector of the last layer represents each text.
    cls_queries = outputs_queries.last_hidden_state[:, 0, :]
    cls_docs = outputs_docs.last_hidden_state[:, 0, :]

    # similarity[i, j] = <query_i, passage_j>
    similarity = torch.matmul(cls_queries, cls_docs.transpose(0, 1))

    # The matching passage for query i sits on the diagonal, so the target
    # class of row i is i. cross_entropy applies a log-softmax internally,
    # which stays finite where an explicit exp(similarity) would overflow to
    # inf and turn the loss into NaN.
    targets = torch.arange(similarity.size(0), device=similarity.device)
    return torch.nn.functional.cross_entropy(similarity, targets)

In [9]:
def compute_loss_eval(query_model, dataloader_query, passage_model, dataloader_passage):
    """Return the mean per-batch loss over two loaders iterated in lockstep.

    Gradients are disabled while the batches are evaluated. The loaders are
    zipped, so iteration stops at the shorter one.
    """
    accumulated = 0
    seen_batches = 0

    with torch.no_grad():
        for queries_batch, passages_batch in zip(dataloader_query, dataloader_passage):
            batch_loss = compute_loss(query_model, queries_batch, passage_model, passages_batch)
            accumulated = accumulated + batch_loss
            seen_batches += 1

    return accumulated / seen_batches

In [10]:
def train():
    """Fine-tune the query and passage encoders and save them under ``base_path``.

    Uses the notebook-level globals ``query_model``, ``passage_model``, the
    four dataloaders, ``epochs`` and ``lr``. Each encoder has its own AdamW
    optimizer and its own cosine-with-hard-restarts schedule whose warmup
    covers the first 10% of the steps. After every epoch the loss is
    recomputed, in eval mode, on the full training and validation loaders and
    shown in the epoch progress bar. The two models are saved once, after the
    last epoch, to ``{base_path}/model/query/`` and ``{base_path}/model/doc/``.
    """
    optim_query = AdamW(query_model.parameters(), lr=lr)
    optim_passage = AdamW(passage_model.parameters(), lr=lr)

    steps_per_epoch = len(dataloader_queries_train)
    num_training_steps = epochs * steps_per_epoch
    num_warmup_steps = int(num_training_steps * 0.1)

    scheduler_query = get_cosine_with_hard_restarts_schedule_with_warmup(optim_query, num_warmup_steps, num_training_steps)
    scheduler_doc = get_cosine_with_hard_restarts_schedule_with_warmup(optim_passage, num_warmup_steps, num_training_steps)

    tqdm_epoch = tqdm(range(epochs))
    for epoch in tqdm_epoch:
        query_model.train()
        passage_model.train()

        # Stream the paired batches instead of materialising them with
        # list(zip(...)): that would tokenize and pad the whole epoch before
        # the first step and keep every batch alive until the epoch ends.
        # zip() has no len(), so the bar length is passed explicitly.
        tqdm_batches = tqdm(
            zip(dataloader_queries_train, dataloader_passages_train),
            total=steps_per_epoch,
            mininterval=0.5,
            desc='Train',
            disable=False,
        )
        for batch_query, batch_docs in tqdm_batches:
            optim_query.zero_grad()
            optim_passage.zero_grad()

            loss = compute_loss(query_model, batch_query, passage_model, batch_docs)
            loss.backward()

            optim_query.step()
            optim_passage.step()

            scheduler_query.step()
            scheduler_doc.step()

            tqdm_batches.set_description("Loss {:0.4f}".format(loss.item()))

        query_model.eval()
        passage_model.eval()

        train_loss = compute_loss_eval(query_model, dataloader_queries_train, passage_model, dataloader_passages_train)
        eval_loss = compute_loss_eval(query_model, dataloader_queries_val, passage_model, dataloader_passages_val)

        txt_train_loss = "Trainning Loss: {:0.4f}".format(train_loss)
        txt_eval_loss = "Validation Loss: {:0.4f}".format(eval_loss)

        msg = f"Epoch: {epoch} {txt_train_loss} {txt_eval_loss}"
        tqdm_epoch.set_description(msg)

    query_model.save_pretrained(f'{base_path}/model/query/')
    passage_model.save_pretrained(f'{base_path}/model/doc/')

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One shared tokenizer and two separate encoders (query side and passage
# side), both initialised from the same `model_id` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
query_model = AutoModel.from_pretrained(model_id).to(device)
passage_model = AutoModel.from_pretrained(model_id).to(device)

In [None]:
# Build the four loaders (train/validation x query/passage) that train() reads as globals.
dataloader_queries_train, dataloader_passages_train, dataloader_queries_val, dataloader_passages_val = create_dataloaders(tokenizer)

In [None]:
# Fine-tune both encoders; the resulting models are saved under {base_path}/model/.
train()



  0%|          | 0/20 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

Train:   0%|          | 0/310 [00:00<?, ?it/s]

### Avaliação no TREC-COVID

In [11]:
def load_queries_and_passages():
    """Load the evaluation queries and corpus from the JSONL files under ``base_path``.

    Reads ``{base_path}/data/queries.jsonl`` and ``{base_path}/data/corpus.jsonl``,
    one JSON object per line. Blank lines are skipped.

    Returns:
        A tuple ``(queries, passages)`` where ``queries`` is a list of
        ``{'id': ..., 'text': ...}`` dicts and ``passages`` is a list of
        ``(doc_id, "<title> <text>")`` tuples, both in file order.
    """
    queries = []
    with open(f"{base_path}/data/queries.jsonl", encoding="utf-8") as q:
        for line in q:
            if not line.strip():
                continue
            query = json.loads(line)
            queries.append({'id': query['_id'], 'text': query['text']})

    passages = []
    with open(f"{base_path}/data/corpus.jsonl", encoding="utf-8") as corpus:
        for line in corpus:
            if not line.strip():
                continue
            doc = json.loads(line)
            # Title and body are concatenated into the single text that gets encoded.
            passages.append((doc['_id'], f"{doc['title']} {doc['text']}"))

    return queries, passages

In [20]:
def generate_dense_matrix(passages):
    """Return the passage embedding matrix and the passage ids, in the same order.

    If the file named by the global ``file_dense_matrix`` exists, the matrix is
    loaded from it; otherwise every passage is encoded with ``passage_model``
    (first-token vector of the last layer) and the result is saved to that
    file. The cache is keyed only by the file path: delete the file after
    changing the model or the corpus.

    Args:
        passages: non-empty iterable of ``(doc_id, text)`` pairs.

    Returns:
        ``(dense_matrix, ids_passages)``: one row of ``dense_matrix`` per
        passage, and ``ids_passages`` as a tuple of ids.
    """
    ids_passages, text_passages = zip(*passages)

    if os.path.isfile(file_dense_matrix):
        # map_location places the cached tensor on the current device, so a
        # matrix saved from a GPU session still loads on a CPU-only runtime.
        dense_matrix = torch.load(file_dense_matrix, map_location=device)
        return dense_matrix, ids_passages

    text_passages = ["" if str(x) == "nan" else x for x in text_passages]

    dataset_passages = Dataset(tokenizer, text_passages, max_length)
    dataloader_passages = DataLoader(dataset_passages, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Collect the per-batch embeddings and concatenate once at the end;
    # concatenating inside the loop recopies the whole growing matrix on
    # every batch.
    chunks = []
    with torch.no_grad():
        for batch in tqdm(dataloader_passages):
            outputs_docs = passage_model(**batch.to(device))
            chunks.append(outputs_docs.last_hidden_state[:, 0, :])

    dense_matrix = torch.cat(chunks, dim=0)
    torch.save(dense_matrix, file_dense_matrix)

    return dense_matrix, ids_passages

In [13]:
def get_query_matrix(query):
    """Encode ``query`` with the global query model and return one embedding vector.

    The query is tokenized (truncated to ``max_length``), run through
    ``query_model`` without gradients, and the first-token vector of the first
    sequence in the batch is returned.
    """
    encoded = tokenizer(
        query,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    with torch.no_grad():
        hidden = query_model(**encoded.to(device)).last_hidden_state
        first_tokens = hidden[:, 0, :]
        return first_tokens[0]

In [14]:
def search(query, n=1000):
    """Exhaustive dense search: score every passage against ``query`` and keep the best ``n``.

    The score is the dot product between each row of the global
    ``dense_matrix`` and the query embedding.

    Args:
        query: query text.
        n: maximum number of hits; fewer are returned when the corpus is smaller.

    Returns:
        An iterator of ``(doc_id, score)`` pairs ordered by decreasing score,
        where ``score`` is a 0-d tensor.
    """
    query_matrix = get_query_matrix(query)
    score = torch.matmul(dense_matrix, query_matrix)

    # topk selects the n best entries without sorting the whole corpus; k is
    # clamped because topk rejects a k larger than the number of scores.
    k = max(0, min(n, score.size(0)))
    top_scores, top_indexes = torch.topk(score, k)

    # Convert the indexes to Python ints in one transfer rather than indexing
    # the id tuple with one 0-d tensor per hit.
    ids_docs = [ids_passages[i] for i in top_indexes.tolist()]

    return zip(ids_docs, top_scores)

In [15]:
def run_all_queries(file_name):
    """Search every query in the global ``queries`` and write a TREC-format run file.

    For each query the top 1000 hits from ``search`` are written, one per line,
    as ``<query_id> Q0 <doc_id> <rank> <score> dense_search`` with ranks
    starting at 1.

    Args:
        file_name: path of the run file; it is overwritten, and its parent
            directory is created when missing.
    """
    out_dir = os.path.dirname(file_name)
    if out_dir:
        # open(..., 'w') does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)

    with open(file_name, 'w') as file:
        for query in tqdm(queries):
            query_id = query['id']
            passages_score = search(query['text'], n=1000)

            for rank, (id_doc, score) in enumerate(passages_score, start=1):
                file.write('{} Q0 {} {} {:.6f} dense_search\n'.format(query_id, id_doc, rank, float(score)))

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Directories written by train() via save_pretrained.
saved_model_query = f"{base_path}/model/query/"
saved_model_doc = f"{base_path}/model/doc/"
# Cache file for the corpus embeddings produced by generate_dense_matrix.
file_dense_matrix = f"{base_path}/data/dense_matrix.pt"

# The tokenizer is loaded from the base checkpoint (`model_id`); the two
# fine-tuned encoders are loaded from the saved model directories.
tokenizer = AutoTokenizer.from_pretrained(model_id)
query_model = AutoModel.from_pretrained(saved_model_query).to(device)
passage_model = AutoModel.from_pretrained(saved_model_doc).to(device)

In [21]:
# Load the evaluation queries and the corpus passages from the JSONL files.
queries, passages = load_queries_and_passages()

In [22]:
# Encode the corpus (or load the cached matrix); search() reads both names as globals.
dense_matrix, ids_passages = generate_dense_matrix(passages)

In [23]:
# Run every query through the exhaustive search and write the TREC run file.
run_all_queries(f"{base_path}/runs/run-dense-search.txt")

  0%|          | 0/50 [00:00<?, ?it/s]

In [26]:
!python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap -mndcg_cut.10 -mrecip_rank.100 \
    {base_path}/data/test.tsv {base_path}/runs/run-dense-search.txt

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-mrecall.1000', '-mmap', '-mndcg_cut.10', '-mrecip_rank.100', 'gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio6/data/test.tsv', 'gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio6/runs/run-dense-search.txt']
Results:
map                   	all	0.0596
recip_rank            	all	0.5980
recall_1000           	all	0.1757
ndcg_cut_10           	all	0.3308
