# Exercício 2

Reranqueamento usando um modelo estilo-BERT com o treinamento no dataset do MS MARCO e avaliação no TREC-DL 2020.

O treinamento é igual ao de um classificador binário, que será feito por vocês.

O que muda é a forma de avaliação: reranqueadores precisam ser alimentados com documentos candidatos (ex: trazidos pelo BM25 - exercício aula 1)

**Sugestão:** usar este dataset reduzido do MS MARCO como treinamento, com 10k triplas (query, passagem relevante, passagem não-relevante):
https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv

**Sugestão:** usar miniLM (modelo BERT pequeno, 5x mais rapido) para começar o finetuning: https://huggingface.co/nreimers/MiniLM-L6-H384-uncased pois oferece um bom compromisso entre qualidade e velocidade.

**Sugestão:** Usar este notebook como base Análise de sentimentos (dataset IMDB) usando um modelo estilo BERT: https://colab.research.google.com/drive/10etP7Lb915EC-uEuf1IKC8DYkyg_om6-?usp=sharing

**Sugestão de debug:** Usar este minilm para ver se consegue ndcg ~0.70: https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2
Sugestão: fazer overfit em um batch: treinar por 200 epocas um unico batch, e ver se consegue loss=0, e accuracia=100%, ou ndcg=1


## Instalação de Pacotes

In [None]:
# !pip install transformers -q
# !pip install datasets -q
# !pip install pyserini -q
# !pip install faiss-cpu -q
# !pip install optuna -q

In [1]:
### Used only to run on Google Colab
# from google.colab import drive
# drive.mount('/content/gdrive')

# Change de path to your drive
# base_path = "gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio2/"
base_path = ""

In [2]:
import random
import torch
from torch import nn
from torch import optim
from torch.utils import data
import torch.nn.functional as F

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BatchEncoding
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from statistics import mean
import optuna

## Funções do projeto

In [3]:
def collate_pad_fn(batch):
    return BatchEncoding(tokenizer.pad(batch, return_tensors='pt'))

In [4]:
class Dataset(data.Dataset):
    def __init__(self, queries, passages, targets):
        self.__queries = queries
        self.__passages = passages
        self.__targets = targets
    
    def __len__(self):
        return len(self.__queries['input_ids'])
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.__queries['input_ids'][idx] + self.__passages['input_ids'][idx],
            'attention_mask': self.__queries['attention_mask'][idx] + self.__passages['attention_mask'][idx],
            'labels': int(self.__targets[idx]),
        }

In [5]:
def prepare_train_dataset(df_data, batch_size=32, max_length = 512) -> tuple:
    df_data_pos = pd.DataFrame()
    df_data_neg = pd.DataFrame()

    df_data_pos["query"] = df_data["query"].values
    df_data_pos["passage"] = df_data["passagem relevante"].values
    df_data_pos["score"] = 1

    df_data_neg["query"] = df_data["query"].values
    df_data_neg["passage"] = df_data["passagem não relevante"].values
    df_data_neg["score"] = 0

    df_data_merge = pd.concat([df_data_pos, df_data_neg], axis=0, ignore_index=True)

    df_train, df_val = train_test_split(
        df_data_merge,
        test_size=0.20,
#         stratify=df_data_merge["score"],
        random_state=42
    )

    train_queries = tokenizer(list(df_train["query"]), max_length=max_length, truncation=True)
    train_passages = tokenizer(list(df_train["passage"]), max_length=max_length, truncation=True)

    val_queries = tokenizer(list(df_val["query"]), max_length=max_length, truncation=True)
    val_passages = tokenizer(list(df_val["passage"]), max_length=max_length, truncation=True)

    train_dataset = Dataset(train_queries, train_passages, list(df_train["score"]))
    val_dataset = Dataset(val_queries, val_passages, list(df_val["score"]))

    train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_pad_fn)
    val_dataloader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_pad_fn)
    
    return train_dataloader, val_dataloader

In [6]:
# Função para medir acurácia e loss
def evaluate(model, dataloader, set_name):
    losses = []
    correct = 0
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False):
            outputs = model(**batch.to(device))
            loss_val = outputs.loss
            losses.append(loss_val.cpu().item())
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == batch['labels']).sum().item()

    print(f'{set_name} loss: {mean(losses):0.3f}; {set_name} accuracy: {correct / len(dataloader.dataset):0.3f}')

## Preparar os dados para o treinamento

In [7]:
colnames = ["query", "passagem relevante", "passagem não relevante"]
df_data = pd.read_csv(f"{base_path}data/msmarco_triples.train.tiny.tsv", encoding="UTF=8", sep="\t", names=colnames)

In [8]:
# model_id = 'microsoft/MiniLM-L12-H384-uncased'
# model_folder = "model/ranker"
# model_name = "bert"
# batch_size = 32
model_id = 'microsoft/MiniLM-L12-H384-uncased'
model_folder = "model_optuna/ranker"
model_name = "minilm_optuna"
batch_size = 32
# model_id = "roberta-base"
# model_folder = "roberta/ranker"
# model_name = "roberta"
# batch_size = 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)

In [9]:
def train(trial: optuna.Trial, n_epochs = 5):
    # drop = trial.suggest_loguniform('dropout', low=0.08, high= 0.20)
    # optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "AdamW"])
    lr = trial.suggest_uniform("lr", 2e-5, 5e-5)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_id
        # attention_probs_dropout_prob = drop,
        # hidden_dropout_prob = drop
    ).to(device)

    epochs = n_epochs
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    # optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    
    num_training_steps = epochs * len(train_dataloader)
    num_warmup_steps = int(num_training_steps * 0.1)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    best_loss = 1e100;

    # Training loop
    for epoch in tqdm(range(epochs), desc='Epochs'):
        model.train()
        train_losses = []
        for batch in tqdm(train_dataloader, mininterval=0.5, desc='Train', disable=False):
            optimizer.zero_grad()
            outputs = model(**batch.to(device))
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_losses.append(loss.cpu().item())

        train_loss = mean(train_losses)
        best_loss = min(train_loss, best_loss)
        print(f'Epoch: {epoch + 1} Training loss: {train_loss:0.2f}')
        evaluate(model=model, dataloader=val_dataloader, set_name='Valid')

    model.save_pretrained(f"{base_path}{model_folder}")
    tokenizer.save_pretrained(f"{base_path}{model_folder}")

    return best_loss

In [10]:
train_dataloader, val_dataloader = prepare_train_dataset(df_data)

In [11]:
study = optuna.create_study(study_name='hyper-parameter-search', direction='maximize')

[32m[I 2023-03-14 10:28:51,215][0m A new study created in memory with name: hyper-parameter-search[0m


In [12]:
# train(5)
study.optimize(func = train, n_trials=4)

  lr = trial.suggest_uniform("lr", 2e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Train:   0%|          | 0/550 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1 Training loss: 0.41


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.234; Valid accuracy: 0.909


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 2 Training loss: 0.21


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.202; Valid accuracy: 0.921


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 3 Training loss: 0.13


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.201; Valid accuracy: 0.928


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 4 Training loss: 0.07


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.227; Valid accuracy: 0.927


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 5 Training loss: 0.05


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.246; Valid accuracy: 0.931


[32m[I 2023-03-14 10:44:33,614][0m Trial 0 finished with value: 0.04609448052985086 and parameters: {'lr': 3.597982627877439e-05}. Best is trial 0 with value: 0.04609448052985086.[0m
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 1 Training loss: 0.42


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.243; Valid accuracy: 0.908


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 2 Training loss: 0.21


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.223; Valid accuracy: 0.917


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 3 Training loss: 0.15


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.216; Valid accuracy: 0.925


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 4 Training loss: 0.10


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.237; Valid accuracy: 0.925


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 5 Training loss: 0.07


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.240; Valid accuracy: 0.928


[32m[I 2023-03-14 10:59:30,957][0m Trial 1 finished with value: 0.06743159742348573 and parameters: {'lr': 2.5267344775681153e-05}. Best is trial 1 with value: 0.06743159742348573.[0m
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 1 Training loss: 0.41


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.240; Valid accuracy: 0.909


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 2 Training loss: 0.20


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.202; Valid accuracy: 0.922


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 3 Training loss: 0.13


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.229; Valid accuracy: 0.920


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 4 Training loss: 0.08


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.236; Valid accuracy: 0.929


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 5 Training loss: 0.05


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.231; Valid accuracy: 0.930


[32m[I 2023-03-14 11:14:27,929][0m Trial 2 finished with value: 0.04965376502453265 and parameters: {'lr': 3.665173419898367e-05}. Best is trial 1 with value: 0.06743159742348573.[0m
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 1 Training loss: 0.40


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.256; Valid accuracy: 0.905


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 2 Training loss: 0.20


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.204; Valid accuracy: 0.917


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 3 Training loss: 0.12


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.219; Valid accuracy: 0.923


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 4 Training loss: 0.07


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.237; Valid accuracy: 0.927


Train:   0%|          | 0/550 [00:00<?, ?it/s]

Epoch: 5 Training loss: 0.04


Valid:   0%|          | 0/138 [00:00<?, ?it/s]

Valid loss: 0.266; Valid accuracy: 0.928


[32m[I 2023-03-14 11:29:16,678][0m Trial 3 finished with value: 0.038258064545437016 and parameters: {'lr': 4.1330698575191135e-05}. Best is trial 1 with value: 0.06743159742348573.[0m


In [13]:
#obtendo os melhores valores gerados
print(study.best_value) 
print(study.best_params) 
print(study.best_trial)

0.06743159742348573
{'lr': 2.5267344775681153e-05}
FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.06743159742348573], datetime_start=datetime.datetime(2023, 3, 14, 10, 44, 33, 615984), datetime_complete=datetime.datetime(2023, 3, 14, 10, 59, 30, 957678), params={'lr': 2.5267344775681153e-05}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'lr': FloatDistribution(high=5e-05, log=False, low=2e-05, step=None)}, trial_id=1, value=None)


## Reranqueador

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_id = f"{base_path}{model_folder}"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)

In [15]:
def prepare_test_dataset():
    col_names = ["id", "text"]
    df_queries = pd.read_csv(f'{base_path}../Exercicio1/topics.dl20.small.tsv', sep='\t', names=col_names)
    df_passages = pd.read_csv(f'{base_path}../Exercicio1/data/collection.tsv', sep='\t', names=col_names)

    max_length = 512
    query_ids = []
    queries = []
    passage_ids = []
    passages = []
    num_lines = sum(1 for line in open(f'{base_path}data/run.dl20.bm25tuned.txt', 'r'))

    with open(f'{base_path}data/run.dl20.bm25tuned.txt') as f:
        for line in tqdm(f, total=num_lines):
            fields = line.strip().split('\t')
            query_id = fields[0]
            query_ids.append(query_id)
            query_text = df_queries[df_queries.iloc[:,0] == int(query_id)].iloc[0,1]
            queries.append(query_text)

            passage_id = fields[1]
            passage_ids.append(passage_id)
            passage_text = df_passages[df_passages.iloc[:,0] == int(passage_id)].iloc[0,1]
            passages.append(passage_text)

    tokenized_queries = tokenizer(queries, max_length=max_length, truncation=True)
    tokenized_passages = tokenizer(passages, max_length=max_length, truncation=True)
    
    test_dataset = Dataset(tokenized_queries, tokenized_passages, [1]*len(tokenized_queries['input_ids']))
    test_dataloader = data.DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_pad_fn)
    
    return test_dataloader, query_ids, passage_ids

In [16]:
def evaluate_test_dataset(model, dataloader, set_name):
    scores = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False):
            outputs = model(**batch.to(device))
            pos_score = torch.softmax(outputs.logits,1)[:,1]
            scores = scores + pos_score.tolist()
    return scores

In [17]:
test_dataloader, query_ids, passage_ids = prepare_test_dataset()

  0%|          | 0/54000 [00:00<?, ?it/s]

In [18]:
scores = evaluate_test_dataset(model=model, dataloader=test_dataloader, set_name='Test')

Test:   0%|          | 0/1688 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
results = []

for query, passage, score in zip(query_ids, passage_ids, scores):
    results.append((query, passage, score))

In [20]:
sorted_list = sorted(results, key=lambda x: int(x[0]) + x[2], reverse=True)

In [21]:
with open(f"{base_path}run.dl20.{model_name}_reranked.trec", "w") as f:
    for i, (query_id, passage_id, score) in enumerate(sorted_list):
        f.write(f'{query_id}\tQ0\t{passage_id}\t{i+1}\t{score}\t{model_name}_reranked\n')

In [22]:
!../Exercicio1/tools/anserini-tools-master/eval/trec_eval.9.0.4/trec_eval -c -m map -m ndcg_cut.10 -l 2 \
   {base_path}data/qrels.dl20-passage.trec {base_path}run.dl20.{model_name}_reranked.trec

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
map                   	all	0.4242
ndcg_cut_10           	all	0.6030
