# Exercício 8 - Reranking Sentence Transformers

## Instalação de Pacotes

In [None]:
# !pip install transformers -q
# !pip install datasets -q
# !pip install pyserini -q
# !pip install faiss-gpu -q
# !pip install torch -q
# !pip install sentence-transformers -q

In [1]:
### Used only to run on Google Colab
# from google.colab import drive
# drive.mount('/content/gdrive')

# Change the path to your Drive folder when running on Colab
# base_path = "gdrive/MyDrive/Colab_Notebooks/P_IA368DD_2023S1/Exercicio2/"
# Root directory used to build the f"{base_path}/..." paths for the data/,
# runs/ and model folders in the cells below.
base_path = "."

In [2]:
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from datasets import load_dataset
from sentence_transformers import CrossEncoder

from sklearn.model_selection import train_test_split
import pickle
import json
from tqdm.auto import tqdm
import math
from sklearn.utils import resample

In [3]:
# Seed Python's `random`, NumPy and PyTorch with the same value.
SEED = 12
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fcc87bd82f0>

## Funções do projeto

In [4]:
def prepare_train_dataset(batch_size=32) -> DataLoader:
    """Build a class-balanced, shuffling DataLoader of (query, passage) pairs.

    Reads ``{base_path}/data/data.csv`` (columns used: ``query``, ``passage``
    and ``score``), keeps every row whose score is 1.0 and draws, with
    replacement and a fixed ``random_state``, the same number of rows among
    those whose score is 0.0.

    Args:
        batch_size: Number of examples per batch produced by the DataLoader.

    Returns:
        A ``DataLoader`` (``shuffle=True``) over ``InputExample`` objects whose
        ``texts`` are ``[query, passage]`` and whose ``label`` is the score.

    Raises:
        ValueError: If the CSV has no row with score 1.0 or none with score 0.0,
            since a balanced training set cannot be built in that case.
    """
    df_data_merge = pd.read_csv(f"{base_path}/data/data.csv")
    df_data_merge['score'] = df_data_merge['score'].astype(float)
    positives = df_data_merge[df_data_merge["score"] == 1.0]
    negatives = df_data_merge[df_data_merge["score"] == 0.0]

    if positives.empty or negatives.empty:
        raise ValueError(
            "data.csv must contain rows with score 1.0 and rows with score 0.0 "
            f"(found {len(positives)} positives and {len(negatives)} negatives)"
        )

    # Sample as many negatives as there are positives so both classes weigh the same.
    negative_balance = resample(
        negatives,
        replace=True,
        n_samples=len(positives),
        random_state=42
    )

    df_data_balance = pd.concat([positives, negative_balance])

    # Iterate the three columns in lockstep instead of materialising one
    # Series per row with iterrows(); row order is unchanged.
    columns = zip(
        df_data_balance["query"],
        df_data_balance["passage"],
        df_data_balance["score"],
    )
    train_samples = [
        InputExample(texts=[query, passage], label=score)
        for query, passage, score in tqdm(columns, total=len(df_data_balance))
    ]

    return DataLoader(train_samples, shuffle=True, batch_size=batch_size)

In [5]:
def train(n_epochs = 5):
    """Fine-tune the notebook-level ``model`` and save it to disk.

    Relies on the notebook globals ``model``, ``train_dataloader``,
    ``base_path`` and ``model_folder``. The number of warm-up steps is 10% of
    the total number of training steps (batches per epoch times ``n_epochs``),
    rounded up.

    Args:
        n_epochs: Number of epochs passed to ``model.fit``.
    """
    total_steps = len(train_dataloader) * n_epochs
    n_warmup = math.ceil(total_steps * 0.1)
    destination = f"{base_path}/{model_folder}"

    fit_options = {
        "train_dataloader": train_dataloader,
        "epochs": n_epochs,
        "warmup_steps": n_warmup,
        "output_path": destination,
        "show_progress_bar": True,
    }
    model.fit(**fit_options)

    # Save explicitly once training has finished.
    model.save(destination)

In [6]:
def generate_run_bm25():
    trec_covid_qrels = load_dataset("BeIR/trec-covid-qrels")
    qrels = pd.DataFrame()
    qrels["query_id"] = trec_covid_qrels['test']["query-id"]
    qrels["corpus_id"] = trec_covid_qrels['test']["corpus-id"]
    qrels["score"] = trec_covid_qrels['test']["score"]

    with open(f"{base_path}/data/qrels.tsv", "w") as output:
        for line in qrels.values.tolist():
            query_id = line[0]
            corpus_id = line[1]
            score = line[2]
            output.write(f'{query_id}\t{0}\t{corpus_id.strip()}\t{score}\n')
    
    topics = {}
    with open(f"{base_path}/data/topics.tsv", "w") as output:
        with open(f"{base_path}/data/queries.jsonl", "r") as f:
            for line in f:
                data = json.loads(line)
                id = data['_id'].strip()
                text = data['text'].strip()
                topics[id] = text
                output.write(f'{id}\t{text}\n')
    
    !python -m pyserini.search.lucene \
      --index "beir-v1.0.0-trec-covid.flat" \
      --topics {base_path}/data/topics.tsv \
      --output runs/run.trec-covid.bm25.txt \
      --output-format msmarco \
      --hits 1000 \
      --bm25 --k1 0.82 --b 0.68

In [7]:
# Folder (relative to base_path) where train() saves the fine-tuned model.
model_folder = "model_st"
# Pretrained checkpoint passed to CrossEncoder as the starting point.
model_id = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Batch size passed to prepare_train_dataset().
batch_size = 24

In [8]:
# Load the pretrained cross-encoder with a single output label and a maximum length of 512.
model = CrossEncoder(model_id, max_length=512, num_labels=1)

In [9]:
# Build the DataLoader that train() reads from the notebook namespace.
train_dataloader = prepare_train_dataset(batch_size=batch_size)

  0%|          | 0/23060 [00:00<?, ?it/s]

In [10]:
# Fine-tune for 5 epochs and save the model.
train(5)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/961 [00:00<?, ?it/s]

Iteration:   0%|          | 0/961 [00:00<?, ?it/s]

Iteration:   0%|          | 0/961 [00:00<?, ?it/s]

Iteration:   0%|          | 0/961 [00:00<?, ?it/s]

Iteration:   0%|          | 0/961 [00:00<?, ?it/s]

## Reranqueador

In [None]:
# Write the qrels/topics files and produce the BM25 run that is reranked below.
generate_run_bm25()

In [None]:
!python -m pyserini.eval.trec_eval -c -m map -m recip_rank -m ndcg_cut.10 ./data/qrels.tsv ./runs/run.trec-covid.bm25.txt

In [4]:
model_folder = "model_st"
# Tag used in the reranked run's file name and in its last column.
model_name = "minilm_optim"
# Load the fine-tuned model from where train() saved it (base_path/model_folder),
# with the same maximum length used when the model was created for training.
model = CrossEncoder(f"{base_path}/{model_folder}", max_length=512)

In [5]:
def evaluate_test_dataset(model, batch_size=32):
    """Score every (query, passage) pair of the BM25 run with ``model``.

    Reads, relative to the notebook global ``base_path``:
      * ``data/topics.tsv``: tab-separated ``id``/``text`` lines with no header;
      * ``data/passages.pickle``: a mapping from passage id to a record with a
        ``"fulltext"`` entry;
      * ``runs/run.trec-covid.bm25.txt``: tab-separated lines whose first two
        fields are the query id and the passage id.

    Args:
        model: Object with a ``predict(pairs, batch_size=..., show_progress_bar=...)``
            method that returns one score per pair (e.g. a ``CrossEncoder``).
        batch_size: Number of pairs scored per ``model.predict`` batch.

    Returns:
        ``(scores, query_ids, passage_ids)``: three parallel lists in run-file
        order; the ids are the strings read from the run file.
    """
    col_names = ["id", "text"]
    df_queries = pd.read_csv(f'{base_path}/data/topics.tsv', sep='\t', names=col_names)
    # Build the id -> text lookup once instead of filtering the DataFrame for
    # every run line; setdefault keeps the first occurrence of a repeated id.
    query_text_by_id = {}
    for topic_id, topic_text in zip(df_queries["id"], df_queries["text"]):
        query_text_by_id.setdefault(topic_id, topic_text)

    pickle_file = f"{base_path}/data/passages.pickle"
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # passages.pickle that you generated yourself.
    with open(pickle_file, "rb") as f:
        passages = pickle.load(f)

    query_ids = []
    passage_ids = []
    text_pairs = []

    with open(f'{base_path}/runs/run.trec-covid.bm25.txt') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines instead of failing on int('')
            fields = line.split('\t')
            query_id = fields[0]
            passage_id = fields[1]

            query_ids.append(query_id)
            passage_ids.append(passage_id)
            text_pairs.append(
                (query_text_by_id[int(query_id)], passages[passage_id]["fulltext"])
            )

    if not text_pairs:
        return [], query_ids, passage_ids

    # Score all pairs in batches rather than one predict() call per pair.
    scores = list(
        model.predict(text_pairs, batch_size=batch_size, show_progress_bar=True)
    )

    return scores, query_ids, passage_ids

In [6]:
# Score every (query, passage) pair listed in the BM25 run with the loaded model.
scores, query_ids, passage_ids = evaluate_test_dataset(model=model)

  0%|          | 0/50000 [00:00<?, ?it/s]

In [7]:
# One (query_id, passage_id, score) tuple per scored pair, in run-file order.
results = list(zip(query_ids, passage_ids, scores))

In [8]:
# Sort by (query id, score), both descending, so each query's passages are
# contiguous and ordered best-first. Adding the score to the numeric query id
# (the previous key) only orders correctly while every score stays in [0, 1).
sorted_list = sorted(results, key=lambda x: (int(x[0]), x[2]), reverse=True)

In [9]:
# Write the reranked run as: query_id, Q0, passage_id, rank, score, tag.
# The rank is counted per query (restarting at 1 for each query id) instead of
# being a single counter running over the whole file.
rank_by_query = {}
with open(f"{base_path}/runs/run.trec-covid.bm25.{model_name}_reranked_st_bl.trec", "w") as f:
    for query_id, passage_id, score in sorted_list:
        rank = rank_by_query.get(query_id, 0) + 1
        rank_by_query[query_id] = rank
        f.write(f'{query_id}\tQ0\t{passage_id}\t{rank}\t{score}\t{model_name}_reranked\n')

In [10]:
!python -m pyserini.eval.trec_eval -c -m map -m recip_rank -m ndcg_cut.10 ./data/qrels.tsv ./runs/run.trec-covid.bm25.{model_name}_reranked_st_bl.trec

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /home/manoel/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/home/manoel/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/home/manoel/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'map', '-m', 'recip_rank', '-m', 'ndcg_cut.10', './data/qrels.tsv', './runs/run.trec-covid.bm25.minilm_optim_reranked_st_bl.trec']
Results:
map                   	all	0.2099
recip_rank            	all	0.8313
ndcg_cut_10           	all	0.6622


In [None]:
# Reload the training CSV with the score column cast to float.
df_data_merge = pd.read_csv(f"{base_path}/data/data.csv").astype({"score": float})