# Text Retrieval with Multi-Stage Re-Ranking Models

In [1]:
import torch
# Environment check: print the CUDA version torch reports, then whether
# torch can see a usable CUDA device.
cuda_version = torch.version.cuda
cuda_ready = torch.cuda.is_available()
print(cuda_version)
print(cuda_ready)

11.8
True


# Model

In [59]:
# Load model directly
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Both the tokenizer and the masked-LM model come from the same hub checkpoint.
checkpoint = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

# Sample sentence, tokenized into PyTorch tensors.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")

# Single forward pass; `output` is inspected in a later cell.
output = model(**encoded_input)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
# Print three length-related attributes of the tokenizer, one per line,
# in the order: overall maximum, single sentence, sentence pair.
for limit_name in ("model_max_length", "max_len_single_sentence", "max_len_sentences_pair"):
    print(getattr(tokenizer, limit_name))

512
510
508


In [None]:
# Encode the sample sentence to exactly 500 token ids: longer inputs are
# truncated, shorter ones are padded up to max_length.
encode_kwargs = dict(max_length=500, truncation=True, padding="max_length")
a = tokenizer.encode(text, **encode_kwargs)
a

In [None]:
output

MaskedLMOutput(loss=None, logits=tensor([[[ 6.4447e+01,  3.2508e-02,  3.7382e+01,  ...,  2.1459e+01,
           1.4222e+01,  1.8874e+01],
         [ 2.7079e+01, -1.3935e+00,  6.4573e+01,  ...,  4.0109e+01,
           1.6137e+01,  3.1009e+01],
         [ 1.9189e+01, -1.2440e+00,  4.8706e+01,  ...,  3.5705e+01,
           1.6987e+01,  2.7256e+01],
         ...,
         [ 2.2506e+01, -1.4501e+00,  5.0936e+01,  ...,  3.8371e+01,
           1.6350e+01,  2.7771e+01],
         [ 2.8184e+01, -1.2711e+00,  6.7431e+01,  ...,  4.4732e+01,
           1.7845e+01,  3.5088e+01],
         [ 4.4540e+01, -1.9992e-01,  4.9368e+01,  ...,  2.8129e+01,
           1.6683e+01,  2.3694e+01]]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

# Dataset

In [23]:
import json

# Directory holding the raw 12_7 legal dataset; this cell reads from and writes to it.
DATA_DIR = "/home/hoang/multi-stage-reranking/dataset/beir/original/12_7"

with open(f"{DATA_DIR}/legal_corpus_update.json", "r", encoding="utf-8") as f:
    legal_corpus = json.load(f)

# Flatten the law -> articles hierarchy into one record per article.
# Each record gets the running id "corpus_<n>", where n is its position in the list
# (this avoids a manual counter named `id`, which would shadow the builtin).
# law_id / article_id are kept on the record; the qrels cell matches on them.
new_legal_corpus = []
for laws in legal_corpus:
    for article in laws["articles"]:
        new_legal_corpus.append({
            "_id": f"corpus_{len(new_legal_corpus)}",
            "law_id": laws["law_id"],
            "article_id": article["article_id"],
            "title": article["title"],
            "text": article["text"],
        })

# The records contain non-ASCII text and are dumped with ensure_ascii=False, so the
# output encoding is pinned to UTF-8 (matching the read above) instead of relying on
# the platform default.
with open(f"{DATA_DIR}/corpus.jsonl", "w", encoding="utf-8") as f:
    for law in new_legal_corpus:
        f.write(json.dumps(law, ensure_ascii=False) + "\n")

In [24]:
new_legal_corpus[0]

{'_id': 'corpus_0',
 'law_id': '賃金の支払の確保等に関する法律/第一章\u3000総則',
 'article_id': '1',
 'title': '第一条\u3000目的',
 'text': 'この法律は、景気の変動、産業構造の変化その他の事情により企業経営が安定を欠くに至つた場合及び労働者が事業を退職する場合における賃金の支払等の適正化を図るため、貯蓄金の保全措置及び事業活動に著しい支障を生じたことにより賃金の支払を受けることが困難となつた労働者に対する保護措置その他賃金の支払の確保に関する措置を講じ、もつて労働者の生活の安定に資することを目的とする。'}

In [25]:
# Directory holding the raw 12_7 legal dataset; this cell reads from and writes to it.
DATA_DIR = "/home/hoang/multi-stage-reranking/dataset/beir/original/12_7"

with open(f"{DATA_DIR}/train_12x7_retrieval.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open(f"{DATA_DIR}/validation_12x7_retrieval.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)
with open(f"{DATA_DIR}/test_12x7_retrieval.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Query ids ("query_<n>") are numbered over train items first, then dev, then test.
# The qrels cell numbers its queries in the same order, so this ordering must not change.
all_items = train_data["items"] + dev_data["items"] + test_data["items"]
queries = [
    {
        "_id": f"query_{query_idx}",
        "text": question["question_full"],
        "relevant_articles": question["relevant_articles"],
    }
    for query_idx, question in enumerate(all_items)
]

# Queries contain non-ASCII text written with ensure_ascii=False, so pin the output
# encoding to UTF-8 rather than depending on the platform default.
with open(f"{DATA_DIR}/queries.jsonl", "w", encoding="utf-8") as f:
    for question in queries:
        f.write(json.dumps(question, ensure_ascii=False) + "\n")

In [26]:
queries[0]

{'_id': 'query_0',
 'text': '第１章\u3000総則\n（規則の遵守）\n第３条\u3000会社と社員は、ともに本規則を遵守し、相互に協力して社業の発展に努めなければならない。',
 'relevant_articles': [{'law_id': '労働基準法/第一章\u3000総則', 'article_id': '2'}]}

In [27]:
import pandas as pd
import os

# Directory holding the raw 12_7 legal dataset, and the qrels folder created inside it.
DATA_DIR = "/home/hoang/multi-stage-reranking/dataset/beir/original/12_7"
QRELS_DIR = os.path.join(DATA_DIR, "qrels")
os.makedirs(QRELS_DIR, exist_ok=True)


def _build_qrels(items, corpus_id_by_article, first_query_idx):
    """Build the qrels columns for one data split.

    Args:
        items: question dicts of the split; each has a "relevant_articles" list whose
            entries carry "law_id" and "article_id".
        corpus_id_by_article: mapping (law_id, article_id) -> corpus "_id".
        first_query_idx: index of the split's first question in the global query
            numbering, so ids line up with the "query_<n>" ids of the queries cell.

    Returns:
        dict with the parallel lists "query-id", "corpus-id" and "score"; every
        matched (query, article) pair contributes one row with score 1.
    """
    qrels = {"query-id": [], "corpus-id": [], "score": []}
    for query_idx, question in enumerate(items, start=first_query_idx):
        for law in question["relevant_articles"]:
            corpus_id = corpus_id_by_article.get((law["law_id"], law["article_id"]))
            if corpus_id is None:
                # Best effort, as before: an article absent from the corpus yields no row.
                continue
            qrels["query-id"].append(f"query_{query_idx}")
            qrels["corpus-id"].append(corpus_id)
            qrels["score"].append(1)
    return qrels


# One pass over the corpus replaces the per-article linear scan. setdefault keeps the
# FIRST record for a given (law_id, article_id), which is what the scan's `break` selected.
corpus_id_by_article = {}
for corpus in new_legal_corpus:
    corpus_id_by_article.setdefault((corpus["law_id"], corpus["article_id"]), corpus["_id"])

# Splits are numbered consecutively: train, then dev, then test.
n_train = len(train_data["items"])
n_dev = len(dev_data["items"])

train = _build_qrels(train_data["items"], corpus_id_by_article, 0)
train_df = pd.DataFrame(train)
train_df.to_csv(os.path.join(QRELS_DIR, "train.tsv"), sep="\t", index=False)

dev = _build_qrels(dev_data["items"], corpus_id_by_article, n_train)
dev_df = pd.DataFrame(dev)
dev_df.to_csv(os.path.join(QRELS_DIR, "dev.tsv"), sep="\t", index=False)

test = _build_qrels(test_data["items"], corpus_id_by_article, n_train + n_dev)
test_df = pd.DataFrame(test)
test_df.to_csv(os.path.join(QRELS_DIR, "test.tsv"), sep="\t", index=False)

In [28]:
# Build the final {"_id", "text"} records as a NEW list instead of deleting keys from
# new_legal_corpus in place. The in-place version could only run once per kernel
# (a second run raised KeyError on the deleted "law_id") and it also broke re-running
# the qrels cell, which matches on law_id / article_id.
# The text is the law id, the article title and the article body joined by newlines.
beir_corpus = [
    {
        "_id": corpus["_id"],
        "text": corpus["law_id"] + "\n" + corpus["title"] + "\n" + corpus["text"],
    }
    for corpus in new_legal_corpus
]

# Overwrites the corpus.jsonl written earlier; UTF-8 is pinned because the text is
# non-ASCII and dumped with ensure_ascii=False.
with open("/home/hoang/multi-stage-reranking/dataset/beir/original/12_7/corpus.jsonl", "w", encoding="utf-8") as f:
    for law in beir_corpus:
        f.write(json.dumps(law, ensure_ascii=False) + "\n")

In [29]:
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/original

47M	/home/hoang/multi-stage-reranking/dataset/beir/original/fiqa
8.0M	/home/hoang/multi-stage-reranking/dataset/beir/original/scifact
3.4G	/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco
2.1G	/home/hoang/multi-stage-reranking/dataset/beir/original/hotpotqa
6.7M	/home/hoang/multi-stage-reranking/dataset/beir/original/12_7
5.5G	/home/hoang/multi-stage-reranking/dataset/beir/original


## Preprocess

In [65]:
%%bash
# Run the project's preprocess_beir.py on the raw 12_7 data with the XLM-R base checkpoint;
# results go to dataset/beir/processed/12_7.
# NOTE(review): the recorded output contains a tokenizer warning (645 > 512 tokens), so at
# least one sequence exceeds the model's maximum length - confirm truncation happens
# downstream (the train/evaluate cells pass --source_block_size 512).
cd /home/hoang/multi-stage-reranking
python preprocess_beir.py \
--data_path dataset/beir/original/12_7 \
--output_data_path dataset/beir/processed/12_7 \
--model_name_or_path FacebookAI/xlm-roberta-base

  0%|          | 0/743 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 743/743 [00:00<00:00, 2552.17it/s]
100%|██████████| 3462/3462 [00:00<00:00, 7770.78it/s]
3459it [00:00, 272713.73it/s]
101it [00:00, 1173475.63it/s]
292it [00:00, 1574211.78it/s]


In [66]:
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/processed

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


157M	/home/hoang/multi-stage-reranking/dataset/beir/processed/fiqa
20M	/home/hoang/multi-stage-reranking/dataset/beir/processed/scifact
18G	/home/hoang/multi-stage-reranking/dataset/beir/processed/msmarco
9.9G	/home/hoang/multi-stage-reranking/dataset/beir/processed/hotpotqa
8.5M	/home/hoang/multi-stage-reranking/dataset/beir/processed/12_7
28G	/home/hoang/multi-stage-reranking/dataset/beir/processed


In [None]:
# !pip install pyserini==0.19.2 numpy==1.26.1 faiss-cpu==1.7.4

In [67]:
%%bash
# Run the project's preprocess_bm25.py on the processed 12_7 data; results go to
# dataset/beir/processed_bm25/12_7 (the recorded log shows a Lucene index being built).
# NOTE(review): the indexer log reports "Language: en" / "Stemmer: porter" while the corpus
# text is Japanese - confirm preprocess_bm25.py feeds suitably tokenized text or selects a
# Japanese analyzer, otherwise BM25 recall may be limited by the analyzer.
cd /home/hoang/multi-stage-reranking
python preprocess_bm25.py \
--data_path dataset/beir/processed/12_7 \
--output_data_path dataset/beir/processed_bm25/12_7

2024-10-31 03:46:20,817 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Setting log level to INFO
2024-10-31 03:46:20,818 INFO  [main] index.IndexCollection (IndexCollection.java:394) - Starting indexer...
2024-10-31 03:46:20,819 INFO  [main] index.IndexCollection (IndexCollection.java:396) - DocumentCollection path: dataset/beir/processed_bm25/12_7/document_processed
2024-10-31 03:46:20,819 INFO  [main] index.IndexCollection (IndexCollection.java:397) - CollectionClass: JsonCollection
2024-10-31 03:46:20,819 INFO  [main] index.IndexCollection (IndexCollection.java:398) - Generator: DefaultLuceneDocumentGenerator
2024-10-31 03:46:20,819 INFO  [main] index.IndexCollection (IndexCollection.java:399) - Threads: 1
2024-10-31 03:46:20,819 INFO  [main] index.IndexCollection (IndexCollection.java:400) - Language: en
2024-10-31 03:46:20,819 INFO  [main] index.IndexCollection (IndexCollection.java:401) - Stemmer: porter
2024-10-31 03:46:20,819 INFO  [main] index.IndexCollection 

100%|██████████| 3259/3259 [00:14<00:00, 224.30it/s]
100%|██████████| 73/73 [00:00<00:00, 266.22it/s]
100%|██████████| 130/130 [00:00<00:00, 216.33it/s]


In [68]:
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/processed_bm25

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


58M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/fiqa
9.6M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/scifact
4.3G	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/msmarco
2.0G	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/hotpotqa
9.9M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/12_7
6.3G	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25


# Training

## Normal (pointwise) LM

### seed=0

In [2]:
# Fine-tune FacebookAI/xlm-roberta-base with --task_type classification (seed 0, 10 epochs).
# The --output_dir below is the path the seed-0 test cell and the ensemble evaluation load from.
# NOTE(review): the logged Namespace shows gradient_accumulation_steps=1 together with
# total_batch_size=64 and a per-device batch of 16 on one GPU - confirm how train.py
# reconciles these values.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path FacebookAI/xlm-roberta-base \
--tokenizer_name_or_path FacebookAI/xlm-roberta-base \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
10/31/2024 03:55:23 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/12_7/document.json', id2query_path='dataset/beir/processed/12_7/query.json', train_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s0', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100, data_size=10000

In [3]:
# Run train.py with --do_test, loading model and tokenizer from the seed-0 fine-tuned directory.
# NOTE(review): --eval_query2doc_path points at dev.json here, whereas the seed=2 test cell
# passes test.json (and the logged test_query2doc_path is None) - confirm which split
# --do_test is meant to score so the three seeds are comparable.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--tokenizer_name_or_path ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--do_test \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
10/31/2024 06:38:39 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/12_7/document.json', id2query_path='dataset/beir/processed/12_7/query.json', train_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s0', do_train=False, do_eval=False, do_test=True, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100, 

### seed=1

In [3]:
# Same fine-tuning command as the seed=0 run; only --seed and the "_s1" suffix of
# --output_dir differ. This seed-1 directory is the first-stage re-ranker used in the
# evaluation cells.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path FacebookAI/xlm-roberta-base \
--tokenizer_name_or_path FacebookAI/xlm-roberta-base \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 1 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
10/31/2024 04:11:38 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/12_7/document.json', id2query_path='dataset/beir/processed/12_7/query.json', train_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=1, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100, data_size=10000

In [2]:
# Run train.py with --do_test, loading model and tokenizer from the seed-1 fine-tuned directory.
# NOTE(review): --eval_query2doc_path points at dev.json here, whereas the seed=2 test cell
# passes test.json - confirm which split --do_test is meant to score.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--tokenizer_name_or_path ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--do_test \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 1 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
10/31/2024 06:37:43 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/12_7/document.json', id2query_path='dataset/beir/processed/12_7/query.json', train_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1', do_train=False, do_eval=False, do_test=True, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=1, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100, 

### seed=2

In [4]:
# Same fine-tuning command as the seed=0 run; only --seed and the "_s2" suffix of
# --output_dir differ.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path FacebookAI/xlm-roberta-base \
--tokenizer_name_or_path FacebookAI/xlm-roberta-base \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s2 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 2 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
10/31/2024 04:22:43 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/12_7/document.json', id2query_path='dataset/beir/processed/12_7/query.json', train_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s2', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=2, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100, data_size=10000

In [1]:
# Run train.py with --do_test, loading model and tokenizer from the seed-2 fine-tuned directory.
# NOTE(review): this cell passes test.json as --eval_query2doc_path, while the seed=0 and
# seed=1 test cells pass dev.json - confirm which split is intended so results are comparable.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s2 \
--tokenizer_name_or_path ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s2 \
--do_test \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s2 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 2 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
10/31/2024 06:36:53 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/12_7/document.json', id2query_path='dataset/beir/processed/12_7/query.json', train_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/12_7/qrels/test.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s2', do_train=False, do_eval=False, do_test=True, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=2, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100,

## Larger LM

In [19]:
!nvidia-smi

Sun Nov  3 14:47:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:86:00.0 Off |                  Off |
| 30%   59C    P2             408W / 450W |  23432MiB / 24564MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:AF:00.0 Off |  

In [None]:
# Fine-tune FacebookAI/xlm-roberta-large with --task_type classification (seed 0).
# NOTE(review): --num_train_epochs is 30, but the --output_dir name still carries the
# "e10" tag used by the 10-epoch base runs (the pairwise run uses "e30" for 30 epochs).
# The "BM25 + Larger LM" evaluation cell loads this exact "e10" path, so any rename must
# be made in both places - confirm the intended epoch count / directory name.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path FacebookAI/xlm-roberta-large \
--tokenizer_name_or_path FacebookAI/xlm-roberta-large \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--num_train_epochs 30 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

11/03/2024 16:21:58 - INFO - __main__ -   ***** Eval results *****
11/03/2024 16:21:58 - INFO - __main__ -     accuracy = 0.915068493150685
11/03/2024 16:21:58 - INFO - __main__ -     f1 = 0.7633587786259542
11/03/2024 16:21:58 - INFO - __main__ -     loss = 0.4555688118967025
11/03/2024 16:21:58 - INFO - __main__ -     precision = 0.8620689655172413
11/03/2024 16:21:58 - INFO - __main__ -     recall = 0.684931506849315


## Pairwise LM

In [None]:
# Fine-tune FacebookAI/xlm-roberta-base with --task_type pairwise (seed 0, 30 epochs).
# The --output_dir below is the path the "BM25 + Normal LM + Pairwise LM" evaluation
# cell passes as --second_model_name_or_path.
# NOTE(review): unlike the classification training cells, this command omits --fp16 -
# confirm that full precision is intended for the pairwise run.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path FacebookAI/xlm-roberta-base \
--tokenizer_name_or_path FacebookAI/xlm-roberta-base \
--do_train \
--task_type pairwise --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_pairwise_all_e30_ns1_lr5e-5_s0 \
--num_train_epochs 30 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda

# Evaluation

## BM25 only

In [15]:
# Baseline: run evaluate.py on the test qrels with only --use_bm25 (no neural re-ranker).
# The recorded Recall@100 of this run (0.8308) is also the Recall@100 reported by every
# re-ranking run below.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--use_bm25

/home/hoang/multi-stage-reranking
100%|███████████████████████████████████████| 130/130 [00:00<00:00, 6091.40it/s]
Search time:0.02431035041809082
MRR@10: 0.3857
MAP@10: 0.3234
Recall@1: 0.2090		My_recall@1: 0.2615
Recall@3: 0.3635		My_recall@3: 0.3731
Recall@5: 0.4405		My_recall@5: 0.4447
Recall@10: 0.5254		My_recall@10: 0.5259
Recall@100: 0.8308		My_recall@100: 0.8308
Recall@200: 0.8308		My_recall@200: 0.8308


## BM25 + Normal LM

In [16]:
# Two-stage run: BM25 plus the seed-1 classification model (--use_bm25 --use_bert).
# --bert_num_candidate 100 presumably limits re-ranking to the top 100 BM25 hits -
# verify against evaluate.py.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 \
--source_block_size 512 \
--bert_task_type classification \
--use_bm25 --use_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1

/home/hoang/multi-stage-reranking


100%|█████████████████████████████████████████| 130/130 [00:42<00:00,  3.05it/s]
Search time:42.64936542510986
MRR@10: 0.6007
MAP@10: 0.5391
Recall@1: 0.3674		My_recall@1: 0.5000
Recall@3: 0.5797		My_recall@3: 0.5910
Recall@5: 0.6509		My_recall@5: 0.6526
Recall@10: 0.7561		My_recall@10: 0.7563
Recall@100: 0.8308		My_recall@100: 0.8308
Recall@200: 0.8308		My_recall@200: 0.8308


## BM25 + Normal LM + Ensemble

In [17]:
# Three-stage run: BM25, then the seed-1 classification model, then a second stage
# (--use_second_bert) that receives the three seed checkpoints (s0, s1, s2) with
# --second_bert_num_candidate 10.
# Presumably evaluate.py combines the three second-stage models as an ensemble over the
# top 10 first-stage candidates - verify against evaluate.py.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 --second_bert_num_candidate 10 \
--source_block_size 512 --second_source_block_size 512 \
--bert_task_type classification --second_bert_task_type classification \
--use_bm25 --use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--second_model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s0 \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s2

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 130/130 [00:55<00:00,  2.36it/s]
Search time:55.067251205444336
MRR@10: 0.6505
MAP@10: 0.5892
Recall@1: 0.4156		My_recall@1: 0.5538
Recall@3: 0.6276		My_recall@3: 0.6397
Recall@5: 0.7074		My_recall@5: 0.7091
Recall@10: 0.7561		My_recall@10: 0.7563
Recall@100: 0.8308		My_recall@100: 0.8308
Recall@200: 0.8308		My_recall@200: 0.8308


## BM25 + Larger LM

In [2]:
# Two-stage run: BM25 plus the fine-tuned xlm-roberta-large classification model.
# NOTE(review): the model directory is tagged "e10", but the "Larger LM" training cell
# that writes to this path uses --num_train_epochs 30 - the name does not reflect the
# actual training length.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 \
--source_block_size 512 \
--bert_task_type classification \
--use_bm25 --use_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 130/130 [03:29<00:00,  1.61s/it]
Search time:209.9298689365387
MRR@10: 0.7444
MAP@10: 0.6861
Recall@1: 0.5397		My_recall@1: 0.6923
Recall@3: 0.7017		My_recall@3: 0.7167
Recall@5: 0.7415		My_recall@5: 0.7432
Recall@10: 0.7850		My_recall@10: 0.7854
Recall@100: 0.8308		My_recall@100: 0.8308
Recall@200: 0.8308		My_recall@200: 0.8308


## BM25 + Normal LM + Pairwise LM

In [None]:
# Three-stage run: BM25, then the seed-1 classification model, then the pairwise model
# as second stage (--second_bert_task_type pairwise, --second_bert_num_candidate 10).
# This cell has no recorded output; it needs the checkpoint written by the "Pairwise LM"
# training cell.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 --second_bert_num_candidate 10 \
--source_block_size 512 --second_source_block_size 512 \
--bert_task_type classification --second_bert_task_type pairwise \
--use_bm25 --use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--second_model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_pairwise_all_e30_ns1_lr5e-5_s0