# Text Retrieval with Multi-Stage Re-Ranking Models

In [1]:
import torch
print(torch.version.cuda)  # Check the CUDA version
print(torch.cuda.is_available())  # Check if CUDA is available

11.8
True


# Model

In [1]:
%%bash
cd /home/hoang/multi-stage-reranking/model/pre_trained_models
wget -q -O MiniLMv2-L6-H768-distilled-from-RoBERTa-Large.zip https://1ubnpq.bn.files.1drv.com/y4mIX6ParIAPno8mrrumh3CSQIi7cu5LzTBRWVS1jOO-2ddbEItW4EhjD_qg7R_KMjbekZcpfUHTLpwbOlv86gidJFbwMEkq4s8CDtNMDseDn1ebWmv5LDSUjXbEtg-a4DXlNKimn3hefuz6rewH199n8nGIxqtmPNHVzLwL052oq49bKW1rZv_yf2AWV6TgTP9CI2JWK9NwCyjIKQ__6AMow # MiniLM-L6-H768-distilled-from-RoBERTa-Large
unzip -q MiniLMv2-L6-H768-distilled-from-RoBERTa-Large.zip
rm MiniLMv2-L6-H768-distilled-from-RoBERTa-Large.zip
wget -q -O MiniLMv2-L6-H384-distilled-from-RoBERTa-Large.zip https://yeb8mw.bn.files.1drv.com/y4mbiQS6lq_n5yOdYnW5Bi7-Jw-yzU3p4WNPuIe4h1ejLyoDdfJTx9qOhHqowoH3zQ2FkmurdO1FY9igoXBv_s7yV2GcWSSnH-A4Gaa56_EoMM4FTDZY_x84k1lfOXekpEyjmVP49hCmo7D9agfuVpM5_TCKCIJKS9QVW5upX3RQ3cSjojOccfOtOl5iamlCpKTOwS94SZB7SuxcADsKvoGtQ # MiniLM-L6-H384-distilled-from-RoBERTa-Large
unzip -q MiniLMv2-L6-H384-distilled-from-RoBERTa-Large.zip
rm MiniLMv2-L6-H384-distilled-from-RoBERTa-Large.zip

In [2]:
!ls /home/hoang/multi-stage-reranking/model/pre_trained_models

MiniLM-L6-H384-distilled-from-RoBERTa-Large
MiniLM-L6-H768-distilled-from-RoBERTa-Large


# Dataset

In [5]:
import json

with open("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/legal_corpus_ms_marco.json", "r", encoding="utf-8") as f:
    legal_corpus = json.load(f)

new_legal_corpus = []
id = 0
for laws in legal_corpus:
    for article in laws["articles"]:
        new_legal_corpus.append({
            "_id": "corpus_"+str(id),
            "law_id": laws["law_id"],
            "article_id": article["article_id"],
            "title": article["title"],
            "text": article["text"]
        })
        id+=1
with open("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/corpus.jsonl", "w") as f:
    for law in new_legal_corpus:
        json_line = json.dumps(law, ensure_ascii=False)
        f.write(json_line + "\n")

In [6]:
new_legal_corpus[0]

{'_id': 'corpus_0',
 'law_id': '',
 'article_id': '1',
 'title': '',
 'text': "Domingo said that even if Holmes left Cruise to protect her daughter from the church, he likely won't give up on bringing her back to the religion. According to Scientology doctrine, Katie has denied Suri her spiritual eternity in the church. There's no chance for her now, Domingo explained to Vanity Fair. Tom Cruise admits Katie Holmes left to protect Suri from Scientology. Tom Cruise admits Scientology may have caused split with Katie Holmes-and ... Tom Cruise Admits Scientology Contributed to Katie Holmes Split."}

In [None]:
with open("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/train_retrieval_ms_marco.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/validation_retrieval_ms_marco.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)
with open("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/test_retrieval_ms_marco.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

queries = []
id = 0
for question in train_data["items"]+dev_data["items"]+test_data["items"]:
    queries.append({
        "_id": "query_"+str(id),
        "text": question["question_full"],
        "relevant_articles": question["relevant_articles"]
    })
    id+=1

with open("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/queries.jsonl", "w") as f:
    for question in queries:
        json_line = json.dumps(question, ensure_ascii=False)
        f.write(json_line + "\n")

In [3]:
queries[0]

{'_id': 'query_0',
 'text': 'what is rba',
 'relevant_articles': [{'law_id': '', 'article_id': '8274'}]}

In [7]:
import pandas as pd
import os

os.makedirs('/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/qrels', exist_ok=True)
train = {"query-id": [], "corpus-id": [], "score":[]}
dev = {"query-id": [], "corpus-id": [], "score":[]}
test = {"query-id": [], "corpus-id": [], "score":[]}

id = 0
for question in train_data["items"]:
    for law in question["relevant_articles"]:
        for corpus in new_legal_corpus:
            if law['law_id'] == corpus['law_id'] and law['article_id'] == corpus['article_id']:
                train["query-id"].append("query_"+str(id))
                train["corpus-id"].append(corpus["_id"])
                train["score"].append(1)
                break
    id+=1
train_df = pd.DataFrame(train)
train_df.to_csv("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/qrels/train.tsv", sep="\t", index=False)

for question in dev_data["items"]:
    for law in question["relevant_articles"]:
        for corpus in new_legal_corpus:
            if law['law_id'] == corpus['law_id'] and law['article_id'] == corpus['article_id']:
                dev["query-id"].append("query_"+str(id))
                dev["corpus-id"].append(corpus["_id"])
                dev["score"].append(1)
                break
    id+=1
dev_df = pd.DataFrame(dev)
dev_df.to_csv("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/qrels/dev.tsv", sep="\t", index=False)

for question in test_data["items"]:
    for law in question["relevant_articles"]:
        for corpus in new_legal_corpus:
            if law['law_id'] == corpus['law_id'] and law['article_id'] == corpus['article_id']:
                test["query-id"].append("query_"+str(id))
                test["corpus-id"].append(corpus["_id"])
                test["score"].append(1)
                break
    id+=1
test_df = pd.DataFrame(test)
test_df.to_csv("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/qrels/test.tsv", sep="\t", index=False)

In [8]:
for corpus in new_legal_corpus:
    del corpus["law_id"]
    del corpus["title"]
    del corpus["article_id"]

with open("/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco/corpus.jsonl", "w") as f:
    for law in new_legal_corpus:
        json_line = json.dumps(law, ensure_ascii=False)
        f.write(json_line + "\n")

In [9]:
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/original

138M	/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco
6.7M	/home/hoang/multi-stage-reranking/dataset/beir/original/12_7
144M	/home/hoang/multi-stage-reranking/dataset/beir/original


## Preprocess

In [10]:
%%bash
cd /home/hoang/multi-stage-reranking
python preprocess_beir.py \
--data_path dataset/beir/original/msmarco \
--output_data_path dataset/beir/processed/msmarco \
--model_name_or_path FacebookAI/roberta-large

100%|██████████| 133796/133796 [01:40<00:00, 1335.95it/s]
100%|██████████| 17132/17132 [00:02<00:00, 7793.99it/s]
16970it [00:00, 395416.48it/s]
1122it [00:00, 450853.52it/s]
975it [00:00, 410339.80it/s]


In [11]:
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/processed

280M	/home/hoang/multi-stage-reranking/dataset/beir/processed/msmarco
8.5M	/home/hoang/multi-stage-reranking/dataset/beir/processed/12_7
289M	/home/hoang/multi-stage-reranking/dataset/beir/processed


In [None]:
# !pip install pyserini==0.19.2 numpy==1.26.1 faiss-cpu==1.7.4

In [12]:
%%bash
cd /home/hoang/multi-stage-reranking
python preprocess_bm25.py \
--data_path dataset/beir/processed/msmarco \
--output_data_path dataset/beir/processed_bm25/msmarco \
--bm25_num_candidate 300

2024-11-05 04:25:48,194 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Setting log level to INFO
2024-11-05 04:25:48,198 INFO  [main] index.IndexCollection (IndexCollection.java:394) - Starting indexer...
2024-11-05 04:25:48,198 INFO  [main] index.IndexCollection (IndexCollection.java:396) - DocumentCollection path: dataset/beir/processed_bm25/msmarco/document_processed
2024-11-05 04:25:48,199 INFO  [main] index.IndexCollection (IndexCollection.java:397) - CollectionClass: JsonCollection
2024-11-05 04:25:48,199 INFO  [main] index.IndexCollection (IndexCollection.java:398) - Generator: DefaultLuceneDocumentGenerator
2024-11-05 04:25:48,199 INFO  [main] index.IndexCollection (IndexCollection.java:399) - Threads: 1
2024-11-05 04:25:48,200 INFO  [main] index.IndexCollection (IndexCollection.java:400) - Language: en
2024-11-05 04:25:48,200 INFO  [main] index.IndexCollection (IndexCollection.java:401) - Stemmer: porter
2024-11-05 04:25:48,200 INFO  [main] index.IndexCollecti

100%|██████████| 15270/15270 [03:15<00:00, 78.06it/s]
100%|██████████| 1000/1000 [00:12<00:00, 77.64it/s]
100%|██████████| 862/862 [00:10<00:00, 79.55it/s]


In [13]:
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/processed_bm25

214M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/msmarco
27M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/12_7
241M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25


# Training

## Normal (pointwise) LM

### seed=0

In [1]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/pre_trained_models/MiniLM-L6-H384-distilled-from-RoBERTa-Large \
--tokenizer_name_or_path FacebookAI/roberta-large \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--train_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/dev.json \
--output_dir ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 128 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
11/05/2024 07:26:06 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/msmarco/document.json', id2query_path='dataset/beir/processed/msmarco/query.json', train_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/dev.json', test_query2doc_path=None, source_block_size=128, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128', do_train=True, do_eval=False, do_test=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=True, ig

In [3]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
--tokenizer_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
--do_test \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--train_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/dev.json \
--output_dir ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 128 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
11/05/2024 07:42:39 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/msmarco/document.json', id2query_path='dataset/beir/processed/msmarco/query.json', train_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/dev.json', test_query2doc_path=None, source_block_size=128, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128', do_train=False, do_eval=False, do_test=True, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=True, ig

### seed=1

In [4]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/pre_trained_models/MiniLM-L6-H384-distilled-from-RoBERTa-Large \
--tokenizer_name_or_path FacebookAI/roberta-large \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--train_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/dev.json \
--output_dir ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 1 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 128 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
11/05/2024 07:43:19 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/msmarco/document.json', id2query_path='dataset/beir/processed/msmarco/query.json', train_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/dev.json', test_query2doc_path=None, source_block_size=128, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128', do_train=True, do_eval=False, do_test=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=1, data_seed=None, n_gpu=1, device='cuda', fp16=True, ig

In [5]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128 \
--tokenizer_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128 \
--do_test \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--train_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/dev.json \
--output_dir ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 128 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking


11/05/2024 07:57:05 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/msmarco/document.json', id2query_path='dataset/beir/processed/msmarco/query.json', train_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/dev.json', test_query2doc_path=None, source_block_size=128, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128', do_train=False, do_eval=False, do_test=True, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100, data_size=1000000

### seed=2

In [6]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/pre_trained_models/MiniLM-L6-H384-distilled-from-RoBERTa-Large \
--tokenizer_name_or_path FacebookAI/roberta-large \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--train_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/dev.json \
--output_dir ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 2 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 128 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
11/05/2024 07:57:34 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/msmarco/document.json', id2query_path='dataset/beir/processed/msmarco/query.json', train_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/dev.json', test_query2doc_path=None, source_block_size=128, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128', do_train=True, do_eval=False, do_test=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=2, data_seed=None, n_gpu=1, device='cuda', fp16=True, ig

In [7]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128 \
--tokenizer_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128 \
--do_test \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--train_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/dev.json \
--output_dir ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128 \
--num_train_epochs 10 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 128 --n_gpu 1 --device cuda --fp16

/home/hoang/multi-stage-reranking
11/05/2024 08:10:14 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/msmarco/document.json', id2query_path='dataset/beir/processed/msmarco/query.json', train_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/dev.json', test_query2doc_path=None, source_block_size=128, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128', do_train=False, do_eval=False, do_test=True, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=True, ig

## Larger LM

In [None]:
!nvidia-smi

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path FacebookAI/xlm-roberta-large \
--tokenizer_name_or_path FacebookAI/xlm-roberta-large \
--do_train \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--num_train_epochs 30 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path ./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--tokenizer_name_or_path ./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--do_test \
--task_type classification --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0 \
--num_train_epochs 30 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda --fp16

## Pairwise LM

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python train.py \
--model_name_or_path FacebookAI/xlm-roberta-base \
--tokenizer_name_or_path FacebookAI/xlm-roberta-base \
--do_train \
--task_type pairwise --negative_doc_cand_type all \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--train_query2doc_path dataset/beir/processed_bm25/12_7/qrels/train.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/dev.json \
--output_dir ./model/fine_tuned_models/xlm-roberta-base_12_7_pairwise_all_e30_ns1_lr5e-5_s0 \
--num_train_epochs 30 --learning_rate 5e-5 --seed 0 \
--per_device_train_batch_size 16 --per_device_eval_batch_size 16 \
--per_device_generate_batch_size 16 --total_batch_size 64 \
--source_block_size 512 --n_gpu 1 --device cuda

# Evaluation

## Only BM25

In [20]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--use_bm25

/home/hoang/multi-stage-reranking
100%|███████████████████████████████████████| 862/862 [00:00<00:00, 2155.88it/s]
Search time:0.41197872161865234
MRR@10: 0.3298
MAP@10: 0.3211
Recall@1: 0.1742		My_recall@1: 0.1879
Recall@3: 0.3817		My_recall@3: 0.3817
Recall@5: 0.5050		My_recall@5: 0.5050
Recall@10: 0.6651		My_recall@10: 0.6651
Recall@20: 0.7728		My_recall@20: 0.7728
Recall@50: 0.8659		My_recall@50: 0.8659
Recall@100: 0.9074		My_recall@100: 0.9074
Recall@200: 0.9495		My_recall@200: 0.9495


## Only Normal LM

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 200 \
--source_block_size 128 \
--bert_task_type classification \
--use_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 862/862 [11:52<00:00,  1.21it/s]
Search time:609.6778078079224
MRR@10: 0.0000
MAP@10: 0.0000
Recall@1: 0.0000                My_recall@1: 0.0000
Recall@3: 0.0000                My_recall@3: 0.0000
Recall@5: 0.0000                My_recall@5: 0.0000
Recall@10: 0.0000               My_recall@10: 0.0000
Recall@20: 0.0000               My_recall@20: 0.0000
Recall@50: 0.0000               My_recall@50: 0.0000
Recall@100: 0.0000              My_recall@100: 0.0000
Recall@200: 0.0000              My_recall@200: 0.0000


In [21]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 200 \
--source_block_size 512 \
--bert_task_type classification \
--use_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 862/862 [14:17<00:00,  1.01it/s]
Search time:857.1040811538696
MRR@10: 0.0000
MAP@10: 0.0000
Recall@1: 0.0000		My_recall@1: 0.0000
Recall@3: 0.0000		My_recall@3: 0.0000
Recall@5: 0.0000		My_recall@5: 0.0000
Recall@10: 0.0000		My_recall@10: 0.0000
Recall@20: 0.0000		My_recall@20: 0.0000
Recall@50: 0.0000		My_recall@50: 0.0000
Recall@100: 0.0000		My_recall@100: 0.0000
Recall@200: 0.0000		My_recall@200: 0.0000


## BM25 + Normal LM

In [1]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 200 \
--source_block_size 128 \
--bert_task_type classification \
--use_bm25 --use_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 862/862 [03:14<00:00,  4.43it/s]
Search time:194.71424746513367
MRR@10: 0.3505
MAP@10: 0.3392
Recall@1: 0.1793		My_recall@1: 0.1972
Recall@3: 0.4003		My_recall@3: 0.4004
Recall@5: 0.5394		My_recall@5: 0.5394
Recall@10: 0.7101		My_recall@10: 0.7101
Recall@20: 0.8207		My_recall@20: 0.8207
Recall@50: 0.8892		My_recall@50: 0.8892
Recall@100: 0.9356		My_recall@100: 0.9356
Recall@200: 0.9495		My_recall@200: 0.9495


In [22]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 200 \
--source_block_size 512 \
--bert_task_type classification \
--use_bm25 --use_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 862/862 [05:07<00:00,  2.80it/s]
Search time:307.8393383026123
MRR@10: 0.3463
MAP@10: 0.3360
Recall@1: 0.1734		My_recall@1: 0.1914
Recall@3: 0.4077		My_recall@3: 0.4078
Recall@5: 0.5378		My_recall@5: 0.5378
Recall@10: 0.7044		My_recall@10: 0.7044
Recall@20: 0.8020		My_recall@20: 0.8020
Recall@50: 0.9014		My_recall@50: 0.9014
Recall@100: 0.9345		My_recall@100: 0.9345
Recall@200: 0.9495		My_recall@200: 0.9495


## Normal LM + Ensemble

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 250 --second_bert_num_candidate 200 \
--source_block_size 128 --second_source_block_size 128 \
--bert_task_type classification --second_bert_task_type classification \
--use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
--second_model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128

100%|███████████████████████████████████████████| 862/862 [17:25<00:00,  1.21s/it]
Search time:1045.5415499210358
MRR@10: 0.0000
MAP@10: 0.0000
Recall@1: 0.0000                My_recall@1: 0.0000
Recall@3: 0.0000                My_recall@3: 0.0000
Recall@5: 0.0000                My_recall@5: 0.0000
Recall@10: 0.0000               My_recall@10: 0.0000
Recall@20: 0.0000               My_recall@20: 0.0000
Recall@50: 0.0000               My_recall@50: 0.0000
Recall@100: 0.0000              My_recall@100: 0.0000
Recall@200: 0.0000              My_recall@200: 0.0000


In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--   250 --second_bert_num_candidate 200 \
--source_block_size 512 --second_source_block_size 512 \
--bert_task_type classification --second_bert_task_type classification \
--use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2 \
--second_model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 862/862 [29:10<00:00,  2.03s/it]
Search time:1135.192467212677
MRR@10: 0.0000
MAP@10: 0.0000
Recall@1: 0.0000                My_recall@1: 0.0000
Recall@3: 0.0000                My_recall@3: 0.0000
Recall@5: 0.0000                My_recall@5: 0.0000
Recall@10: 0.0000               My_recall@10: 0.0000
Recall@20: 0.0000               My_recall@20: 0.0000
Recall@50: 0.0000               My_recall@50: 0.0000
Recall@100: 0.0000              My_recall@100: 0.0000
Recall@200: 0.0000              My_recall@200: 0.0000


## BM25 + Normal LM + Ensemble

In [2]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 250 --second_bert_num_candidate 200 \
--source_block_size 128 --second_source_block_size 128 \
--bert_task_type classification --second_bert_task_type classification \
--use_bm25 --use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
--second_model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0_128 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1_128 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2_128

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 862/862 [08:43<00:00,  1.65it/s]
Search time:523.6884527206421
MRR@10: 0.3673
MAP@10: 0.3564
Recall@1: 0.1995		My_recall@1: 0.2169
Recall@3: 0.4308		My_recall@3: 0.4308
Recall@5: 0.5552		My_recall@5: 0.5552
Recall@10: 0.7163		My_recall@10: 0.7163
Recall@20: 0.8142		My_recall@20: 0.8142
Recall@50: 0.8973		My_recall@50: 0.8973
Recall@100: 0.9356		My_recall@100: 0.9356
Recall@200: 0.9519		My_recall@200: 0.9519


In [3]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/msmarco/document.json \
--id2query_path dataset/beir/processed/msmarco/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 250 --second_bert_num_candidate 200 \
--source_block_size 512 --second_source_block_size 512 \
--bert_task_type classification --second_bert_task_type classification \
--use_bm25 --use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2 \
--second_model_name_or_path \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s1 \
./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s2

/home/hoang/multi-stage-reranking
100%|█████████████████████████████████████████| 862/862 [11:38<00:00,  1.23it/s]
Search time:698.4610295295715
MRR@10: 0.3633
MAP@10: 0.3551
Recall@1: 0.1978		My_recall@1: 0.2146
Recall@3: 0.4168		My_recall@3: 0.4169
Recall@5: 0.5510		My_recall@5: 0.5510
Recall@10: 0.7251		My_recall@10: 0.7251
Recall@20: 0.8192		My_recall@20: 0.8192
Recall@50: 0.8973		My_recall@50: 0.8973
Recall@100: 0.9321		My_recall@100: 0.9321
Recall@200: 0.9503		My_recall@200: 0.9503


## Only Larger LM

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 \
--source_block_size 512 \
--bert_task_type classification \
--use_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0

## BM25 + Larger LM

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 200 \
--source_block_size 512 \
--bert_task_type classification \
--use_bm25 --use_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0

## Normal LM + Larger LM

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 --second_bert_num_candidate 10 \
--source_block_size 512 --second_source_block_size 512 \
--bert_task_type classification --second_bert_task_type classification \
--use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--second_model_name_or_path \
./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0

## BM25 + Normal LM + Larger LM

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 --second_bert_num_candidate 50 \
--source_block_size 512 --second_source_block_size 512 \
--bert_task_type classification --second_bert_task_type classification \
--use_bm25 --use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--second_model_name_or_path \
./model/fine_tuned_models/xlm-roberta-large_12_7_classification_all_e10_ns1_lr5e-5_s0

## BM25 + Normal LM + Pairwise LM

In [None]:
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=0 python -u evaluate.py \
--id2doc_path dataset/beir/processed/12_7/document.json \
--id2query_path dataset/beir/processed/12_7/query.json \
--eval_query2doc_path dataset/beir/processed_bm25/12_7/qrels/test.json \
--batch_size 16 \
--bert_num_candidate 100 --second_bert_num_candidate 10 \
--source_block_size 512 --second_source_block_size 512 \
--bert_task_type classification --second_bert_task_type pairwise \
--use_bm25 --use_bert --use_second_bert \
--model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_classification_all_e10_ns1_lr5e-5_s1 \
--second_model_name_or_path \
./model/fine_tuned_models/xlm-roberta-base_12_7_pairwise_all_e30_ns1_lr5e-5_s0