# Text Retrieval with Multi-Stage Re-Ranking Models

In [1]:
# Record the Python interpreter version used for this run.
!python -V

Python 3.10.15


In [None]:
# Fetch the project sources and make the checkout the working directory.
# NOTE(review): /home/hoang is a machine-specific absolute path; adjust it for your environment.
%cd /home/hoang
# Clone only when the checkout is missing, so re-running this cell does not fail with
# git's "destination path already exists and is not an empty directory" error.
![ -d multi-stage-reranking ] || git clone https://github.com/trunghoang2002/multi-stage-reranking.git
%cd multi-stage-reranking

# Requirements

In [None]:
# Environment setup reference. The commands below live inside a bare string literal,
# so running this cell installs nothing; run them manually in a shell instead.
# (The note "ko cần thiết" on the commented-out last command means "not necessary".)
'''
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
conda install conda-forge::transformers
conda install scikit-learn
pip install tqdm pytrec_eval
# conda install -c conda-forge huggingface_hub # ko cần thiết
'''

In [1]:
# Print the installed Java runtime version.
# NOTE(review): presumably required by the Java-based indexer whose log appears in the
# preprocessing output — confirm against preprocess_dataset.sh.
!java --version

openjdk 17.0.12 2024-07-16
OpenJDK Runtime Environment (build 17.0.12+7-Ubuntu-1ubuntu220.04)
OpenJDK 64-Bit Server VM (build 17.0.12+7-Ubuntu-1ubuntu220.04, mixed mode, sharing)


In [2]:
# Print the version of the system-wide CUDA compiler (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [1]:
# Show the GPUs on this machine together with their driver version, memory use and load.
!nvidia-smi

Wed Oct 23 02:22:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:86:00.0 Off |                  Off |
| 39%   63C    P2             343W / 450W |  24128MiB / 24564MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:AF:00.0 Off |  

In [4]:
import torch

# First line: the CUDA version reported by torch.version.cuda.
# Second line: whether torch.cuda.is_available() reports a usable CUDA device.
print(torch.version.cuda, torch.cuda.is_available(), sep="\n")

11.8
True


# Model

In [4]:
%%bash
# Download the two distilled MiniLM checkpoints into the pre_trained_models
# directory, unpack each archive, and delete the archive afterwards.
#
# Abort on the first failing command. Without this, a failed `cd` or a failed quiet
# `wget` goes unnoticed and the following unzip/rm steps still run against a
# missing or partial archive.
# NOTE(review): the download URLs look like generated OneDrive links and may expire — confirm.
set -euo pipefail
cd /home/hoang/multi-stage-reranking/model/pre_trained_models
wget -q -O MiniLMv2-L6-H768-distilled-from-RoBERTa-Large.zip https://1ubnpq.bn.files.1drv.com/y4mIX6ParIAPno8mrrumh3CSQIi7cu5LzTBRWVS1jOO-2ddbEItW4EhjD_qg7R_KMjbekZcpfUHTLpwbOlv86gidJFbwMEkq4s8CDtNMDseDn1ebWmv5LDSUjXbEtg-a4DXlNKimn3hefuz6rewH199n8nGIxqtmPNHVzLwL052oq49bKW1rZv_yf2AWV6TgTP9CI2JWK9NwCyjIKQ__6AMow # MiniLM-L6-H768-distilled-from-RoBERTa-Large
unzip -q MiniLMv2-L6-H768-distilled-from-RoBERTa-Large.zip
rm MiniLMv2-L6-H768-distilled-from-RoBERTa-Large.zip
wget -q -O MiniLMv2-L6-H384-distilled-from-RoBERTa-Large.zip https://yeb8mw.bn.files.1drv.com/y4mbiQS6lq_n5yOdYnW5Bi7-Jw-yzU3p4WNPuIe4h1ejLyoDdfJTx9qOhHqowoH3zQ2FkmurdO1FY9igoXBv_s7yV2GcWSSnH-A4Gaa56_EoMM4FTDZY_x84k1lfOXekpEyjmVP49hCmo7D9agfuVpM5_TCKCIJKS9QVW5upX3RQ3cSjojOccfOtOl5iamlCpKTOwS94SZB7SuxcADsKvoGtQ # MiniLM-L6-H384-distilled-from-RoBERTa-Large
unzip -q MiniLMv2-L6-H384-distilled-from-RoBERTa-Large.zip
rm MiniLMv2-L6-H384-distilled-from-RoBERTa-Large.zip

In [1]:
# List the contents of the pre-trained model directory to confirm the archives were unpacked.
!ls /home/hoang/multi-stage-reranking/model/pre_trained_models

MiniLM-L6-H384-distilled-from-RoBERTa-Large
MiniLM-L6-H768-distilled-from-RoBERTa-Large


# Dataset

## Download

In [3]:
%%bash
# Run the repository's download script once per dataset used in this notebook.
# The script is sourced (not executed), so it runs inside this same shell.
cd /home/hoang/multi-stage-reranking
for dataset in msmarco fiqa scifact hotpotqa; do
    source download_dataset.sh "$dataset"
done

In [5]:
# Human-readable disk usage of the downloaded datasets, one directory level deep.
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/original

47M	/home/hoang/multi-stage-reranking/dataset/beir/original/fiqa
8.0M	/home/hoang/multi-stage-reranking/dataset/beir/original/scifact
3.4G	/home/hoang/multi-stage-reranking/dataset/beir/original/msmarco
2.1G	/home/hoang/multi-stage-reranking/dataset/beir/original/hotpotqa
5.5G	/home/hoang/multi-stage-reranking/dataset/beir/original


## Preprocess

In [6]:
%%bash
# Run the repository's preprocessing script once per dataset, in the same order
# the datasets were downloaded. The script is sourced, so it runs inside this shell.
cd /home/hoang/multi-stage-reranking
for dataset in msmarco fiqa scifact hotpotqa; do
    source preprocess_dataset.sh "$dataset"
done

100%|██████████| 8841823/8841823 [2:57:57<00:00, 828.10it/s]   
100%|██████████| 509962/509962 [04:33<00:00, 1862.93it/s]
532751it [00:01, 308091.77it/s]
7437it [00:00, 399235.13it/s]
9260it [00:00, 668190.74it/s]


2024-10-09 06:20:19,742 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Setting log level to INFO
2024-10-09 06:20:19,744 INFO  [main] index.IndexCollection (IndexCollection.java:394) - Starting indexer...
2024-10-09 06:20:19,745 INFO  [main] index.IndexCollection (IndexCollection.java:396) - DocumentCollection path: dataset/beir/processed_bm25/msmarco/document_processed
2024-10-09 06:20:19,745 INFO  [main] index.IndexCollection (IndexCollection.java:397) - CollectionClass: JsonCollection
2024-10-09 06:20:19,745 INFO  [main] index.IndexCollection (IndexCollection.java:398) - Generator: DefaultLuceneDocumentGenerator
2024-10-09 06:20:19,746 INFO  [main] index.IndexCollection (IndexCollection.java:399) - Threads: 1
2024-10-09 06:20:19,746 INFO  [main] index.IndexCollection (IndexCollection.java:400) - Language: en
2024-10-09 06:20:19,746 INFO  [main] index.IndexCollection (IndexCollection.java:401) - Stemmer: porter
2024-10-09 06:20:19,747 INFO  [main] index.IndexCollecti

 61%|██████    | 307749/502939 [3:16:42<1:49:29, 29.71it/s] 

Error while terminating subprocess (pid=898409): 


In [6]:
# Human-readable disk usage of the `processed` dataset directory, one level deep.
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/processed

157M	/home/hoang/multi-stage-reranking/dataset/beir/processed/fiqa
20M	/home/hoang/multi-stage-reranking/dataset/beir/processed/scifact
18G	/home/hoang/multi-stage-reranking/dataset/beir/processed/msmarco
9.9G	/home/hoang/multi-stage-reranking/dataset/beir/processed/hotpotqa
28G	/home/hoang/multi-stage-reranking/dataset/beir/processed


In [1]:
# Human-readable disk usage of the `processed_bm25` dataset directory, one level deep.
!du -h -d 1 /home/hoang/multi-stage-reranking/dataset/beir/processed_bm25

58M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/fiqa
9.6M	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/scifact
4.3G	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/msmarco
2.0G	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25/hotpotqa
6.3G	/home/hoang/multi-stage-reranking/dataset/beir/processed_bm25


# Training

## Normal (pointwise) LM

### seed=0

In [2]:
# Run train.py with --do_train on MS MARCO: task type `classification`, seed 0,
# 2 epochs, --fp16 enabled, on the GPU selected by CUDA_VISIBLE_DEVICES=1.
# NOTE(review): --model_name_or_path, --tokenizer_name_or_path and --output_dir all point
# at the same fine-tuned directory, and that directory name says `e10` while
# --num_train_epochs is 2. This looks like an earlier run being continued in place —
# confirm; presumably the directory does not exist yet on a fresh checkout.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python train.py \
    --model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0 \
    --tokenizer_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0 \
    --do_train \
    --task_type classification \
    --negative_doc_cand_type all \
    --id2doc_path dataset/beir/processed/msmarco/document.json \
    --id2query_path dataset/beir/processed/msmarco/query.json \
    --train_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/train.json \
    --eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/dev.json \
    --output_dir ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0 \
    --num_train_epochs 2 \
    --learning_rate 5e-5 \
    --seed 0 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --per_device_generate_batch_size 16 \
    --total_batch_size 64 \
    --source_block_size 512 \
    --n_gpu 1 \
    --device cuda \
    --fp16

/home/hoang/multi-stage-reranking
10/23/2024 09:24:56 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/msmarco/document.json', id2query_path='dataset/beir/processed/msmarco/query.json', train_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/msmarco/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=2, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=True, ignore_index=-100, dat

In [5]:
# Run train.py with --do_train on FiQA, starting from the pre-trained MiniLM-L6-H384
# checkpoint with the FacebookAI/roberta-large tokenizer: task type `classification`,
# 10 epochs, seed 0. The result is written to the `..._s0` output directory.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python train.py \
    --model_name_or_path ./model/pre_trained_models/MiniLM-L6-H384-distilled-from-RoBERTa-Large \
    --tokenizer_name_or_path FacebookAI/roberta-large \
    --do_train \
    --task_type classification \
    --negative_doc_cand_type all \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --train_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/train.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/dev.json \
    --output_dir ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s0 \
    --num_train_epochs 10 \
    --learning_rate 5e-5 \
    --seed 0 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --per_device_generate_batch_size 16 \
    --total_batch_size 64 \
    --source_block_size 512 \
    --n_gpu 1 \
    --device cuda

/home/hoang/multi-stage-reranking
10/19/2024 09:39:41 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/fiqa/document.json', id2query_path='dataset/beir/processed/fiqa/query.json', train_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s0', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=False, ignore_index=-100, data_size=100000

### seed=1

In [6]:
# Same FiQA `classification` training run as the seed-0 configuration, but with
# --seed 1 and the `..._s1` output directory.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python train.py \
    --model_name_or_path ./model/pre_trained_models/MiniLM-L6-H384-distilled-from-RoBERTa-Large \
    --tokenizer_name_or_path FacebookAI/roberta-large \
    --do_train \
    --task_type classification \
    --negative_doc_cand_type all \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --train_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/train.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/dev.json \
    --output_dir ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s1 \
    --num_train_epochs 10 \
    --learning_rate 5e-5 \
    --seed 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --per_device_generate_batch_size 16 \
    --total_batch_size 64 \
    --source_block_size 512 \
    --n_gpu 1 \
    --device cuda

/home/hoang/multi-stage-reranking
10/19/2024 09:49:57 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/fiqa/document.json', id2query_path='dataset/beir/processed/fiqa/query.json', train_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s1', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=1, data_seed=None, n_gpu=1, device='cuda', fp16=False, ignore_index=-100, data_size=100000

### seed=2

In [7]:
# Same FiQA `classification` training run as the seed-0 configuration, but with
# --seed 2 and the `..._s2` output directory.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python train.py \
    --model_name_or_path ./model/pre_trained_models/MiniLM-L6-H384-distilled-from-RoBERTa-Large \
    --tokenizer_name_or_path FacebookAI/roberta-large \
    --do_train \
    --task_type classification \
    --negative_doc_cand_type all \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --train_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/train.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/dev.json \
    --output_dir ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s2 \
    --num_train_epochs 10 \
    --learning_rate 5e-5 \
    --seed 2 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --per_device_generate_batch_size 16 \
    --total_batch_size 64 \
    --source_block_size 512 \
    --n_gpu 1 \
    --device cuda

/home/hoang/multi-stage-reranking
10/19/2024 09:56:01 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/fiqa/document.json', id2query_path='dataset/beir/processed/fiqa/query.json', train_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s2', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=10, eval_freq=1, seed=2, data_seed=None, n_gpu=1, device='cuda', fp16=False, ignore_index=-100, data_size=100000

## Pairwise LM

In [8]:
# Run train.py with --do_train on FiQA using task type `pairwise`: 30 epochs, seed 0,
# starting from the pre-trained MiniLM-L6-H384 checkpoint with the
# FacebookAI/roberta-large tokenizer.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python train.py \
    --model_name_or_path ./model/pre_trained_models/MiniLM-L6-H384-distilled-from-RoBERTa-Large \
    --tokenizer_name_or_path FacebookAI/roberta-large \
    --do_train \
    --task_type pairwise \
    --negative_doc_cand_type all \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --train_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/train.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/dev.json \
    --output_dir ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_pairwise_all_e30_ns1_lr5e-5_s0 \
    --num_train_epochs 30 \
    --learning_rate 5e-5 \
    --seed 0 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --per_device_generate_batch_size 16 \
    --total_batch_size 64 \
    --source_block_size 512 \
    --n_gpu 1 \
    --device cuda

/home/hoang/multi-stage-reranking
10/19/2024 10:05:13 - INFO - __main__ -   Training/evaluation parameters Namespace(id2doc_path='dataset/beir/processed/fiqa/document.json', id2query_path='dataset/beir/processed/fiqa/query.json', train_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/train.json', eval_query2doc_path='dataset/beir/processed_bm25/fiqa/qrels/dev.json', test_query2doc_path=None, source_block_size=512, target_block_size=128, local_rank=-1, output_dir='./model/fine_tuned_models/MiniLM_L6_H384_fiqa_pairwise_all_e30_ns1_lr5e-5_s0', do_train=True, do_eval=False, do_generate=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_device_generate_batch_size=16, total_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=0.1, num_train_epochs=30, eval_freq=1, seed=0, data_seed=None, n_gpu=1, device='cuda', fp16=False, ignore_index=-100, data_size=100000000000

# Evaluation

## BM25 only

In [None]:
# Run evaluate.py on the FiQA test qrels with only --use_bm25 enabled (no re-ranking model).
# `python -u` keeps the script's output unbuffered.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python -u evaluate.py \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/test.json \
    --use_bm25

## BM25 + Normal LM

In [None]:
# Run evaluate.py on the MS MARCO test qrels with --use_bm25 and --use_bert, loading the
# MS MARCO `classification` checkpoint as the re-ranking model.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python -u evaluate.py \
    --id2doc_path dataset/beir/processed/msmarco/document.json \
    --id2query_path dataset/beir/processed/msmarco/query.json \
    --eval_query2doc_path dataset/beir/processed_bm25/msmarco/qrels/test.json \
    --batch_size 16 \
    --bert_num_candidate 100 \
    --source_block_size 512 \
    --bert_task_type classification \
    --use_bm25 \
    --use_bert \
    --model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0

In [None]:
# Run evaluate.py on the FiQA test qrels with --use_bm25 and --use_bert.
# NOTE(review): the dataset is FiQA but --model_name_or_path is the MS MARCO-trained
# checkpoint. Presumably this is an intentional cross-dataset (zero-shot) evaluation —
# confirm it is not a copy-paste leftover from the MS MARCO evaluation command.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python -u evaluate.py \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/test.json \
    --batch_size 16 \
    --bert_num_candidate 100 \
    --source_block_size 512 \
    --bert_task_type classification \
    --use_bm25 \
    --use_bert \
    --model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_msmarco_classification_all_e10_ns1_lr5e-5_s0

## BM25 + Normal LM + Ensemble

In [None]:
# Run evaluate.py on the FiQA test qrels with --use_bm25, --use_bert and --use_second_bert.
# The first model is the seed-0 FiQA `classification` checkpoint; the second stage is
# given three `classification` checkpoints (seeds 0, 1 and 2).
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python -u evaluate.py \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/test.json \
    --batch_size 16 \
    --bert_num_candidate 100 \
    --second_bert_num_candidate 10 \
    --source_block_size 512 \
    --second_source_block_size 512 \
    --bert_task_type classification \
    --second_bert_task_type classification \
    --use_bm25 \
    --use_bert \
    --use_second_bert \
    --model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s0 \
    --second_model_name_or_path \
        ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s0 \
        ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s1 \
        ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s2

## BM25 + Normal LM + Pairwise LM

In [None]:
# Run evaluate.py on the FiQA test qrels with --use_bm25, --use_bert and --use_second_bert.
# The first model is the seed-0 FiQA `classification` checkpoint; the second stage uses
# the FiQA `pairwise` checkpoint with --second_bert_task_type pairwise.
%cd /home/hoang/multi-stage-reranking
!CUDA_VISIBLE_DEVICES=1 python -u evaluate.py \
    --id2doc_path dataset/beir/processed/fiqa/document.json \
    --id2query_path dataset/beir/processed/fiqa/query.json \
    --eval_query2doc_path dataset/beir/processed_bm25/fiqa/qrels/test.json \
    --batch_size 16 \
    --bert_num_candidate 100 \
    --second_bert_num_candidate 10 \
    --source_block_size 512 \
    --second_source_block_size 512 \
    --bert_task_type classification \
    --second_bert_task_type pairwise \
    --use_bm25 \
    --use_bert \
    --use_second_bert \
    --model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_classification_all_e10_ns1_lr5e-5_s0 \
    --second_model_name_or_path ./model/fine_tuned_models/MiniLM_L6_H384_fiqa_pairwise_all_e30_ns1_lr5e-5_s0