# MonoBERT datasets compare

In [1]:
from src.datasets import MSMarcoDataset, PreProcessor
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import json
import time
import random
import pandas as pd
import os
from copy import deepcopy
import subprocess

# Metrics
from src.metrics import (
    mrr_score,
    map_score,
    mr_score,
    mf1_score,
    mndcg_score,
)

import pickle
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.rankers.ranker import Ranker
from src.datasets import MSMarcoDataset
from src.utils.cuda import get_device

In [2]:
index_folder = 'index'

In [3]:
# Load datasets

with open('data/temp_data/dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

with open('data/temp_data/dataset_lower.pkl', 'rb') as f:
    dataset_lower = pickle.load(f)

with open('data/temp_data/dataset_prepro.pkl', 'rb') as f:
    dataset_prepro = pickle.load(f)

docs_ids = list(dataset.documents.keys())

In [4]:
class MonoBERT(Ranker):

    def __init__(self, model_name: str, device: torch.device = None, use_amp: bool = False):
        self.device = get_device() if device is None else device
        self.use_amp = use_amp

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device).eval()

    def run(self, dataset: MSMarcoDataset, query_id: str, score_docs: list[tuple[str, float]], k: int = 10, **kwargs) -> list[tuple[str, float]]:
        query = dataset.queries[query_id]

        new_score_docs = []
        for doc_id, score in score_docs:
            inputs = self.tokenizer.encode_plus(
                query,
                dataset.documents[doc_id],
                max_length=512,
                truncation=True,
                return_token_type_ids=True,
                return_tensors="pt"
            )
            with torch.amp.autocast(enabled=self.use_amp, device_type=self.device.type):
                input_ids = inputs["input_ids"].to(self.device)
                token_type_ids = inputs["token_type_ids"].to(self.device)
                outputs = self.model(input_ids, token_type_ids=token_type_ids, return_dict=False)
                logits = outputs[0]

                if logits.size(1) > 1:
                    score = torch.nn.functional.log_softmax(logits, dim=1)[0, -1].item()
                else:
                    score = logits.item()
                
            new_score_docs.append((doc_id, score))
        
        return sorted(new_score_docs, key=lambda x: x[1], reverse=True)[:k]


In [5]:
class MonoBERT_pre_tokenization(Ranker):

    def __init__(self, model_name: str, device: torch.device = None, use_amp: bool = False):
        self.device = get_device() if device is None else device
        self.use_amp = use_amp

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device).eval()

        self.query_size = 64
        self.doc_size = 512 - self.query_size

        self.tokenized_docs = {}

    def run(self, dataset: MSMarcoDataset, query_id: str, score_docs: list[tuple[str, float]], k: int = 10, **kwargs) -> list[tuple[str, float]]:
        query = dataset.queries[query_id]

        new_score_docs = []
        for doc_id, score in score_docs:
            query_tokens = self.tokenizer.encode_plus(
                query,
                max_length=self.query_size,
                truncation=True,
                return_token_type_ids=False,
                return_tensors="pt"
            )["input_ids"].squeeze(0).to(self.device)

            input_ids = torch.cat([query_tokens, self.tokenized_docs[doc_id].to(self.device)], dim=0).unsqueeze(0)

            with torch.amp.autocast(enabled=self.use_amp, device_type=self.device.type):
                outputs = self.model(input_ids, return_dict=False)
                logits = outputs[0]

            if logits.size(1) > 1:
                score = torch.nn.functional.log_softmax(logits, dim=1)[0, -1].item()
            else:
                score = logits.item()
                
            new_score_docs.append((doc_id, score))
        
        return sorted(new_score_docs, key=lambda x: x[1], reverse=True)[:k]
    
    def pre_tokenize_docs(self, dataset: MSMarcoDataset):
        for doc_id, doc in tqdm(dataset.documents.items(), desc="Pre-tokenizing documents"):
            doc_tokens = self.tokenizer.encode_plus(
                dataset.documents[doc_id],
                max_length=self.doc_size,
                truncation=True,
                return_token_type_ids=False,
                return_tensors="pt"
            )
            self.tokenized_docs[doc_id] = doc_tokens["input_ids"].squeeze(0).to('cpu')


In [6]:
monobert = MonoBERT("castorini/monobert-large-msmarco")

monobert_pre_tokenization = MonoBERT_pre_tokenization("castorini/monobert-large-msmarco")
monobert_pre_tokenization_lower = MonoBERT_pre_tokenization("castorini/monobert-large-msmarco")
monobert_pre_tokenization_prepro = MonoBERT_pre_tokenization("castorini/monobert-large-msmarco")

In [7]:
monobert_pre_tokenization.pre_tokenize_docs(dataset)
monobert_pre_tokenization_lower.pre_tokenize_docs(dataset_lower)
monobert_pre_tokenization_prepro.pre_tokenize_docs(dataset_prepro)

Pre-tokenizing documents: 100%|██████████| 277168/277168 [03:50<00:00, 1203.61it/s]
Pre-tokenizing documents: 100%|██████████| 277168/277168 [03:48<00:00, 1212.23it/s]
Pre-tokenizing documents: 100%|██████████| 277168/277168 [03:11<00:00, 1451.10it/s]


In [8]:
results_bm25_folder = os.path.join('results', 'raw', 'bm25')
score_docs_bm25 = {}
for dataset_name in ['dataset', 'dataset_lower', 'dataset_prepro']:
    with open(os.path.join(results_bm25_folder, f'{dataset_name}_pyserini.json'), 'r') as f:
        score_docs_bm25[dataset_name] = json.load(f)['score_docs']

In [9]:
results_score_docs = {}
results_times = {}

k = 100

for dataset_name in ['dataset', 'dataset_lower', 'dataset_prepro']:
    score_docs = score_docs_bm25[dataset_name]
    results_score_docs[dataset_name] = {
        'monobert': {},
        'monobert_pre_tokenization': {},
    }
    results_times[dataset_name] = {
        'monobert': [],
        'monobert_pre_tokenization': [],
    }

    for query_id, score_docs_query in tqdm(score_docs.items(), desc=f"Ranking {dataset_name}"):

        # MonoBERT
        start_time = time.time()
        new_score_docs = monobert.run(dataset, query_id, score_docs_query, k=k)
        end_time = time.time()
        results_score_docs[dataset_name]['monobert'][query_id] = new_score_docs
        results_times[dataset_name]['monobert'].append(end_time - start_time)

        # MonoBERT pre-tokenization
        if dataset_name == 'dataset':
            monobert_pre = monobert_pre_tokenization
        elif dataset_name == 'dataset_lower':
            monobert_pre = monobert_pre_tokenization_lower
        else:
            monobert_pre = monobert_pre_tokenization_prepro

        start_time = time.time()
        new_score_docs_pre_tokenization = monobert_pre.run(dataset, query_id, score_docs_query, k=k)
        end_time = time.time()
        results_score_docs[dataset_name]['monobert_pre_tokenization'][query_id] = new_score_docs_pre_tokenization
        results_times[dataset_name]['monobert_pre_tokenization'].append(end_time - start_time)

Ranking dataset:   5%|▍         | 23/497 [14:32<5:04:46, 38.58s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Ranking dataset:   7%|▋         | 36/497 [22:53<4:52:35, 38.08s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Ranking dataset:   8%|▊         | 42/497 [26:48<4:59:07, 39.45s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Ranking dataset:  13%|█▎        | 63/497 [40:09<4:35:06, 38.03s/it]Be aware, overflowing tokens are not returned for the 

KeyboardInterrupt: 

In [10]:
results_monobert_folder = os.path.join('results', 'raw', 'monobert')
os.makedirs(results_monobert_folder, exist_ok=True)

for dataset_name in results_score_docs.keys():
    with open(os.path.join(results_monobert_folder, f'{dataset_name}_monobert.json'), 'w') as f:
        json.dump({
            'score_docs': results_score_docs[dataset_name]['monobert'],
            'times': results_times[dataset_name]['monobert']
        }, f)

    with open(os.path.join(results_monobert_folder, f'{dataset_name}_monobert_pre_tokenization.json'), 'w') as f:
        json.dump({
            'score_docs': results_score_docs[dataset_name]['monobert_pre_tokenization'],
            'times': results_times[dataset_name]['monobert_pre_tokenization']
        }, f)

In [15]:
k = 10
results_list = []
for dataset_name in results_score_docs.keys():
    # MonoBERT
    score_docs = results_score_docs[dataset_name]['monobert']
    times = results_times[dataset_name]['monobert']
    results_list.append({
        'name': 'monobert',
        'dataset': dataset_name,
        'mrr': mrr_score(score_docs, dataset.qrels, k=k),
        'map': map_score(score_docs, dataset.qrels, k=k),
        'mr': mr_score(score_docs, dataset.qrels, k=k),
        'mf1': mf1_score(score_docs, dataset.qrels, k=k),
        'mndcg': mndcg_score(score_docs, dataset.qrels, k=k),
        'avg_time': sum(times) / len(times),
        'time_std': pd.Series(times).std(),
        'time_max': max(times),
        'time_min': min(times),
    })

    # MonoBERT pre-tokenization
    score_docs = results_score_docs[dataset_name]['monobert_pre_tokenization']
    times = results_times[dataset_name]['monobert_pre_tokenization']
    results_list.append({
        'name': 'monobert_pre_tokenization',
        'dataset': dataset_name,
        'mrr': mrr_score(score_docs, dataset.qrels, k=k),
        'map': map_score(score_docs, dataset.qrels, k=k),
        'mr': mr_score(score_docs, dataset.qrels, k=k),
        'mf1': mf1_score(score_docs, dataset.qrels, k=k),
        'mndcg': mndcg_score(score_docs, dataset.qrels, k=k),
        'avg_time': sum(times) / len(times),
        'time_std': pd.Series(times).std(),
        'time_max': max(times),
        'time_min': min(times),
    })

results = pd.DataFrame(results_list)
results.to_csv(os.path.join(results_monobert_folder, 'metrics.csv'), index=False)
results.sort_values(by=['mrr']).style.format({
    'mrr': '{:.2f}',
    'map': '{:.2f}',
    'mr': '{:.2f}',
    'mf1': '{:.2f}',
    'mndcg': '{:.2f}',
    'avg_time': '{:.2f}',
    'max_time': '{:.2f}',
    'min_time': '{:.2f}',
    'std_time': '{:.2f}',
})

Unnamed: 0,name,dataset,mrr,map,mr,mf1,mndcg,avg_time,time_std,time_max,time_min
1,monobert_pre_tokenization,dataset,0.11,0.02,0.2,0.04,0.13,19.15,1.289897,22.247482,15.94538
0,monobert,dataset,0.71,0.08,0.78,0.14,0.72,18.96,1.304535,22.10372,15.920685
