# Basic Usage

In [5]:
# Commom imports
from tqdm import tqdm

## Dataset

In [1]:
# load data, tokenize and split
from src.datasets import MSMarcoDataset

dataset = MSMarcoDataset('data/subset_msmarco_train_0')
dataset.load_data('subset_msmarco_train_0.01_99.pkl')
dataset.split_data()

Loading queries: 100%|██████████| 2771/2771 [00:00<?, ?it/s]
Loading documents: 100%|██████████| 277168/277168 [00:00<00:00, 1374642.43it/s]
Loading qrels: 100%|██████████| 2845/2845 [00:00<00:00, 1416019.33it/s]


## Metrics

In [2]:
from src.metrics import (
    mrr_score,
    map_score,
    mr_score,
    mf1_score,
    mndcg_score,
)

def print_metrics(dataset: MSMarcoDataset, score_docs: list[tuple[str, float]]):
    print(f"MRR: {mrr_score(score_docs, dataset.qrels):.4f}")
    print(f"MAP: {map_score(score_docs, dataset.qrels):.4f}")
    print(f"MR: {mr_score(score_docs, dataset.qrels):.4f}")
    print(f"MF1: {mf1_score(score_docs, dataset.qrels):.4f}")
    print(f"MNDCG: {mndcg_score(score_docs, dataset.qrels):.4f}")

## Algorithms

### Retrivers

#### BM25

In [None]:
from src.retrivers.bm25 import BM25

bm25 = BM25(dataset)
query_id = '135841'

score_docs = bm25.run(dataset, query_id, 100)

for doc_id in dataset.qrels[query_id]:
    for i, (doc, score) in enumerate(score_docs):
        if doc_id == doc:
            print(f'{i+1}º: {doc}')
            break
score_docs[:10]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\esdra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\esdra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Tokenizing documents: 100%|██████████| 277168/277168 [03:06<00:00, 1485.62doc/s]


True relevant documents for query 135841:


[('msmarco_passage_02_19009299', np.float64(25.129779619819196)),
 ('msmarco_passage_02_20739671', np.float64(24.998872014274685)),
 ('msmarco_passage_00_53431480', np.float64(22.077617845756684)),
 ('msmarco_passage_03_547575435', np.float64(21.896088861787916)),
 ('msmarco_passage_02_21092853', np.float64(21.72553747804953)),
 ('msmarco_passage_02_26827419', np.float64(21.72553747804953)),
 ('msmarco_passage_04_550536616', np.float64(21.274137190687796)),
 ('msmarco_passage_02_18968846', np.float64(21.180348316303736)),
 ('msmarco_passage_03_18532150', np.float64(20.789285569946582)),
 ('msmarco_passage_02_18753979', np.float64(19.193813304862065))]

In [6]:
# Evaluating BM25 with MRR
sorted_docs = {}
for query_id in tqdm(dataset.test_query_ids, desc="Evaluating BM25"):
    docs = bm25.run(dataset, query_id, 10)
    sorted_docs[query_id] = docs

print_metrics(dataset, sorted_docs)

Evaluating BM25: 100%|██████████| 555/555 [15:27<00:00,  1.67s/it]

MRR: 0.2391
MAP: 0.0377
MR: 0.3640
MF1: 0.0681
MNDCG: 0.2617





### Rerankers

#### MonoBERT

In [17]:
from src.rankers.monobert import MonoBERT

monobert = MonoBERT('castorini/monobert-large-msmarco')

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
sorted_docs_monobert = {}
for query_id, query_sorted_docs in tqdm(list(sorted_docs.items()), desc="Reranking with MonoBERT"):
    docs = monobert.run(dataset, query_id, query_sorted_docs)
    sorted_docs_monobert[query_id] = docs

print_metrics(dataset, sorted_docs_monobert)

Reranking with MonoBERT:  34%|███▍      | 189/555 [09:26<18:00,  2.95s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Reranking with MonoBERT:  48%|████▊     | 267/555 [13:22<14:23,  3.00s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Reranking with MonoBERT:  64%|██████▎   | 353/555 [17:45<10:05,  3.00s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Reranking with MonoBERT:  83%|████████▎ | 458/555 [23:04<04:50,  3.00s/it]Be aware, overflowing toke

MRR: 0.7382
MAP: 0.0863
MR: 0.8279
MF1: 0.1559
MNDCG: 0.7460



