# Basic Usage

In [1]:
# Common imports
from tqdm import tqdm

## Dataset

In [2]:
# Build the MS MARCO subset wrapper, read the pickled subset, then split it.
# (The original note says tokenization also happens in this step; that is not
# visible from this cell, so confirm in src.datasets.)
from src.datasets import MSMarcoDataset

DATASET_DIR = 'data/subset_msmarco_train_0'
DATASET_PICKLE = 'subset_msmarco_train_0.01_99.pkl'

dataset = MSMarcoDataset(DATASET_DIR)
dataset.load_data(DATASET_PICKLE)
# Later cells iterate dataset.test_query_ids, presumably produced by this split.
dataset.split_data()

Loading queries: 100%|██████████| 2771/2771 [00:00<00:00, 5122263.72it/s]
Loading documents: 100%|██████████| 277168/277168 [00:00<00:00, 3864423.69it/s]
Loading qrels: 100%|██████████| 2845/2845 [00:00<00:00, 2581179.94it/s]


## Metrics

In [3]:
from src.metrics import (
    mrr_score,
    map_score,
    mr_score,
    mf1_score,
    mndcg_score,
)

def print_metrics(dataset: MSMarcoDataset, score_docs: dict[str, list[tuple[str, float]]]) -> None:
    """Print the five retrieval metrics for a set of ranked results.

    Every metric function is called as ``metric(score_docs, dataset.qrels)`` and
    its value is printed on its own line with four decimal places, in the order
    MRR, MAP, MR, MF1, MNDCG.

    Args:
        dataset: Dataset whose ``qrels`` attribute holds the relevance judgements.
        score_docs: Ranked results per query. The callers in this notebook pass
            a dict mapping each query id to the ``(doc_id, score)`` list
            returned by a retriever/reranker ``run`` call, so the annotation
            reflects that mapping rather than a single flat list.
    """
    # Label/function pairs keep the output order explicit and avoid five
    # copy-pasted print statements.
    metrics = (
        ("MRR", mrr_score),
        ("MAP", map_score),
        ("MR", mr_score),
        ("MF1", mf1_score),
        ("MNDCG", mndcg_score),
    )
    for label, metric in metrics:
        print(f"{label}: {metric(score_docs, dataset.qrels):.4f}")

## Algorithms

### Retrievers

#### BM25

In [4]:
from src.retrivers.bm25 import BM25

# Build the BM25 retriever over the loaded dataset and pick one query to inspect.
bm25 = BM25(dataset)
query_id = '135841'

# Third argument is 100, presumably the number of documents to retrieve (confirm in BM25.run).
score_docs = bm25.run(dataset, query_id, 100)

# For every judged document of this query, print the 1-based position of its
# first occurrence in the retrieved list; documents that were not retrieved
# are skipped silently.
for doc_id in dataset.qrels[query_id]:
    hit = next(
        (
            (rank, doc)
            for rank, (doc, _score) in enumerate(score_docs, start=1)
            if doc_id == doc
        ),
        None,
    )
    if hit is not None:
        print(f'{hit[0]}º: {hit[1]}')
score_docs[:10]

[nltk_data] Downloading package punkt_tab to /Users/masfz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/masfz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/masfz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/masfz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Process SpawnPoolWorker-1:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/V

KeyboardInterrupt: 

In [6]:
# Run BM25 on every test query (third argument 10, presumably the top-k cut-off)
# and report all metrics printed by print_metrics, not only MRR.
sorted_docs = {}
for query_id in tqdm(dataset.test_query_ids, desc="Evaluating BM25"):
    # Store the ranked list under its query id.
    sorted_docs[query_id] = bm25.run(dataset, query_id, 10)

print_metrics(dataset, sorted_docs)

Evaluating BM25: 100%|██████████| 555/555 [15:27<00:00,  1.67s/it]

MRR: 0.2391
MAP: 0.0377
MR: 0.3640
MF1: 0.0681
MNDCG: 0.2617





### Rerankers

#### MonoBERT

In [17]:
from src.rankers.monobert import MonoBERT

# Model identifier handed to the MonoBERT wrapper
# (presumably a Hugging Face checkpoint name; confirm in src.rankers.monobert).
MONOBERT_MODEL = 'castorini/monobert-large-msmarco'
monobert = MonoBERT(MONOBERT_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# NOTE(review): this reranking loop is disabled (commented out), while the saved
# output of this cell still shows a completed run with metrics. Presumably it
# was switched off to avoid repeating the slow pass (about 3 s per query in the
# saved progress bar); confirm with the author.
# When enabled, it reranks the per-query BM25 results held in `sorted_docs`
# with `monobert.run` and prints the same metrics via `print_metrics`.
# sorted_docs_monobert = {}
# for query_id, query_sorted_docs in tqdm(list(sorted_docs.items()), desc="Reranking with MonoBERT"):
#     docs = monobert.run(dataset, query_id, query_sorted_docs)
#     sorted_docs_monobert[query_id] = docs

# print_metrics(dataset, sorted_docs_monobert)

Reranking with MonoBERT:  34%|███▍      | 189/555 [09:26<18:00,  2.95s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Reranking with MonoBERT:  48%|████▊     | 267/555 [13:22<14:23,  3.00s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Reranking with MonoBERT:  64%|██████▎   | 353/555 [17:45<10:05,  3.00s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Reranking with MonoBERT:  83%|████████▎ | 458/555 [23:04<04:50,  3.00s/it]Be aware, overflowing toke

MRR: 0.7382
MAP: 0.0863
MR: 0.8279
MF1: 0.1559
MNDCG: 0.7460





In [None]:
from src.retrivers.vespa import VespaRetriever
from tqdm import tqdm

# Retriever that queries a Vespa search endpoint on localhost:8080
# (a running local Vespa instance is presumably required; confirm).
vespa = VespaRetriever(endpoint="http://localhost:8080/search/")

# Collect the k=10 hits returned for each test query, keyed by query id.
sorted_docs_vespa = {}
for qid in tqdm(dataset.test_query_ids, desc="Vespa BM25"):
    sorted_docs_vespa[qid] = vespa.run(dataset, qid, k=10)

# Header line, then the same metric report used for the other retrievers.
print("=== Métricas Vespa BM25 ===")
print_metrics(dataset, sorted_docs_vespa)
