In [2]:
import os
import tarfile

import requests
%load_ext autoreload
%autoreload 2
# MS MARCO passage-ranking artifacts to fetch into the working directory.
#   "url"    - remote location of the artifact
#   "name"   - local filename the download is saved under
#   "marker" - optional; file whose presence means the artifact is already
#              available locally, so the download is skipped. When absent,
#              "<name with .tar.gz replaced by .tsv>" is used.
files = [
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz",
        "name": "collection.tar.gz"
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz",
        "name": "queries.tar.gz",
        # NOTE(review): this archive appears to unpack to queries.dev.tsv /
        # queries.eval.tsv / queries.train.tsv rather than queries.tsv, so the
        # default marker would never exist and the archive would be fetched
        # again on every run - confirm against the extracted files.
        "marker": "queries.dev.tsv"
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv",
        "name": "qrels.dev.tsv"
    }
]

for file in files:
    marker = file.get("marker", file["name"].replace('.tar.gz', '.tsv'))
    if os.path.exists(marker):
        continue

    # Stream into a ".part" file: the archive is never held in memory as a
    # whole, and an interrupted transfer cannot leave a truncated file under
    # the final name.
    partial = file["name"] + ".part"
    with requests.get(file["url"], stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as data.
        response.raise_for_status()
        with open(partial, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial, file["name"])

    if file["name"].endswith('.tar.gz'):
        with tarfile.open(file["name"], 'r:gz') as tar:
            # The 'data' filter rejects absolute paths, path traversal and
            # special files, and avoids the DeprecationWarning raised for an
            # unfiltered extractall; fall back on interpreters without it.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path='.', filter='data')
            else:
                tar.extractall(path='.')

  tar.extractall(path='.')


In [3]:
from load_corpus import read_collection, read_queries_dev
import pandas as pd

# Load passages (capped via limit=1000000) and the dev queries through the
# project helpers.
df = read_collection(limit=1000000)
queries = read_queries_dev()

# Relevance judgments: tab-separated, four columns labelled qid / _ / pid / rel,
# with ids kept as strings.
qrels = pd.read_csv(
    "qrels.dev.tsv",
    sep="\t",
    names=["qid", "_", "pid", "rel"],
    dtype={"qid": str, "pid": str, "rel": int},
)

# Attach every judged passage id to its query, one row per (qid, pid) pair.
queries_with_pids = (
    queries
    .merge(qrels[["qid", "pid"]], on="qid", how="inner")
    .drop_duplicates(["qid", "pid"])
)

# Keep only the pairs whose passage is present in the loaded collection.
merged_df: pd.DataFrame = queries_with_pids.loc[
    lambda pairs: pairs["pid"].astype(str).isin(df["pid"].astype(str))
].copy()
merged_df.head()

Reading collection: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:02<00:00, 453496.17it/s]


Unnamed: 0,qid,query,pid
11,524318,treating diabetes,132459
12,524332,treating tension headaches without medication,740662
15,1048625,what is parkland near in florida,143101
60,184,+is biology a social science,765726
72,1048811,what is organic insomnia,465790


In [24]:
from index_bm25 import build_bm25

# Build the BM25 index over the loaded collection frame `df`.
# The recorded output labels this step "Indexing (Whoosh BM25)" and shows it
# taking about 6.5 minutes for 1,000,000 rows, so avoid re-running it casually.
# NOTE(review): presumably the index is persisted by build_bm25 for the
# evaluation step to pick up - confirm in index_bm25.
build_bm25(df)

Indexing (Whoosh BM25): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [06:36<00:00, 2522.13it/s]


In [4]:
from bm25_metrics import evaluate_bm25_in_memory, qrels_df_to_dict

# Evaluate only queries that have a judged passage inside the loaded
# collection, drawing a fixed-seed sample of 1000 of them.
queries = (
    queries
    .loc[lambda q: q["qid"].isin(merged_df["qid"])]
    .sample(n=1000, random_state=42)
)

# Trim the judgments to query ids and passage ids that occur in merged_df.
qrels = qrels.loc[
    lambda j: j["qid"].isin(merged_df["qid"]) & j["pid"].isin(merged_df["pid"])
]

# The retrieval depth (topk_run) has to be at least k_rec, otherwise recall@k
# would be computed over a truncated ranking and lose its meaning.
metrics = evaluate_bm25_in_memory(
    queries,
    qrels_df_to_dict(qrels),
    topk_run=1000,
    k_ndcg=10,
    k_map=10,
    k_rec=100,
)
print(metrics)

Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:42<00:00,  4.50q/s]

{'ndcg@10': 0.17415298146754304, 'map@10': 0.1332015873015873, 'recall@100': 0.656}



