In [1]:
import os
import tarfile

import requests
%load_ext autoreload
%autoreload 2
# MS MARCO artifacts needed by this notebook. For each entry:
#   url      - where to download from
#   name     - local filename the download is saved under
#   expected - a file that must exist once the download (and, for archives,
#              the extraction) has finished; used to skip work on re-runs.
# "queries.dev.tsv" is used as the marker for queries.tar.gz because that is
# the file the next cell reads; the old ".tar.gz" -> ".tsv" rename produced
# "queries.tsv", which the notebook never reads.
files = [
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz",
        "name": "collection.tar.gz",
        "expected": "collection.tsv",
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz",
        "name": "queries.tar.gz",
        "expected": "queries.dev.tsv",
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv",
        "name": "qrels.dev.tsv",
        "expected": "qrels.dev.tsv",
    },
]

for file in files:
    if os.path.exists(file["expected"]):
        continue  # already downloaded (and extracted) by a previous run

    # Stream to a ".part" file and rename only on success, so an interrupted
    # download can never be mistaken for a complete file on the next run.
    partial_path = file["name"] + ".part"
    with requests.get(file["url"], stream=True, timeout=60) as response:
        response.raise_for_status()  # do not save an HTTP error page as data
        with open(partial_path, "wb") as f:
            # 1 MiB chunks keep memory use flat regardless of archive size.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_path, file["name"])

    if file["name"].endswith(".tar.gz"):
        with tarfile.open(file["name"], "r:gz") as tar:
            # Use the safe "data" extraction filter where the runtime has it;
            # older Python versions reject the `filter` keyword.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=".", filter="data")
            else:
                tar.extractall(path=".")

  tar.extractall(path='.')


In [2]:
import pandas as pd

# Tab-separated passage file that is loaded into merged_df below.
merged_queries_csv_path = "common_dataset.tsv"

# Column name -> dtype for the headerless TSVs; identifiers stay as strings.
query_schema = {"qid": str, "query": str}
passage_schema = {"pid": str, "text": str}

queries = pd.read_csv(
    "queries.dev.tsv",
    sep="\t",
    names=list(query_schema),
    dtype=query_schema,
)

# The second qrels column is read under the throwaway name "_" and gets no
# explicit dtype.
qrels = pd.read_csv(
    "qrels.dev.tsv",
    sep="\t",
    names=["qid", "_", "pid", "rel"],
    dtype={"qid": str, "pid": str, "rel": int},
)

merged_df = pd.read_csv(
    merged_queries_csv_path,
    sep="\t",
    names=list(passage_schema),
    dtype=passage_schema,
)

In [3]:
# Sanity check on the loaded passages: dimensions first, then the first rows.
print(merged_df.shape, merged_df.head(), sep="\n")

(60000, 2)
    pid                                               text
0   448  A postal code (also known locally in various E...
1   466  Therefore, all pathologists must have complete...
2   646  Obesity is a complex disorder involving an exc...
3  1212  Which president appointed FBI Director James C...
4  1213  Comey was confirmed by the Senate on July 29, ...


In [4]:
%%time
from index_bm25 import build_bm25
import pandas as pd

# Build the BM25 index over the passages held in merged_df. build_bm25 comes
# from the project-local index_bm25 module (not shown here) -- presumably it
# persists an index that the evaluation cell later queries; confirm against
# index_bm25.py.
build_bm25(merged_df)

Indexing (Whoosh BM25): 100%|██████████| 60000/60000 [00:06<00:00, 9960.72it/s] 


CPU times: user 28.9 s, sys: 2.31 s, total: 31.2 s
Wall time: 40 s


In [5]:
%%time
from eval_metrics import evaluate_bm25
from load_corpus import read_queries_dev
import pandas as pd

# Evaluation settings, named so they can be tuned in one place.
EVAL_SAMPLE_SIZE = 1000  # upper bound on the number of queries evaluated
EVAL_SEED = 42           # fixed seed so the sampled query set is reproducible
EVAL_TOP_K = 10          # cutoff passed for the run depth and every metric

# Keep only relevance judgements whose passage is present in merged_df (the
# frame that was handed to build_bm25).
filtered_qrels = qrels[qrels['pid'].isin(merged_df['pid'])]

# Queries that still have at least one judgement after filtering, one row per qid.
queries_eval = (queries[queries['qid'].isin(filtered_qrels['qid'])]
                .drop_duplicates('qid')
                [['qid','query']])

qrels_for_eval = filtered_qrels[['qid','pid','rel']].astype({"qid":str,"pid":str,"rel":int})

# DataFrame.sample raises ValueError when n exceeds the number of rows, so
# clamp the request; with >= EVAL_SAMPLE_SIZE eligible queries the sample is
# exactly the same as before.
sampled_queries = queries_eval.sample(
    n=min(EVAL_SAMPLE_SIZE, len(queries_eval)),
    random_state=EVAL_SEED,
)

# evaluate_bm25 is project-local (eval_metrics, not shown here); judging by
# the printed result it returns a dict of metric name -> value.
metrics = evaluate_bm25(
    sampled_queries,
    qrels_for_eval,
    topk_run=EVAL_TOP_K,
    k_ndcg=EVAL_TOP_K,
    k_map=EVAL_TOP_K,
    k_rec=EVAL_TOP_K
)
print('metrics:', metrics)

  from .autonotebook import tqdm as notebook_tqdm


Evaluating: 100%|██████████| 1000/1000 [00:08<00:00, 113.62q/s]

metrics: {'ndcg@10': 0.7203459772321538, 'map@10': 0.6829388888888889, 'recall@10': 0.8295, 'num_queries': 1000}
CPU times: user 11.7 s, sys: 76.3 ms, total: 11.7 s
Wall time: 11.7 s



