In [1]:
import os
import tarfile

import requests
%load_ext autoreload
%autoreload 2
# MS MARCO passage-ranking artifacts used by the rest of the notebook.
# "name" is the local file the download is written to (in the working directory).
files = [
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz",
        "name": "collection.tar.gz"
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz",
        "name": "queries.tar.gz"
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv",
        "name": "qrels.dev.tsv"
    }
]

for file in files:
    archive_name = file["name"]
    # Marker file used to decide whether this entry is already available locally.
    # NOTE(review): for queries.tar.gz this resolves to "queries.tsv" -- confirm the
    # archive really contains a member with that name, otherwise the extraction
    # step below is repeated on every run.
    extracted_name = archive_name.replace('.tar.gz', '.tsv')
    if os.path.exists(extracted_name):
        continue

    if not os.path.exists(archive_name):
        # Stream to a temporary ".part" file so a failed or interrupted download
        # never leaves a truncated file behind under the final name.
        partial_name = archive_name + ".part"
        with requests.get(file["url"], stream=True, timeout=60) as response:
            # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
            response.raise_for_status()
            with open(partial_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_name, archive_name)

    if archive_name.endswith('.tar.gz'):
        try:
            with tarfile.open(archive_name, 'r:gz') as tar:
                if hasattr(tarfile, 'data_filter'):
                    # Safe extraction filter: rejects absolute paths, "..", links
                    # escaping the destination, device files, etc.
                    tar.extractall(path='.', filter='data')
                else:
                    # Older Python without extraction filters.
                    tar.extractall(path='.')
        except tarfile.ReadError:
            # Corrupt archive: remove it so the next run downloads a fresh copy.
            os.remove(archive_name)
            raise

  tar.extractall(path='.')


In [12]:
from load_corpus import read_collection, read_queries_dev
import pandas as pd
import os

# Cache file for the passage table: tab-separated, no header row, columns (pid, text).
merged_queries_csv_path = "common_dataset.tsv"
queries = read_queries_dev()
qrels = pd.read_csv("qrels.dev.tsv", sep="\t", names=["qid","_","pid","rel"], dtype={"qid":str,"pid":str,"rel":int})

if os.path.exists(merged_queries_csv_path):
    merged_df = pd.read_csv(merged_queries_csv_path, sep="\t", names=["pid", "text"], dtype={"pid": str, "text":str})
else:
    df = read_collection(limit=10000000)

    # qrels pids are strings; normalise the collection ids the same way so that both
    # the membership test and the text lookup compare like with like.
    collection_pids = df["pid"].astype(str)
    pid_to_text = pd.Series(df["text"].to_numpy(), index=collection_pids.to_numpy())

    # Judged passages that are actually present in the loaded collection.
    judged = qrels[qrels["pid"].astype(str).isin(collection_pids)]
    # Cap the sample size: DataFrame.sample raises if n exceeds the number of rows.
    judged = judged.sample(n=min(10000, len(judged)), random_state=42)

    # Keep exactly the (pid, text) schema that the cached branch above produces.
    # NOTE(review): a pid judged for several queries can appear more than once here;
    # confirm whether the downstream index expects unique pids.
    merged_df = (
        judged.assign(text=judged["pid"].map(pid_to_text))[["pid", "text"]]
        .reset_index(drop=True)
    )

    # Write to the same path and in the same format the cached branch reads, so the
    # next run actually hits the cache.
    merged_df.to_csv(merged_queries_csv_path, sep="\t", index=False, header=False)

In [13]:
# Sanity check of the passage table: dimensions on the first line, leading rows after it.
print(merged_df.shape, merged_df.head(), sep="\n")

(40000, 2)
    pid                                               text
0   448  A postal code (also known locally in various E...
1   466  Therefore, all pathologists must have complete...
2   646  Obesity is a complex disorder involving an exc...
3  1212  Which president appointed FBI Director James C...
4  1213  Comey was confirmed by the Senate on July 29, ...


In [14]:
%%time
from index_bm25 import build_bm25
import pandas as pd

# Index the rows of merged_df; the progress bar labels this step "Indexing (Whoosh BM25)".
# The return value of build_bm25 (if any) is not captured here.
build_bm25(merged_df)

Indexing (Whoosh BM25):   5%|▍         | 1988/40000 [00:00<00:03, 10529.02it/s]

Indexing (Whoosh BM25): 100%|██████████| 40000/40000 [00:03<00:00, 10853.67it/s]


CPU times: user 20.3 s, sys: 1.45 s, total: 21.8 s
Wall time: 26.3 s


In [15]:
%%time
from bm25_metrics import evaluate_bm25
from load_corpus import read_queries_dev
import pandas as pd

# Keep only the queries that have at least one qrels judgment pointing at a passage
# present in merged_df (the filter goes through qrels pids, not through qids in merged_df).
all_queries = read_queries_dev().astype({"qid":str})
merged_queries_df: pd.DataFrame = qrels[qrels["pid"].astype(str).isin(merged_df["pid"].astype(str))].copy()
queries_eval = (all_queries[all_queries['qid'].isin(merged_queries_df['qid'])]
                .drop_duplicates('qid')
                [['qid','query']])
print('queries_eval shape:', queries_eval.shape)

# Relevance judgments restricted to the indexed passages, with explicit dtypes.
qrels_for_eval = merged_queries_df[['qid','pid','rel']].astype({"qid":str,"pid":str,"rel":int})

# Evaluate on a fixed-seed subset of at most 1000 queries. The size is capped because
# DataFrame.sample raises ValueError when n exceeds the number of available rows.
sampled_queries = queries_eval.sample(n=min(1000, len(queries_eval)), random_state=42)

metrics = evaluate_bm25(
    sampled_queries,
    qrels_for_eval,
    topk_run=1000,
    k_ndcg=10,
    k_map=10,
    k_rec=100
)
print('metrics:', metrics)

queries_eval shape: (19229, 2)


Evaluating: 100%|██████████| 1000/1000 [00:13<00:00, 75.69q/s]

metrics: {'ndcg@10': 0.6869677779749187, 'map@10': 0.6453674603174603, 'recall@100': 0.9203333333333333}
CPU times: user 14.4 s, sys: 75.6 ms, total: 14.4 s
Wall time: 14.4 s



