In [1]:
import os
import tarfile

import requests
%load_ext autoreload
%autoreload 2
# MS MARCO passage-ranking artifacts needed by the cells below.
#   url    - where to fetch the artifact from
#   name   - local file name the download is stored under
#   marker - file whose presence means the artifact is already available locally,
#            so the download (and extraction) is skipped on re-runs
files = [
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz",
        "name": "collection.tar.gz",
        "marker": "collection.tsv",
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz",
        "name": "queries.tar.gz",
        # NOTE(review): the previous check looked for "queries.tsv"; the archive is
        # presumed to unpack to queries.{dev,train,eval}.tsv, in which case that
        # check never matched and the archive was re-downloaded on every run.
        # Confirm the extracted file name against the archive contents.
        "marker": "queries.dev.tsv",
    },
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv",
        "name": "qrels.dev.tsv",
        "marker": "qrels.dev.tsv",
    },
]

for file in files:
    if os.path.exists(file["marker"]):
        continue

    # Stream to a temporary ".part" file and rename only once the download has
    # finished, so an interrupted run never leaves a truncated file under the
    # final name (which the existence check above would then trust).
    partial_path = file["name"] + ".part"
    with requests.get(file["url"], stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as data.
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            # 1 MiB chunks keep memory flat; the archives are too large to
            # comfortably hold in memory via response.content.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_path, file["name"])

    if file["name"].endswith('.tar.gz'):
        with tarfile.open(file["name"], 'r:gz') as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter rejects absolute paths, links escaping the
                # destination and special files; passing it explicitly also
                # silences the extractall() DeprecationWarning on newer Pythons.
                tar.extractall(path='.', filter='data')
            else:
                # Older interpreters without extraction-filter support.
                tar.extractall(path='.')

  tar.extractall(path='.')


In [2]:
from load_corpus import read_collection, read_queries_dev
import pandas as pd
import os

# Cache file for the sampled qrels joined with their passage text.
merged_queries_csv_path = "merged_queries.csv"
queries = read_queries_dev()
# qrels.dev.tsv has no header row; ids are read as strings, the relevance label as int.
qrels = pd.read_csv("qrels.dev.tsv", sep="\t", names=["qid","_","pid","rel"], dtype={"qid":str,"pid":str,"rel":int})

if os.path.exists(merged_queries_csv_path):
    # Read ids back as strings so the cached frame has the same dtypes as a
    # freshly built one (without dtype, pandas infers integers for qid/pid).
    merged_df = pd.read_csv(merged_queries_csv_path, dtype={"qid": str, "pid": str})
else:
    df = read_collection(limit=10000000)
    # Passage text keyed by *string* pid, so both the membership filter and the
    # lookup below match qrels' string pids whatever dtype read_collection
    # uses for its pid column.
    passage_text = pd.Series(df["text"].to_numpy(), index=df["pid"].astype(str))
    merged_df: pd.DataFrame = qrels[qrels["pid"].isin(passage_text.index)].copy()

    # Fixed seed keeps the sample reproducible; cap n so a smaller collection
    # does not make sample() raise. The index is reset so this branch yields
    # the same 0..n-1 index as the frame loaded from the cache.
    merged_df = merged_df.sample(n=min(10000, len(merged_df)), random_state=42).reset_index(drop=True)
    merged_df["text"] = merged_df["pid"].map(passage_text)
    # Write to the same path the cache check above reads from.
    merged_df.to_csv(merged_queries_csv_path, index=False)

In [3]:
# Sanity check of the merged frame: its dimensions first, then the first rows.
print(merged_df.shape, merged_df.head(), sep="\n")

(10000, 5)
       qid  _      pid  rel                                               text
0  1084031  0  7132043    1  Definition of constructivism - a style or move...
1   332830  0  5789735    1  You have to be 18 years old to get a tattoo. I...
2  1088785  0  7091207    1  WatchGuard Vista WatchGuard Vista: WatchGuard ...
3  1033718  0  7212203    1  More detail on some of the traits crops are ge...


In [4]:
from index_bm25 import build_bm25
import pandas as pd

# Load the cached sample from disk and keep only the passage id and text
# columns, which are the input handed to build_bm25 (from index_bm25).
bm25_df = pd.read_csv(merged_queries_csv_path).loc[:, ["pid", "text"]]

build_bm25(bm25_df)

Indexing (Whoosh BM25): 100%|██████████| 10000/10000 [00:00<00:00, 11480.02it/s]


In [None]:
from bm25_metrics import evaluate_bm25
from load_corpus import read_queries_dev
import pandas as pd

# Reload the cached sample with ids as strings so they compare equal to the
# string query ids used below.
merged_df = pd.read_csv(merged_queries_csv_path, dtype={"qid": str, "pid": str})

# Restrict the dev queries to those whose qid occurs in merged_df,
# keeping a single (qid, query) row per qid.
all_queries = read_queries_dev().astype({"qid": str})
queries_eval = (
    all_queries
    .loc[lambda q: q["qid"].isin(merged_df["qid"])]
    .drop_duplicates("qid")
    .loc[:, ["qid", "query"]]
)
print('queries_eval shape:', queries_eval.shape)

# Relevance judgements for the sampled passages, with explicit dtypes.
qrels_for_eval = merged_df.loc[:, ["qid", "pid", "rel"]].astype({"qid": str, "pid": str, "rel": int})
# Evaluate on a fixed-seed subset of 1000 queries.
sampled_queries = queries_eval.sample(n=1000, random_state=42)

metrics = evaluate_bm25(sampled_queries, qrels_for_eval, topk_run=1000, k_ndcg=10, k_map=10, k_rec=100)
print('metrics:', metrics)

  from .autonotebook import tqdm as notebook_tqdm


queries_eval shape: (9867, 2)


Evaluating: 100%|██████████| 1000/1000 [00:03<00:00, 273.80q/s]

metrics: {'ndcg@10': 0.8149129216034285, 'map@10': 0.790613888888889, 'recall@100': 0.9525}



