In [8]:
import transformers
from pathlib import Path
import pandas as pd
import ir_datasets
import sys
sys.path.append("..")
from sparse_cross_encoder.data.ir_dataset_utils import DASHED_DATASET_MAP, get_base
from tqdm import tqdm
from collections import defaultdict

In [5]:
# Checkpoint whose tokenizer is loaded for this notebook.
CROSS_ENCODER_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
tokenizer = transformers.AutoTokenizer.from_pretrained(CROSS_ENCODER_NAME)

In [27]:
# Start from an empty mapping of token lengths.
token_lengths = dict()

In [55]:
def _mean_length(lengths):
    """Return the arithmetic mean of ``lengths``, or NaN when it is empty.

    Returning NaN instead of raising ``ZeroDivisionError`` keeps the summary
    table buildable when a run file yields no rows or a group has no members.
    """
    return sum(lengths) / len(lengths) if lengths else float("nan")


# Per-base accumulators, keyed by the value ``get_base`` returns for a dataset.
num_queries = defaultdict(int)
num_judgments = defaultdict(int)
token_lengths = defaultdict(list)

# Only the first run file seen for a base is tokenized (see the membership
# check below), so iterate in sorted order to make the result independent of
# the filesystem's enumeration order.
run_files = sorted(Path("../data/baseline-runs/bm25/").glob("*.jsonl.gz"))
for run_file in tqdm(run_files):
    # The part of the file name before the first "." is the key into
    # DASHED_DATASET_MAP.
    dataset_name = run_file.name.split(".")[0]
    dataset = ir_datasets.load(DASHED_DATASET_MAP[dataset_name])
    base = get_base(dataset.dataset_id())
    if base not in token_lengths:
        # NOTE(review): later run files that share this base are not tokenized,
        # while their query/judgment counts are still accumulated below —
        # confirm this sampling is intended.
        run = pd.read_json(run_file.resolve(), lines=True)
        # Keep at most the first 100 rows per query id.
        run = run.groupby("qid").head(100)
        tokens = tokenizer(run["text"].values.tolist())
        token_lengths[base] = [len(ids) for ids in tokens["input_ids"]]
    num_queries[base] += dataset.queries_count()
    num_judgments[base] += dataset.qrels_count()

# One row per base, plus an aggregate over every base whose name does not
# contain "msmarco".
data = []
out_of_domain_lengths = []
out_of_domain_queries = 0
out_of_domain_judgments = 0
for base in token_lengths:
    data.append((base, _mean_length(token_lengths[base]), num_queries[base], num_judgments[base]))
    if "msmarco" not in base:
        out_of_domain_lengths.extend(token_lengths[base])
        out_of_domain_queries += num_queries[base]
        out_of_domain_judgments += num_judgments[base]
data.append(("out-of-domain", _mean_length(out_of_domain_lengths), out_of_domain_queries, out_of_domain_judgments))

df = pd.DataFrame(data, columns=["dataset", "average_length", "num_queries", "num_judgments"])

# Collapse every base whose name contains "medline" into a single "medline" row.
# The average length is pooled over exactly the bases whose counts are summed,
# so the row's columns describe the same set of collections. Iterating the
# matched names also avoids inserting empty entries into the defaultdict.
medline_mask = df["dataset"].str.contains("medline")
if medline_mask.any():
    medline_lengths = [
        length
        for base in df.loc[medline_mask, "dataset"]
        for length in token_lengths[base]
    ]
    medline = df.loc[medline_mask].sum()
    medline["average_length"] = _mean_length(medline_lengths)
    medline["dataset"] = "medline"
    df = pd.concat([df.loc[~medline_mask], medline.to_frame().T])

df["average_judgements"] = df["num_judgments"] / df["num_queries"]
# infer_objects() restores numeric dtypes after the object-typed medline row
# was concatenated in.
df = df.sort_values("dataset")[["dataset", "average_length", "num_queries", "average_judgements"]].reset_index(drop=True).infer_objects()
df = df.round({"average_length": 1, "average_judgements": 1})
df

Unnamed: 0,dataset,average_length,num_queries,average_judgements
0,antique,49.9,200,32.9
1,argsme-2020-04-01,435.5,99,60.7
2,clueweb09,1132.6,200,421.8
3,clueweb12,5641.7,200,163.8
4,cord19,3647.7,50,1386.4
5,cranfield,234.8,225,8.2
6,disks45-nocr,749.3,350,1367.4
7,gov,2700.5,325,603.9
8,gov2,2410.3,150,902.3
9,medline,309.1,180,518.3


In [58]:
# Render the summary frame as a LaTeX table without the index column, then print it.
latex_table = df.to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrr}
\toprule
          dataset &  average\_length &  num\_queries &  average\_judgements \\
\midrule
          antique &            49.9 &          200 &                32.9 \\
argsme-2020-04-01 &           435.5 &           99 &                60.7 \\
        clueweb09 &          1132.6 &          200 &               421.8 \\
        clueweb12 &          5641.7 &          200 &               163.8 \\
           cord19 &          3647.7 &           50 &              1386.4 \\
        cranfield &           234.8 &          225 &                 8.2 \\
     disks45-nocr &           749.3 &          350 &              1367.4 \\
              gov &          2700.5 &          325 &               603.9 \\
             gov2 &          2410.3 &          150 &               902.3 \\
          medline &           309.1 &          180 &               518.3 \\
  msmarco-passage &            77.1 &           97 &               212.8 \\
         nfcorpus &           364.6 &        

  print(df.to_latex(index=False))
