In [None]:
!pip install tensorflow tensorflow_ranking numpy tqdm
import csv
import gzip
import pandas as pd
import tensorflow as tf
import tensorflow_ranking as tfr
import numpy as np
import tqdm

In [None]:
# -nc skips the download when the archive already exists; -P stores it in
# /content, which is where the unzip step below expects to find it.
!wget -nc -P /content https://storage.googleapis.com/gresearch/rd-suite/rd_suite_test.zip
# -o overwrites previously extracted files instead of prompting on re-runs.
!unzip -o /content/rd_suite_test.zip -d /content

In [4]:
import os

root_dir = "/content/rd_benchmark"
# Maps "<dataset>_<split>" to its [qrel path, run path] pair.
name_to_paths = {}
for dataset_name in os.listdir(root_dir):
  dataset_path = os.path.join(root_dir, dataset_name)
  # Skip stray files under the root: listing a non-directory would raise.
  if not os.path.isdir(dataset_path):
    continue
  for split in os.listdir(dataset_path):
    full_path = os.path.join(dataset_path, split)
    # Only directories can hold the per-split TREC files.
    if not os.path.isdir(full_path):
      continue
    name_to_paths[f"{dataset_name}_{split}"] = [
        os.path.join(full_path, "trec_qrel.txt"),
        os.path.join(full_path, "trec_run.txt"),
    ]
name_to_paths

{'msmarco_dev': ['/content/rd_benchmark/msmarco/dev/trec_qrel.txt',
  '/content/rd_benchmark/msmarco/dev/trec_run.txt'],
 'istella_test': ['/content/rd_benchmark/istella/test/trec_qrel.txt',
  '/content/rd_benchmark/istella/test/trec_run.txt'],
 'nq_dev_msmarco_teacher': ['/content/rd_benchmark/nq/dev_msmarco_teacher/trec_qrel.txt',
  '/content/rd_benchmark/nq/dev_msmarco_teacher/trec_run.txt'],
 'nq_dev': ['/content/rd_benchmark/nq/dev/trec_qrel.txt',
  '/content/rd_benchmark/nq/dev/trec_run.txt'],
 'web30k_test': ['/content/rd_benchmark/web30k/test/trec_qrel.txt',
  '/content/rd_benchmark/web30k/test/trec_run.txt']}

# Metrics

In [5]:
def read_dataset(qrel_path: str, run_path: str) -> tf.data.Dataset:
  """Reads an eval dataset from TREC-style text files into a tf.data.Dataset.

  Both files are parsed as whitespace-separated tables without a header row.
  The two tables are inner-joined on (query_id, doc_id), so only documents
  that appear in both files are kept.

  Args:
    qrel_path: Path to the relevance file; columns are read as
      query_id, 0, doc_id, relevance.
    run_path: Path to the run file; columns are read as
      query_id, q0, doc_id, rank, score, run_id.

  Returns:
    A dataset with one element per query. Each element is a dict with the
    keys "scores" and "labels", holding that query's values as float vectors
    whose length is the number of joined documents for the query.
  """
  # Raw string: "\s" inside a plain string literal is an invalid escape.
  whitespace = r"\s+"
  qrel = pd.read_csv(
      qrel_path, sep=whitespace, names=["query_id", "0", "doc_id", "relevance"]
  )
  run = pd.read_csv(
      run_path,
      sep=whitespace,
      names=["query_id", "q0", "doc_id", "rank", "score", "run_id"],
  )
  merged = qrel.merge(run, on=["query_id", "doc_id"])

  # Row lengths only describe the data correctly when every query's rows are
  # contiguous. Regroup rows by query with a stable sort on first-appearance
  # codes: query order and within-query document order are preserved, and
  # input that is already grouped is left unchanged.
  first_seen = pd.factorize(merged["query_id"])[0]
  merged = merged.iloc[np.argsort(first_seen, kind="stable")]
  sizes = merged.groupby("query_id", sort=False).size().values

  scores = tf.RaggedTensor.from_row_lengths(
      merged["score"].astype(float).to_numpy(), sizes
  )
  labels = tf.RaggedTensor.from_row_lengths(
      merged["relevance"].astype(float).to_numpy(), sizes
  )

  ds = tf.data.Dataset.from_tensor_slices({"scores": scores, "labels": labels})
  # NOTE(review): this identity map looks like it exists so that the sliced
  # ragged rows come out as plain tensors that `padded_batch` accepts --
  # confirm before removing it.
  return ds.map(lambda e: tf.nest.map_structure(lambda x: x, e))

In [6]:
class BinaryMRRMetric(tf.keras.metrics.Metric):
  """MRR over graded labels that are binarized at a relevance threshold.

  Labels greater than or equal to `threshold` are mapped to 1 and all other
  labels to 0 (in the labels' own dtype) before being forwarded to a wrapped
  `tfr.keras.metrics.MRRMetric`.

  Args:
    threshold: Minimum label value that counts as relevant.
    name: Name of the metric.
    topn: Forwarded to `tfr.keras.metrics.MRRMetric`.
    **kwargs: Forwarded to `tf.keras.metrics.Metric`.
  """

  def __init__(self, threshold, name="binary_mrr", topn=None, **kwargs):
    super().__init__(name=name, **kwargs)
    self._threshold = threshold
    # Kept so that get_config() can report it.
    self._topn = topn
    self._mrr_metric = tfr.keras.metrics.MRRMetric(topn=topn)

  def update_state(self, y_true, y_pred, sample_weight=None):
    """Binarizes `y_true` and accumulates it into the wrapped MRR metric."""
    # NOTE(review): labels below the threshold, including any negative
    # padding label, become 0 here -- confirm that is intended for padding.
    binary_labels = tf.cast(y_true >= self._threshold, y_true.dtype)
    self._mrr_metric.update_state(
        binary_labels,
        y_pred,
        sample_weight=sample_weight,
    )

  def result(self):
    """Returns the current value of the wrapped MRR metric."""
    return self._mrr_metric.result()

  def reset_state(self):
    """Resets the accumulated state, which lives in the wrapped metric."""
    self._mrr_metric.reset_state()

  def get_config(self):
    """Returns the constructor arguments needed to re-create this metric."""
    config = super().get_config()
    config.update({"threshold": self._threshold, "topn": self._topn})
    return config


def eval_dataset(
    ds: tf.data.Dataset, metrics: dict, batch_size: int = 128
) -> dict:
  """Evaluates ranking metrics over a dataset of per-query scores and labels.

  Args:
    ds: Dataset whose elements are dicts with variable-length "scores" and
      "labels" entries.
    metrics: Maps a display name to a metric object that provides
      `update_state(labels, scores)` and `result()`. The metric objects are
      updated in place and are not reset by this function.
    batch_size: Number of dataset elements per padded batch.

  Returns:
    A dict mapping each metric name to its result multiplied by 100 and
    formatted as a string with two decimal places.
  """
  # Shorter lists are padded to the batch's longest list: scores with -inf
  # and labels with -1.
  # NOTE(review): the float64 padding values assume float64 dataset
  # components -- confirm if the dataset is built differently.
  padding_values = {"scores": np.float64('-inf'), "labels": np.float64(-1.0)}
  batched = ds.padded_batch(batch_size, padding_values=padding_values)

  # Do eval.
  for batch in tqdm.tqdm(batched):
    for metric in metrics.values():
      metric.update_state(batch["labels"], batch["scores"])

  # Show results.
  return {
      name: f"{100.0 * metric.result():.2f}" for name, metric in metrics.items()
  }

In [7]:
# Score the nq/dev run against its qrels and show the metrics as one row.
metrics = eval_dataset(
    ds=read_dataset(
        qrel_path="/content/rd_benchmark/nq/dev/trec_qrel.txt",
        run_path="/content/rd_benchmark/nq/dev/trec_run.txt",
    ),
    metrics={
        "mrr@10": tfr.keras.metrics.MRRMetric(topn=10),
        "mrr": tfr.keras.metrics.MRRMetric(),
        "ndcg@1": tfr.keras.metrics.NDCGMetric(topn=1),
        "ndcg@5": tfr.keras.metrics.NDCGMetric(topn=5),
        "ndcg": tfr.keras.metrics.NDCGMetric(),
    },
)
pd.DataFrame.from_dict({"nq_dev_teacher": metrics}, orient="index")

100%|██████████| 51/51 [00:06<00:00,  7.86it/s]


Unnamed: 0,mrr@10,mrr,ndcg@1,ndcg@5,ndcg
nq_dev_teacher,60.08,60.36,46.94,63.89,66.98


In [8]:
# Score the nq/dev_msmarco_teacher run against its qrels and show the metrics
# as one row.
metrics = eval_dataset(
    ds=read_dataset(
        qrel_path="/content/rd_benchmark/nq/dev_msmarco_teacher/trec_qrel.txt",
        run_path="/content/rd_benchmark/nq/dev_msmarco_teacher/trec_run.txt",
    ),
    metrics={
        "mrr@10": tfr.keras.metrics.MRRMetric(topn=10),
        "mrr": tfr.keras.metrics.MRRMetric(),
        "ndcg@1": tfr.keras.metrics.NDCGMetric(topn=1),
        "ndcg@5": tfr.keras.metrics.NDCGMetric(topn=5),
        "ndcg": tfr.keras.metrics.NDCGMetric(),
    },
)
pd.DataFrame.from_dict({"nq_dev_msmarco_teacher": metrics}, orient="index")

100%|██████████| 51/51 [00:05<00:00,  9.97it/s]


Unnamed: 0,mrr@10,mrr,ndcg@1,ndcg@5,ndcg
nq_dev_msmarco_teacher,45.27,46.02,31.08,48.94,55.49


In [9]:
# Score the msmarco/dev run against its qrels and show the metrics as one row.
metrics = eval_dataset(
    ds=read_dataset(
        qrel_path="/content/rd_benchmark/msmarco/dev/trec_qrel.txt",
        run_path="/content/rd_benchmark/msmarco/dev/trec_run.txt",
    ),
    metrics={
        "mrr@10": tfr.keras.metrics.MRRMetric(topn=10),
        "mrr": tfr.keras.metrics.MRRMetric(),
        "ndcg@1": tfr.keras.metrics.NDCGMetric(topn=1),
        "ndcg@5": tfr.keras.metrics.NDCGMetric(topn=5),
        "ndcg": tfr.keras.metrics.NDCGMetric(),
    },
)
pd.DataFrame.from_dict({"msmarco_dev_teacher": metrics}, orient="index")

100%|██████████| 55/55 [00:05<00:00, 10.56it/s]


Unnamed: 0,mrr@10,mrr,ndcg@1,ndcg@5,ndcg
msmarco_dev_teacher,43.63,44.46,29.93,46.85,54.22


In [10]:
# Score the web30k/test run against its qrels. The MRR variants treat a
# document as relevant when its label is at least 3.
metrics = eval_dataset(
    ds=read_dataset(
        qrel_path="/content/rd_benchmark/web30k/test/trec_qrel.txt",
        run_path="/content/rd_benchmark/web30k/test/trec_run.txt",
    ),
    metrics={
        "mrr[rel >= 3.0]@10": BinaryMRRMetric(threshold=3, topn=10),
        "mrr[rel >= 3.0]": BinaryMRRMetric(threshold=3),
        "ndcg@1": tfr.keras.metrics.NDCGMetric(topn=1),
        "ndcg@5": tfr.keras.metrics.NDCGMetric(topn=5),
        "ndcg": tfr.keras.metrics.NDCGMetric(),
    },
)
pd.DataFrame.from_dict({"web30k_test_teacher": metrics}, orient="index")

100%|██████████| 50/50 [00:09<00:00,  5.53it/s]


Unnamed: 0,mrr[rel >= 3.0]@10,mrr[rel >= 3.0],ndcg@1,ndcg@5,ndcg
web30k_test_teacher,34.43,35.05,48.39,47.33,71.62


In [11]:
# Score the istella/test run against its qrels. The MRR variants treat a
# document as relevant when its label is at least 3.
metrics = eval_dataset(
    ds=read_dataset(
        qrel_path="/content/rd_benchmark/istella/test/trec_qrel.txt",
        run_path="/content/rd_benchmark/istella/test/trec_run.txt",
    ),
    metrics={
        "mrr[rel >= 3.0]@10": BinaryMRRMetric(threshold=3, topn=10),
        "mrr[rel >= 3.0]": BinaryMRRMetric(threshold=3),
        "ndcg@1": tfr.keras.metrics.NDCGMetric(topn=1),
        "ndcg@5": tfr.keras.metrics.NDCGMetric(topn=5),
        "ndcg": tfr.keras.metrics.NDCGMetric(),
    },
)
pd.DataFrame.from_dict({"istella_test_teacher": metrics}, orient="index")

100%|██████████| 77/77 [00:13<00:00,  5.88it/s]


Unnamed: 0,mrr[rel >= 3.0]@10,mrr[rel >= 3.0],ndcg@1,ndcg@5,ndcg
istella_test_teacher,84.54,84.59,71.69,67.92,81.69
