In [11]:
import pprint
import tqdm
from datasets import load_dataset

ds = load_dataset("hotpotqa/hotpot_qa", name="distractor", split="validation")
print(ds)
print(ds.column_names)
pprint.pprint(ds[0])

Dataset({
    features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
    num_rows: 7405
})
['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context']
{'answer': 'yes',
 'context': {'sentences': [['Ed Wood is a 1994 American biographical period '
                            'comedy-drama film directed and produced by Tim '
                            'Burton, and starring Johnny Depp as cult '
                            'filmmaker Ed Wood.',
                            " The film concerns the period in Wood's life when "
                            'he made his best-known films as well as his '
                            'relationship with actor Bela Lugosi, played by '
                            'Martin Landau.',
                            ' Sarah Jessica Parker, Patricia Arquette, Jeffrey '
                            'Jones, Lisa Marie, and Bill Murray are among the '
                            'supporting cast.'],
             

In [25]:

from collections import defaultdict

lengths = defaultdict(lambda: 0)

for example in ds:
    assert "question" in example
    lengths[len(example["context"]["sentences"])] += 1

pprint.pprint(lengths)

assert sum(lengths.values()) == 7405 # From the paper Table 1: https://arxiv.org/pdf/1809.09600

print("\n".join(ds[0]["context"]["sentences"][0]))


defaultdict(<function <lambda> at 0x3200b6020>,
            {2: 21,
             3: 7,
             4: 9,
             5: 6,
             6: 8,
             7: 2,
             8: 4,
             9: 3,
             10: 7345})
Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.
 The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.
 Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.


## BeIR subset

Microsoft's E5 paper used six datasets within BeIR for more in depth evaluation. We considered using these six, and they do load well. However, further analysis resulted in excluding these as they do not match the RAG scenario we are most interested in. 

In [47]:
e5_datasets = ["nfcorpus","nq","fiqa","quora","dbpedia","scifact"]

from datasets import load_dataset

for name in e5_datasets:
    ds = load_dataset(path=f"mteb/{name}", name="corpus")
    print(ds)

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 3633
    })
})


Downloading data: 100%|██████████| 1.46G/1.46G [01:12<00:00, 20.1MB/s]
Generating corpus split: 100%|██████████| 2681468/2681468 [00:01<00:00, 2096549.57 examples/s]


DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 2681468
    })
})


Downloading data: 100%|██████████| 47.0M/47.0M [00:03<00:00, 15.6MB/s]
Generating corpus split: 100%|██████████| 57638/57638 [00:00<00:00, 1101056.62 examples/s]


DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 57638
    })
})


Downloading data: 100%|██████████| 55.0M/55.0M [00:03<00:00, 17.8MB/s]
Generating corpus split: 100%|██████████| 522931/522931 [00:00<00:00, 5985938.27 examples/s]


DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 522931
    })
})


Downloading data: 100%|██████████| 1.79G/1.79G [01:27<00:00, 20.4MB/s]
Generating corpus split: 100%|██████████| 4635922/4635922 [00:01<00:00, 2896964.49 examples/s]


DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 4635922
    })
})


Downloading data: 100%|██████████| 8.02M/8.02M [00:00<00:00, 11.7MB/s]
Generating corpus split: 100%|██████████| 5183/5183 [00:00<00:00, 468807.61 examples/s]

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 5183
    })
})





## BeIR and MTEB datasets are difficult to reproduce.

### BeIR via HuggingFace
There are the 19 datasets in BeIR. Four of them (commented out below) require manual steps including scraping Twitter. Attempting to load the first listed dataset, msmarco, triggers a ValueError. The ValueError message seems to indicate that only two (of 19) datasets are available.


In [29]:
beir_names = [
    "msmarco",
    "trec-covid",
    "nfcorpus",
    # "bioasq",
    "nq",
    "hotpotqa",
    "fiqa",
    # "signal1m",
    # "trec-news",
    "arguana",
    "webis-touche2020",
    "cqadupstack",
    "quora",
    "dbpedia-entity",
    "scidocs",
    "fever",
    "climate-fever",
    "scifact",
    # "robust04",
]

import datasets
print(f"datasets library version: {datasets.__version__}")

from datasets import load_dataset

try:
    builder = load_dataset("BeIR/beir", name="msmarco")
except ValueError as e:
    print(e)

datasets library version: 2.20.0
BuilderConfig 'msmarco' not found. Available: ['fiqa', 'trec-covid', '']


### BeIR via the researcher's website is also difficult.

Trying to run their script results in timeouts. 

In [30]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

for name in beir_names:
    url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(name)
    data_path = util.download_and_unzip(url, "datasets")
    corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
    print(name, len(corpus), len(queries), len(qrels))

datasets/msmarco.zip:   3%|▎         | 30.6M/1.01G [00:04<02:19, 7.54MiB/s] 


KeyboardInterrupt: 