In [1]:
! pip install beir==2.0.0

Collecting beir==2.0.0
  Downloading beir-2.0.0.tar.gz (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytrec_eval (from beir==2.0.0)
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss_cpu (from beir==2.0.0)
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting elasticsearch==7.9.1 (from beir==2.0.0)
  Downloading elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers->beir==2.0.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from to

In [2]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

  from tqdm.autonotebook import tqdm


In [20]:
# Download the BEIR dataset archive and unzip it (the data is loaded in the next cell).
dataset = "trec-covid" # BEIR dataset name; also used below to build the paraphrased-query file name
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
out_dir = "datasets"
# The returned path is what GenericDataLoader is pointed at in the next cell.
data_path = util.download_and_unzip(url, out_dir)

datasets/trec-covid.zip:   0%|          | 0.00/70.5M [00:00<?, ?iB/s]

In [21]:
# Load corpus, queries, and qrels for the "test" split from the downloaded folder.
# Downstream code reads corpus[doc_id]['text'], treats queries[qid] as a string,
# and uses the keys of qrels as query ids.
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

  0%|          | 0/171332 [00:00<?, ?it/s]

In [22]:
import json
# Read the paraphrased queries for the current dataset from a JSON file that is
# expected to sit in the working directory.
paraphrase_path = f"{dataset}_query_paraphrased_gpt4o.json"
with open(paraphrase_path, encoding='utf-8') as fh:
    # Parsed JSON object; the next cell indexes it by query id.
    queries_para = json.load(fh)

In [23]:
# Replace query texts with their paraphrases ('query_p'), mutating `queries`
# in place; ids missing from the paraphrase file keep their original text.
for qid, paraphrase in queries_para.items():
    queries[qid] = paraphrase['query_p']

In [24]:
import numpy as np
from collections import defaultdict
import string

# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on each call, because preprocess runs for every document
# and every query.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalize ``text`` and split it into whitespace-delimited tokens.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted (not replaced by a space, so "covid-19" becomes "covid19"), and
    the result is split on runs of whitespace. Non-ASCII punctuation is not
    in ``string.punctuation`` and is therefore kept.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty if nothing but punctuation and
        whitespace was present.
    """
    return text.lower().translate(_PUNCT_TABLE).split()

def compute_stats(corpus, queries, qrels):
    """Summarize the size of a retrieval dataset.

    Args:
        corpus: Mapping of document id to a dict; only its ``'text'`` entry
            is read.
        queries: Mapping of query id to the query string.
        qrels: Mapping keyed by query id. Only the keys are used, to select
            which queries are counted.

    Returns:
        A dict with:
            ``num_docs``: number of entries in ``corpus``.
            ``num_queries``: number of query ids in ``qrels``.
            ``avg_doc_length``: mean token count per document.
            ``avg_query_length``: mean token count over the ``qrels`` query
                ids that are also present in ``queries``.
            ``num_unique_words``: number of distinct tokens in the corpus
                (queries do not contribute).
        Both averages are ``np.float64`` and are NaN when there is nothing
        to average.
    """
    num_docs = len(corpus)

    # qrels is keyed by query id, so its length is the number of judged queries.
    num_queries = len(qrels)

    # Tokenize every document exactly once and derive both the length list
    # and the vocabulary from the same token list.
    doc_lengths = []
    vocab = set()
    for doc_data in corpus.values():
        tokens = preprocess(doc_data['text'])
        doc_lengths.append(len(tokens))
        vocab.update(tokens)

    # Only judged queries that actually have text contribute to the average;
    # this can be fewer than num_queries if qrels has ids missing from queries.
    query_lengths = [len(preprocess(queries[qid])) for qid in qrels if qid in queries]

    # np.mean([]) returns NaN but emits a RuntimeWarning; return NaN directly
    # for empty inputs so the result is the same without the warning.
    empty_mean = np.float64("nan")
    avg_doc_length = np.mean(doc_lengths) if doc_lengths else empty_mean
    avg_query_length = np.mean(query_lengths) if query_lengths else empty_mean

    return {
        "num_docs": num_docs,
        "num_queries": num_queries,
        "avg_doc_length": avg_doc_length,
        "avg_query_length": avg_query_length,
        "num_unique_words": len(vocab)
    }


In [13]:
# Print summary statistics for the dataset that is currently loaded.
# NOTE(review): the saved output of this cell reports "nfcorpus", while the
# download cell sets dataset = "trec-covid"; the output appears to be left over
# from an earlier run. Re-running prints the stats for the current `dataset`.
print("Dataset: ", dataset)
print('-' * 50)
stats = compute_stats(corpus, queries, qrels)
print(stats)

Dataset:  nfcorpus
--------------------------------------------------
{'num_docs': 3633, 'num_queries': 323, 'avg_doc_length': np.float64(219.6559317368566), 'avg_query_length': np.float64(5.343653250773994), 'num_unique_words': 36735}


In [19]:
# Print summary statistics for the dataset that is currently loaded.
# NOTE(review): the saved output of this cell reports "scifact", while the
# download cell sets dataset = "trec-covid"; the output appears to be left over
# from an earlier run. Re-running prints the stats for the current `dataset`.
print("Dataset: ", dataset)
print('-' * 50)
stats = compute_stats(corpus, queries, qrels)
print(stats)

Dataset:  scifact
--------------------------------------------------
{'num_docs': 5183, 'num_queries': 300, 'avg_doc_length': np.float64(200.8105344395138), 'avg_query_length': np.float64(14.63), 'num_unique_words': 50571}


In [25]:
# Print summary statistics for the dataset that is currently loaded. `queries`
# has already been overwritten with the paraphrased texts at this point, so
# avg_query_length describes the paraphrases.
print("Dataset: ", dataset)
print('-' * 50)
stats = compute_stats(corpus, queries, qrels)
print(stats)

Dataset:  trec-covid
--------------------------------------------------
{'num_docs': 171332, 'num_queries': 50, 'avg_doc_length': np.float64(148.19161044054817), 'avg_query_length': np.float64(11.86), 'num_unique_words': 383990}
