In [5]:
# !pip install hydra-core beir

In [2]:
# Environment setup: limit CUDA-aware libraries to the device with index 5.
# NOTE(review): the index is hard-coded and presumably specific to the
# author's machine -- confirm it exists on the host before running.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [7]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Send INFO-level log records through BEIR's LoggingHandler (prints debug information to stdout)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[LoggingHandler()],
)

#### Download the scifact.zip archive and unzip it into the "data" directory
dataset = "scifact"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
data_path = util.download_and_unzip(url, "data")

#### Load corpus, queries and qrels for the test split from the unzipped folder
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

#### Wrap the SBERT model in exact dense search and retrieve using dot-product scores
model = DRES(
    models.SentenceBERT("msmarco-distilbert-base-tas-b"),
    batch_size=16,
)
retriever = EvaluateRetrieval(model, score_function="dot")  # pass "cos_sim" for cosine similarity instead
results = retriever.retrieve(corpus, queries)

#### Evaluate the run with NDCG@k, MAP@k, Recall@k and Precision@k for k in retriever.k_values ([1,3,5,10,100,1000])
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

2024-04-15 08:51:27 - Downloading scifact.zip ...


data/scifact.zip: 100%|███████████████████████████████████████████████████████████| 2.69M/2.69M [00:01<00:00, 1.81MiB/s]


2024-04-15 08:51:31 - Unzipping scifact.zip ...
2024-04-15 08:51:31 - Loading Corpus...


100%|███████████████████████████████████████████████████████████████████████████| 5183/5183 [00:00<00:00, 118304.04it/s]


2024-04-15 08:51:31 - Loaded 5183 TEST Documents.
2024-04-15 08:51:31 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers

Batches: 100%|██████████████████████████████████████████████████████████████████████████| 19/19 [00:01<00:00, 16.57it/s]


2024-04-15 08:51:57 - Sorting Corpus by document length (Longest first)...
2024-04-15 08:51:57 - Scoring Function: Dot Product (dot)
2024-04-15 08:51:57 - Encoding Batch 1/1...


Batches: 100%|████████████████████████████████████████████████████████████████████████| 324/324 [00:17<00:00, 18.19it/s]


2024-04-15 08:52:16 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-04-15 08:52:16 - 

2024-04-15 08:52:16 - NDCG@1: 0.5333
2024-04-15 08:52:16 - NDCG@3: 0.5990
2024-04-15 08:52:16 - NDCG@5: 0.6215
2024-04-15 08:52:16 - NDCG@10: 0.6428
2024-04-15 08:52:16 - NDCG@100: 0.6698
2024-04-15 08:52:16 - NDCG@1000: 0.6811
2024-04-15 08:52:16 - 

2024-04-15 08:52:16 - MAP@1: 0.5086
2024-04-15 08:52:16 - MAP@3: 0.5730
2024-04-15 08:52:16 - MAP@5: 0.5892
2024-04-15 08:52:16 - MAP@10: 0.5992
2024-04-15 08:52:16 - MAP@100: 0.6046
2024-04-15 08:52:16 - MAP@1000: 0.6049
2024-04-15 08:52:16 - 

2024-04-15 08:52:16 - Recall@1: 0.5086
2024-04-15 08:52:16 - Recall@3: 0.6473
2024-04-15 08:52:16 - Recall@5: 0.6998
2024-04-15 08:52:16 - Recall@10: 0.7615
2024-04-15 08:52:16 - Recall@100: 0.8910
2024-04-15 08:52:16 - Recall@1000: 0.9833
2024-04-15 08:52:16 - 

2024-04-15 08:52:16 - P@1: 0.5333
2024-04-15 08:52:16