# Evaluating SPLADE (Sparse Lexical and Expansion Model for First Stage Ranking) with BEIR

This notebook gives a minimal usage example for evaluating SPLADE on the BEIR benchmark (https://github.com/UKPLab/beir).

Below are the BEIR performances of our distilled model (`../weights/distilsplade_max`) (see: https://arxiv.org/abs/2109.10086 for more details on training SPLADE with Margin-MSE distillation).

|      dataset      | NDCG@10 | Recall@100 |
|:-----------------:|:-------:|:---------:|
|      ArguAna      |  0.479  |   97.23   |
|   Climate-FEVER   |  0.235  |   52.43   |
|      DBPedia      |  0.435  |   57.52   |
|       FEVER       |  0.786  |   95.14   |
|    FiQA-2018      |  0.336  |   62.10   |
|      HotpotQA     |  0.684  |   82.03   |
|      NFCorpus     |  0.334  |   27.71   |
|         NQ        |  0.521  |   93.05   |
|       Quora       |  0.838  |   98.69   |
|      SCIDOCS      |  0.158  |   36.43   |
|      SciFact      |  0.693  |   92.03   |
|     TREC-COVID    |  0.710  |   54.98   |
|  Touché-2020 (v1) |  0.364  |   35.39   |
|    Avg. zero-shot |  0.506  |   66.89   |

In [1]:
from models import Splade, BEIRSpladeModel
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

2022-02-22 16:18:34.822398: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-22 16:18:34.822442: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Directory (or model identifier) holding the trained weights to evaluate.
#
# NOTE: this version of the code only supports the *max* aggregation in SPLADE,
# i.e. weights such as the two commented-out directories below.
# To evaluate the older weights ("../weights/flops_best" and
# "../weights/flops_efficient") on the BEIR benchmark, change the aggregation
# used in the SPLADE forward pass in models.py.

# model_type_or_dir = "../weights/distilsplade_max"
# model_type_or_dir = "../weights/splade_max"
model_type_or_dir = (
    "../output/"
    "distilSplade_0.1_0.08_vocab-transformers-msmarco-distilbert-word2vec256k-MLM_785k_emb_updated"
    "-batch_size_32-2022-02-21_20-51-33"
    "/10000/0_MLMTransformer"
)

In [3]:
# Load the tokenizer and the SPLADE weights from the same checkpoint location,
# then wrap both in the adapter that is handed to the BEIR retrieval code.

tokenizer = AutoTokenizer.from_pretrained(model_type_or_dir)

model = Splade(model_type_or_dir)
model.eval()  # put the model in evaluation mode before running inference

beir_splade = BEIRSpladeModel(model, tokenizer)

In [5]:
# Sanity check: as the last expression of the cell, the wrapped model's repr
# (its layer structure) is displayed as the cell output.
beir_splade.model

Splade(
  (transformer): DistilBertForMaskedLM(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(256000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FFN(
          

In [None]:
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir import util, LoggingHandler

# Name of the BEIR dataset to evaluate on; it drives both the download URL
# and the local output directory.
dataset = "nfcorpus"

# Download the zipped dataset and unpack it under `out_dir`.
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
out_dir = f"dataset/{dataset}"
data_path = util.download_and_unzip(url, out_dir)

# The data loader is pointed at the folder the dataset was unzipped to.
# That folder is expected to contain:
#   (1) nfcorpus/corpus.jsonl    (format: jsonlines)
#   (2) nfcorpus/queries.jsonl   (format: jsonlines)
#   (3) nfcorpus/qrels/test.tsv  (format: tsv ("\t"))
loader = GenericDataLoader(data_folder=data_path)
corpus, queries, qrels = loader.load(split="test")

In [None]:
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.evaluation import EvaluateRetrieval

# Retrieve with SPLADE through BEIR's exact-search wrapper, then score the run.
#
# Mixed precision is only requested when a CUDA device is actually present:
# asking for CUDA autocast on a CPU-only host just triggers a PyTorch warning
# and autocast is disabled anyway. Only the retrieval step (the part that runs
# the model) is placed inside the autocast context.
with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
    dres = DRES(beir_splade)
    retriever = EvaluateRetrieval(dres, score_function="dot")
    results = retriever.retrieve(corpus, queries)

# Metric computation only consumes `qrels` and `results` (no model call), so it
# does not need to run under autocast.
ndcg, map_, recall, p = EvaluateRetrieval.evaluate(qrels, results, [1, 10, 100, 1000])
results2 = EvaluateRetrieval.evaluate_custom(qrels, results, [1, 10, 100, 1000], metric="r_cap")

# Summary of the reported metrics for the selected dataset.
res = {"NDCG@10": ndcg["NDCG@10"],
       "Recall@100": recall["Recall@100"],
       "R_cap@100": results2["R_cap@100"]}
print("res for {}:".format(dataset), res, flush=True)