In [1]:
!pip install beir
!pip install elasticsearch
!pip install -U sentence-transformers
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting beir
  Downloading beir-1.0.1.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.3/50.3 KB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss_cpu
  Downloading faiss_cpu-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting elasticsearch==7.9.1

In [2]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval import models

import pathlib, os, string
from tqdm.notebook import tqdm

import time
import numpy as np
import pandas as pd
import random

from sentence_transformers import SentenceTransformer

import torch

# Prefer the GPU whenever torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f"Running on {device}")

  from tqdm.autonotebook import tqdm


Running on cuda


## Download and set up the Elasticsearch instance

In [3]:
%%bash

# Stop at the first failing command so a failed download or a checksum
# mismatch can never be followed by extraction.
set -euo pipefail

ES_ARCHIVE=elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
ES_BASE_URL=https://artifacts.elastic.co/downloads/elasticsearch

wget -q "${ES_BASE_URL}/${ES_ARCHIVE}"
wget -q "${ES_BASE_URL}/${ES_ARCHIVE}.sha512"

# Verify the archive BEFORE unpacking anything from it.
shasum -a 512 -c "${ES_ARCHIVE}.sha512"

tar -xzf "${ES_ARCHIVE}"
# Give the install tree to the `daemon` account, which is the user the server
# is started as in the next cell.
sudo chown -R daemon:daemon elasticsearch-7.9.2/

elasticsearch-oss-7.9.2-linux-x86_64.tar.gz: OK


In [4]:
%%bash --bg

# Start the Elasticsearch server as the `daemon` user (the account that was given
# ownership of elasticsearch-7.9.2/ in the download cell). The cell magic's --bg
# flag keeps it running in the background so the notebook can continue.
sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch

In [5]:
def wait_for_elasticsearch(url="http://localhost:9200/", timeout_s=120, poll_interval_s=1.0):
  """Block until an HTTP GET on `url` succeeds.

  Replaces a fixed `time.sleep(20)`, which either wastes time or is too short
  when the server starts slowly.

  Args:
    url: Endpoint to probe; defaults to the local instance started above.
    timeout_s: Maximum number of seconds to keep retrying.
    poll_interval_s: Pause between two probes, in seconds.

  Raises:
    TimeoutError: if no probe succeeded within `timeout_s` seconds.
  """
  # Local import keeps this cell runnable on its own.
  import urllib.request

  deadline = time.monotonic() + timeout_s
  while True:
    try:
      with urllib.request.urlopen(url, timeout=5):
        return
    except OSError as err:
      # URLError/HTTPError, refused connections and socket timeouts are all
      # OSError subclasses; each one just means "not ready yet".
      if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Elasticsearch did not answer on {url} within {timeout_s} seconds"
        ) from err
      time.sleep(poll_interval_s)


# Wait for the background instance to accept requests before using it.
wait_for_elasticsearch()

In [6]:
%%bash

# Sanity check: list processes whose command line mentions elasticsearch, to
# confirm the background server (sudo wrapper + JVM) is running.
# Note: the grep process itself may also show up in this listing.
ps -ef | grep elasticsearch

root        1192    1190  0 08:14 ?        00:00:00 sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch
daemon      1193    1192 82 08:14 ?        00:00:16 /content/elasticsearch-7.9.2/jdk/bin/java -Xshare:auto -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -XX:+ShowCodeDetailsInExceptionMessages -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dio.netty.allocator.numDirectArenas=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.locale.providers=SPI,COMPAT -Xms1g -Xmx1g -XX:+UseG1GC -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -Djava.io.tmpdir=/tmp/elasticsearch-389877947211055475 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecoun

In [7]:
%%bash

# Ask the local node for its info document (name, cluster, version) to confirm
# the HTTP endpoint is up.
curl --silent --request GET "localhost:9200/"

{
  "name" : "ac5c7f503a6c",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "5K2jX0KkTl-vf_T4_nc_hg",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


# Data Loading

In [9]:
def download_dataset(dataset, split="test"):
  """Download a BEIR dataset archive and load one split of it.

  Args:
    dataset: Dataset name; it is interpolated into the archive URL
      (for example 'scifact').
    split: Split name forwarded to `GenericDataLoader.load`. Defaults to
      "test", the split this notebook evaluates on.

  Returns:
    The value returned by `GenericDataLoader(...).load(split=split)`; the
    notebook unpacks it as `(corpus, queries, qrels)`.
  """
  url = f'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip'
  out_dir = os.path.join(os.getcwd(), 'datasets')
  # download_and_unzip returns the extracted location, so the path is taken from
  # it instead of being precomputed (the old precomputed value was never used).
  data_path = util.download_and_unzip(url, out_dir)
  print(f'Dataset downloaded here: {data_path}')
  return GenericDataLoader(data_path).load(split=split)

# Fetch the SciFact dataset and unpack the loader's three outputs.
dataset = 'scifact'
(corpus, queries, qrels) = download_dataset(dataset=dataset)

/content/datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

Dataset downloaded here: /content/datasets/scifact


  0%|          | 0/5183 [00:00<?, ?it/s]

In [18]:
def embeddings(model, corpus, queries, qrels=None, score_function="dot"):
  """Retrieve with `model` and evaluate the results against relevance judgements.

  Despite its name, this function does not return embeddings: it runs retrieval
  and returns evaluation metrics.

  Args:
    model: Retrieval backend handed to `EvaluateRetrieval` (the notebook passes
      a BM25 or a dense exact-search instance).
    corpus: Documents to search, as loaded by `download_dataset`.
    queries: Queries to run, as loaded by `download_dataset`.
    qrels: Relevance judgements to score against. When omitted, the
      notebook-level `qrels` variable is used, which keeps existing
      three-argument calls working.
    score_function: Scoring function name forwarded to `EvaluateRetrieval`.

  Returns:
    A tuple `(k_values, scores)` where `scores` maps 'ndcg', 'map', 'recall'
    and 'precision' to the corresponding result of `retriever.evaluate`.

  Raises:
    NameError: if `qrels` is omitted and no notebook-level `qrels` exists.
  """
  if qrels is None:
    # Backward-compatible fallback: this function used to read the global directly.
    try:
      qrels = globals()["qrels"]
    except KeyError:
      raise NameError("name 'qrels' is not defined") from None

  retriever = EvaluateRetrieval(model, score_function=score_function)

  #### Retrieve results (format of results is identical to qrels)
  results = retriever.retrieve(corpus, queries)

  ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
  return retriever.k_values, {'ndcg': ndcg, 'map': _map, 'recall': recall, 'precision': precision}

In [28]:
def print_res(score_dict):
  """Pretty-print a nested mapping of metric name -> {label -> value}.

  For each metric the name is printed on its own line, followed by one
  tab-indented `label -> value` row per entry, then two blank lines.
  """
  for metric, per_cutoff in score_dict.items():
    rows = [f'\t{label}\t->\t{value}' for label, value in per_cutoff.items()]
    # Metric header and its rows go out as a single newline-joined chunk.
    print('\n'.join([metric, *rows]))
    print('\n')

# Sparse Embeddings with BM25

In [30]:
def sparse_embeddings_bm25(corpus, queries, index_name="scifact", hostname="localhost", initialize=True):
  """Evaluate BM25 lexical retrieval backed by an Elasticsearch instance.

  Args:
    corpus: Documents to index and search.
    queries: Queries to run against the index.
    index_name: Name of the Elasticsearch index to use.
    hostname: Host of the Elasticsearch instance.
    initialize: If True, delete any existing index with the same name and
      reindex all documents.

  Returns:
    The `(k_values, scores)` tuple produced by `embeddings`.
  """
  model = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
  return embeddings(model, corpus, queries)

# Run the BM25 baseline and report its metrics at each cutoff k.
(top_k_sparse, sparse_score_dict) = sparse_embeddings_bm25(corpus=corpus, queries=queries)
print(f'Retriever evaluation for k in: {top_k_sparse}')
print_res(score_dict=sparse_score_dict)

  0%|          | 0/5183 [00:00<?, ?docs/s]
que: 100%|██████████| 3/3 [00:13<00:00,  4.41s/it]


Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
ndcg
	NDCG@1	->	0.57667
	NDCG@3	->	0.63658
	NDCG@5	->	0.66524
	NDCG@10	->	0.69064
	NDCG@100	->	0.71337
	NDCG@1000	->	0.7212


map
	MAP@1	->	0.55594
	MAP@3	->	0.61432
	MAP@5	->	0.63124
	MAP@10	->	0.64374
	MAP@100	->	0.64918
	MAP@1000	->	0.6495


recall
	Recall@1	->	0.55594
	Recall@3	->	0.67928
	Recall@5	->	0.74789
	Recall@10	->	0.81978
	Recall@100	->	0.91922
	Recall@1000	->	0.98


precision
	P@1	->	0.57667
	P@3	->	0.24111
	P@5	->	0.162
	P@10	->	0.09067
	P@100	->	0.0104
	P@1000	->	0.00111




# Dense Embeddings with all-MiniLM-L6-v2

In [31]:
# NOTE(review): disabled experiment. The manual batch-encoding code below is
# wrapped in a string literal, so none of it runs; the cell only echoes the
# string. The dense pipeline actually evaluated is dense_embeddings_sbert.
# Consider deleting this cell or moving it into a function if it is still needed.
'''model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device
)

corpus_df = pd.DataFrame.from_dict(corpus, orient='index')
queries_df = pd.DataFrame.from_dict(queries, orient='index')
queries_df.columns = ['text']

batch_size = 128
documents_dense_values = np.empty([corpus_df.shape[0], 384])
queries_dense_values = np.empty([queries_df.shape[0], 384])

for i in tqdm(range(0, len(corpus), batch_size), desc='Documents Dense Embeddings'):
  documents_dense_values[i:i + batch_size] = model.encode(corpus_df.iloc[i:i + batch_size]["text"].tolist())

for i in tqdm(range(0, len(queries), batch_size), desc='Queries Dense Embeddings'):
  queries_dense_values[i:i + batch_size] = model.encode(queries_df.iloc[i:i + batch_size]["text"].tolist())'''

'model = SentenceTransformer(\n    \'sentence-transformers/all-MiniLM-L6-v2\',\n    device=device\n)\n\ncorpus_df = pd.DataFrame.from_dict(corpus, orient=\'index\')\nqueries_df = pd.DataFrame.from_dict(queries, orient=\'index\')\nqueries_df.columns = [\'text\']\n\nbatch_size = 128\ndocuments_dense_values = np.empty([corpus_df.shape[0], 384])\nqueries_dense_values = np.empty([queries_df.shape[0], 384])\n\nfor i in tqdm(range(0, len(corpus), batch_size), desc=\'Documents Dense Embeddings\'):\n  documents_dense_values[i:i + batch_size] = model.encode(corpus_df.iloc[i:i + batch_size]["text"].tolist())\n\nfor i in tqdm(range(0, len(queries), batch_size), desc=\'Queries Dense Embeddings\'):\n  queries_dense_values[i:i + batch_size] = model.encode(queries_df.iloc[i:i + batch_size]["text"].tolist())'

In [32]:
#np.dot(documents_dense_values, queries_dense_values.T)

In [33]:
def dense_embeddings_sbert(corpus, queries, model_name="all-MiniLM-L6-v2", batch_size=16):
  """Evaluate dense exact-search retrieval with a SentenceBERT model.

  Args:
    corpus: Documents to encode and search.
    queries: Queries to encode and run.
    model_name: Model identifier passed to `models.SentenceBERT`.
    batch_size: Batch size passed to the dense exact-search wrapper.

  Returns:
    The `(k_values, scores)` tuple produced by `embeddings`.
  """
  model = DRES(models.SentenceBERT(model_name), batch_size=batch_size)
  return embeddings(model, corpus, queries)

# Run the dense retriever and report its metrics at each cutoff k.
top_k_dense, dense_score_dict = dense_embeddings_sbert(corpus, queries)
# Report the cutoffs returned by THIS run (previously printed top_k_sparse,
# a copy-paste slip from the BM25 cell).
print(f'Retriever evaluation for k in: {top_k_dense}')
print_res(dense_score_dict)

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/324 [00:00<?, ?it/s]

Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
ndcg
	NDCG@1	->	0.50333
	NDCG@3	->	0.59673
	NDCG@5	->	0.62928
	NDCG@10	->	0.64508
	NDCG@100	->	0.67665
	NDCG@1000	->	0.68552


map
	MAP@1	->	0.48233
	MAP@3	->	0.56564
	MAP@5	->	0.58826
	MAP@10	->	0.59593
	MAP@100	->	0.60307
	MAP@1000	->	0.60343


recall
	Recall@1	->	0.48233
	Recall@3	->	0.66033
	Recall@5	->	0.73794
	Recall@10	->	0.78333
	Recall@100	->	0.925
	Recall@1000	->	0.99333


precision
	P@1	->	0.50333
	P@3	->	0.23778
	P@5	->	0.164
	P@10	->	0.08833
	P@100	->	0.01053
	P@1000	->	0.00112


