In [1]:
!pip install beir
!pip install elasticsearch
!pip install -U sentence-transformers
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting beir
  Downloading beir-1.0.1.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.3/50.3 kB[0m [31m853.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss_cpu
  Downloading faiss_cpu-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting elasticsearch==7.9.

In [2]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval import models

import pathlib, os, string
from tqdm.notebook import tqdm

import spacy

import time
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp
import string

from sentence_transformers import SentenceTransformer

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Application running on {device}")

  from tqdm.autonotebook import tqdm


Application running on cuda


## Download and setup the Elasticsearch instance

In [3]:
%%bash

wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512
tar -xzf elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
sudo chown -R daemon:daemon elasticsearch-7.9.2/
shasum -a 512 -c elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512 

elasticsearch-oss-7.9.2-linux-x86_64.tar.gz: OK


In [4]:
%%bash --bg

sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch

In [5]:
for i in tqdm(range(20), desc = 'Let the Elasticsearch instance start'): time.sleep(1)

Let the Elasticsearch instance start:   0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
%%bash

ps -ef | grep elasticsearch

root        1012    1006  0 14:51 ?        00:00:00 sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch
daemon      1013    1012 76 14:51 ?        00:00:15 /content/elasticsearch-7.9.2/jdk/bin/java -Xshare:auto -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -XX:+ShowCodeDetailsInExceptionMessages -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dio.netty.allocator.numDirectArenas=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.locale.providers=SPI,COMPAT -Xms1g -Xmx1g -XX:+UseG1GC -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -Djava.io.tmpdir=/tmp/elasticsearch-2098877689888572733 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecou

In [7]:
%%bash

curl -sX GET "localhost:9200/"

{
  "name" : "e9a5dfd60fbd",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "v7nHvv9lS1OFw5x0XVh5wQ",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


# Data Loading

In [8]:
def download_dataset(dataset):
  data_path = f'datasets/{dataset}'
  url = f'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip'
  out_dir = os.path.join(os.getcwd(), 'datasets')
  data_path = util.download_and_unzip(url, out_dir)
  print(f'Dataset downloaded here: {data_path}')
  return GenericDataLoader(data_path).load(split="test")

dataset = 'scifact'
corpus, queries, qrels = download_dataset(dataset)

/content/datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

Dataset downloaded here: /content/datasets/scifact


  0%|          | 0/5183 [00:00<?, ?it/s]

In [9]:
def embeddings(model, corpus, queries):
  retriever = EvaluateRetrieval(model, score_function='dot')

  #### Retrieve dense results (format of results is identical to qrels)
  results = retriever.retrieve(corpus, queries)

  ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
  return results, retriever.k_values, {'ndcg': ndcg, 'map': _map, 'recall': recall, 'precision': precision}

In [10]:
def print_res(score_dict):
  for score, res in score_dict.items():
    print(score)
    for k, r in res.items():
      print(f'\t{k}\t->\t{r}')
    print('\n')

# Sparse Embeddings with BM25

In [11]:
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words
clean_tokens = lambda tokens : ' '.join([token.lemma_.lower() if token not in stopwords and not token.is_punct else '' for token in tokens])

In [12]:
def pre_process(elem_to_preprocess):
  key, val = elem_to_preprocess
  if type(val) is dict:
    return key, {
        'title': clean_tokens(nlp(val['title'])),
        'text': clean_tokens(nlp(val['text']))
    }
  else: return key, clean_tokens(nlp(val))

def query_documents_preprocessing(documents, queries):
  new_queries = {}
  new_documents = {}

  for text, iter, res in zip(('Documents', 'Queries'), (documents, queries), (new_documents, new_queries)):
    with ProcessPoolExecutor(max_workers=mp.cpu_count()) as executor:
      for id, query_doc in list(tqdm(executor.map(pre_process, iter.items()), total=len(iter.items()), desc=f'{text} Pre-Processing')):
        res[id] = query_doc

  return new_documents, new_queries
  
processed_corpus, processed_queries = query_documents_preprocessing(corpus, queries)

Documents Pre-Processing:   0%|          | 0/5183 [00:00<?, ?it/s]

Queries Pre-Processing:   0%|          | 0/300 [00:00<?, ?it/s]

In [13]:
def sparse_embeddings_bm25(corpus, queries):
  hostname = 'localhost' 
  index_name = 'scifact' 
  initialize = True # True, will delete existing index with same name and reindex all documents

  model = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
  return embeddings(model, corpus, queries)

sparse_vector, k_values_sparse, sparse_score_dict = sparse_embeddings_bm25(processed_corpus, processed_queries)
print(f'\nSparse retrieved evaluation for k in: {k_values_sparse}')
print_res(sparse_score_dict)

  0%|          | 0/5183 [00:00<?, ?docs/s]
que: 100%|██████████| 3/3 [00:17<00:00,  5.83s/it]


Sparse retrieved evaluation for k in: [1, 3, 5, 10, 100, 1000]
ndcg
	NDCG@1	->	0.58
	NDCG@3	->	0.64035
	NDCG@5	->	0.67004
	NDCG@10	->	0.69265
	NDCG@100	->	0.71437
	NDCG@1000	->	0.72303


map
	MAP@1	->	0.55928
	MAP@3	->	0.61821
	MAP@5	->	0.63546
	MAP@10	->	0.64661
	MAP@100	->	0.65181
	MAP@1000	->	0.65219


recall
	Recall@1	->	0.55928
	Recall@3	->	0.68261
	Recall@5	->	0.75456
	Recall@10	->	0.81867
	Recall@100	->	0.91422
	Recall@1000	->	0.98


precision
	P@1	->	0.58
	P@3	->	0.24222
	P@5	->	0.16333
	P@10	->	0.09033
	P@100	->	0.01033
	P@1000	->	0.00111







# Dense Embeddings with SentenceBert and all-MiniLM-L6-v2

In [14]:
def dense_embeddings_sbert(corpus, queries):
  model = DRES(models.SentenceBERT('all-MiniLM-L6-v2'), batch_size=16)
  return embeddings(model, corpus, queries)

dense_vector, k_values_dense, dense_score_dict = dense_embeddings_sbert(corpus, queries)
print(f'\nDense etrieved evaluation for k in: {k_values_dense}')
print_res(dense_score_dict)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/324 [00:00<?, ?it/s]


Dense etrieved evaluation for k in: [1, 3, 5, 10, 100, 1000]
ndcg
	NDCG@1	->	0.50333
	NDCG@3	->	0.59673
	NDCG@5	->	0.62928
	NDCG@10	->	0.64508
	NDCG@100	->	0.67665
	NDCG@1000	->	0.68552


map
	MAP@1	->	0.48233
	MAP@3	->	0.56564
	MAP@5	->	0.58826
	MAP@10	->	0.59593
	MAP@100	->	0.60307
	MAP@1000	->	0.60343


recall
	Recall@1	->	0.48233
	Recall@3	->	0.66033
	Recall@5	->	0.73794
	Recall@10	->	0.78333
	Recall@100	->	0.925
	Recall@1000	->	0.99333


precision
	P@1	->	0.50333
	P@3	->	0.23778
	P@5	->	0.164
	P@10	->	0.08833
	P@100	->	0.01053
	P@1000	->	0.00112




# Merging Sparse and Dense Embeddings

In [15]:
def merging(sparse_vector, dense_vector):
  merged_results = {}
  k_values = [1, 3, 5, 10, 100, 1000]

  for (query, sparse_dic), (_, dense_dic) in tqdm(zip(sparse_vector.items(), dense_vector.items()), total=len(sparse_vector.items()), desc='Obtaining the merged vector'):
    merged_results[query] = {doc_id: sparse_dic.get(doc_id, 0) + 1000 * dense_dic.get(doc_id, 0) for doc_id in set(sparse_dic) | set(dense_dic)}

  ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, merged_results, k_values=k_values)
  return merged_results, k_values, {'ndcg': ndcg, 'map': _map, 'recall': recall, 'precision': precision}

merged_vector, k_values_merged, merged_scores_dict = merging(sparse_vector, dense_vector)
print(f'\nMerged retrieved evaluation for k in: {k_values_merged}')
print_res(merged_scores_dict)

Obtaining the merged vector:   0%|          | 0/300 [00:00<?, ?it/s]


Merged retrieved evaluation for k in: [1, 3, 5, 10, 100, 1000]
ndcg
	NDCG@1	->	0.54
	NDCG@3	->	0.62265
	NDCG@5	->	0.65219
	NDCG@10	->	0.66939
	NDCG@100	->	0.69986
	NDCG@1000	->	0.70808


map
	MAP@1	->	0.519
	MAP@3	->	0.59481
	MAP@5	->	0.61583
	MAP@10	->	0.62439
	MAP@100	->	0.63121
	MAP@1000	->	0.63158


recall
	Recall@1	->	0.519
	Recall@3	->	0.67728
	Recall@5	->	0.74628
	Recall@10	->	0.795
	Recall@100	->	0.93167
	Recall@1000	->	0.99333


precision
	P@1	->	0.54
	P@3	->	0.24333
	P@5	->	0.166
	P@10	->	0.08967
	P@100	->	0.0106
	P@1000	->	0.00112


