In [96]:
!pip install beir
!pip install elasticsearch
!pip install -U sentence-transformers
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [97]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval import models

import pathlib, os, string
from tqdm.notebook import tqdm

import spacy

import time
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp
import string

from sentence_transformers import SentenceTransformer

import torch

# Prefer the GPU whenever torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f"Application running on {device}")

Running on cpu


## Download and set up the Elasticsearch instance

In [98]:
%%bash

# Stop at the first failing command so a broken download or a checksum
# mismatch can never be followed by extraction.
set -euo pipefail

ES_ARCHIVE="elasticsearch-oss-7.9.2-linux-x86_64.tar.gz"
ES_BASE_URL="https://artifacts.elastic.co/downloads/elasticsearch"

wget -q "${ES_BASE_URL}/${ES_ARCHIVE}"
wget -q "${ES_BASE_URL}/${ES_ARCHIVE}.sha512"

# Verify the archive BEFORE unpacking it or changing ownership of its contents.
shasum -a 512 -c "${ES_ARCHIVE}.sha512"

tar -xzf "${ES_ARCHIVE}"
# The server is later started as the `daemon` user, so hand the tree over to it.
sudo chown -R daemon:daemon elasticsearch-7.9.2/

elasticsearch-oss-7.9.2-linux-x86_64.tar.gz: OK


In [99]:
%%bash --bg

# Launch the Elasticsearch server from the unpacked 7.9.2 directory as the
# `daemon` user (-u daemon) with HOME set to that user's home (-H). The cell is
# run with --bg so the server keeps running while later cells execute.
sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch

In [100]:
def wait_for_elasticsearch(url="http://localhost:9200/", timeout=120.0, poll_interval=2.0):
  """Block until the Elasticsearch HTTP endpoint answers, instead of sleeping blindly.

  Args:
    url: Address polled for readiness.
    timeout: Maximum number of seconds to keep polling.
    poll_interval: Seconds to wait between attempts (also used as the
      per-request timeout).

  Raises:
    TimeoutError: If no HTTP 200 response was received within ``timeout`` seconds.
  """
  from urllib.request import urlopen

  deadline = time.monotonic() + timeout
  while True:
    try:
      with urlopen(url, timeout=poll_interval) as response:
        if response.status == 200:
          return
    except OSError:
      # Connection refused / reset / HTTP error: the node is still booting, retry.
      pass
    if time.monotonic() >= deadline:
      raise TimeoutError(f"Elasticsearch did not answer at {url} within {timeout} seconds")
    time.sleep(poll_interval)

# Wait for the background instance to accept requests before using it.
wait_for_elasticsearch()

In [101]:
%%bash

# Sanity check: list running processes whose command line mentions
# "elasticsearch" to confirm the background instance was started.
ps -ef | grep elasticsearch

root       66239   66237  0 11:19 ?        00:00:00 sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch
daemon     66240   66239 75 11:19 ?        00:00:22 /content/elasticsearch-7.9.2/jdk/bin/java -Xshare:auto -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -XX:+ShowCodeDetailsInExceptionMessages -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dio.netty.allocator.numDirectArenas=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.locale.providers=SPI,COMPAT -Xms1g -Xmx1g -XX:+UseG1GC -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -Djava.io.tmpdir=/tmp/elasticsearch-4731648301965488944 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecou

In [103]:
%%bash

# Query the HTTP root endpoint on port 9200 (-s silences curl's progress output).
# The JSON reply shown below reports the node name and version details.
curl -sX GET "localhost:9200/"

{
  "name" : "9bd472976494",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "CPyqz8JJSGuWdW665Qvj9g",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


# Data Loading

In [104]:
def download_dataset(dataset, split="test"):
  """Download a BEIR dataset archive, unzip it and load one of its splits.

  Args:
    dataset: Dataset name; used to build the archive URL (``{dataset}.zip``).
    split: Split passed to ``GenericDataLoader.load``. Defaults to ``"test"``,
      which is what this function always loaded before.

  Returns:
    Whatever ``GenericDataLoader(...).load`` returns; the notebook unpacks it
    as ``(corpus, queries, qrels)``.
  """
  url = f'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip'
  # Archives are stored under ./datasets relative to the current working directory.
  out_dir = os.path.join(os.getcwd(), 'datasets')
  data_path = util.download_and_unzip(url, out_dir)
  print(f'Dataset downloaded here: {data_path}')
  return GenericDataLoader(data_path).load(split=split)

# Name of the BEIR dataset evaluated throughout the notebook.
dataset = 'scifact'
# Notebook-level corpus, queries and relevance judgements. Note that
# `embeddings` reads `qrels` from this scope rather than taking it as an argument
# from its callers.
corpus, queries, qrels = download_dataset(dataset)

Dataset downloaded here: /content/datasets/scifact


  0%|          | 0/5183 [00:00<?, ?it/s]

In [105]:
def embeddings(model, corpus, queries, relevance_judgments=None):
  """Retrieve with ``model`` and score the results against relevance judgements.

  Args:
    model: Retrieval model handed to ``EvaluateRetrieval``.
    corpus: Documents to search over.
    queries: Queries to run against ``corpus``.
    relevance_judgments: Judgements to evaluate against. When omitted, the
      notebook-level ``qrels`` is used, so existing three-argument calls behave
      exactly as before.

  Returns:
    A tuple ``(k_values, scores)`` where ``k_values`` is ``retriever.k_values``
    and ``scores`` is a dict with the keys ``'ndcg'``, ``'map'``, ``'recall'``
    and ``'precision'``.
  """
  # Make the dependency on the judgements explicit, but keep the old global fallback.
  judgments = qrels if relevance_judgments is None else relevance_judgments

  retriever = EvaluateRetrieval(model, score_function="dot")

  #### Retrieve results (format of results is identical to qrels)
  results = retriever.retrieve(corpus, queries)

  ndcg, _map, recall, precision = retriever.evaluate(judgments, results, retriever.k_values)
  return retriever.k_values, {'ndcg': ndcg, 'map': _map, 'recall': recall, 'precision': precision}

In [106]:
def print_res(score_dict):
  """Print every metric family followed by its tab-indented per-key values.

  Args:
    score_dict: Mapping of metric name to a mapping of key -> value.
      Each metric name is printed on its own line, each inner entry as
      ``<TAB>key<TAB>-><TAB>value``, and a blank separator follows each family.
  """
  for metric_name in score_dict:
    print(metric_name)
    per_key = score_dict[metric_name]
    for key in per_key:
      value = per_key[key]
      print(f'\t{key}\t->\t{value}')
    print('\n')

# Sparse Embeddings with BM25

In [107]:
# Load spaCy's "en_core_web_sm" pipeline; `pre_process` runs every text through it.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection taken from the loaded pipeline's language defaults.
stopwords = nlp.Defaults.stop_words
def clean_tokens(tokens):
  """Join the lower-cased lemmas of the meaningful tokens with single spaces.

  Stop-words and punctuation are dropped entirely. The previous version tested
  ``token not in stopwords``, which compares a spaCy ``Token`` object with plain
  strings and is therefore always true, so stop-words were never removed. It
  also emitted an empty string for every dropped token, leaving runs of spaces.

  Args:
    tokens: Iterable of spaCy tokens (anything exposing ``lemma_``, ``is_stop``
      and ``is_punct``), e.g. the ``Doc`` returned by ``nlp(text)``.

  Returns:
    A single string of lower-cased lemmas separated by one space.
  """
  return ' '.join(
      token.lemma_.lower()
      for token in tokens
      if not (token.is_stop or token.is_punct)
  )

In [108]:
def pre_process(text):
  """Turn one query or corpus entry into a cleaned, space-joined string.

  Args:
    text: Either a raw string, or a dict (or dict subclass) whose ``'text'``
      entry holds the raw string.

  Returns:
    The result of ``clean_tokens`` applied to the spaCy-parsed text.
  """
  # isinstance also accepts dict subclasses, which an exact `type(...) == dict`
  # comparison would have sent down the plain-string path.
  raw_text = text['text'] if isinstance(text, dict) else text
  return clean_tokens(nlp(raw_text))

def query_documents_preprocessing(documents, queries):
  """Pre-process all documents and queries in parallel worker processes.

  Args:
    documents: Mapping of document id -> corpus entry (a dict with a ``'text'``
      field, or a plain string).
    queries: Mapping of query id -> query string.

  Returns:
    A tuple ``(new_documents, new_queries)`` keyed like the inputs. Query values
    are the cleaned strings. Document entries that were dicts stay dicts: a copy
    of the original entry whose ``'text'`` is replaced by the cleaned string, so
    the other fields (e.g. the title) survive and the corpus keeps the
    mapping-per-document shape BEIR retrievers read from.
  """
  new_queries = {}
  new_documents = {}

  for label, items, res in zip(('Documents', 'Queries'), (documents, queries), (new_documents, new_queries)):
    with ProcessPoolExecutor(max_workers=mp.cpu_count()) as pool:
      with tqdm(total=len(items), leave=False, desc=f'{label} Pre-Processing') as progress:
        # Submit everything first. Calling result() right after each submit()
        # would wait for that single task and keep all other workers idle.
        futures = {key: pool.submit(pre_process, entry) for key, entry in items.items()}

        for key, future in futures.items():
          cleaned = future.result()
          original = items[key]
          if isinstance(original, dict):
            res[key] = {**original, 'text': cleaned}
          else:
            res[key] = cleaned
          progress.update()

  return new_documents, new_queries

In [None]:
def sparse_embeddings_bm25(corpus, queries, index_name="scifact", hostname="localhost", initialize=True):
  """Evaluate BM25 retrieval backed by the Elasticsearch instance.

  Args:
    corpus: Documents to index and search.
    queries: Queries to run.
    index_name: Name of the Elasticsearch index to use.
    hostname: Host of the Elasticsearch instance.
    initialize: True, will delete existing index with same name and reindex
      all documents.

  Returns:
    The ``(k_values, scores)`` tuple produced by ``embeddings``.
  """
  model = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
  return embeddings(model, corpus, queries)

processed_corpus, processed_queries = query_documents_preprocessing(corpus, queries)
# Search the pre-processed corpus with the pre-processed queries. The queries
# were previously passed as both arguments, so the corpus was never indexed.
top_k_sparse, sparse_score_dict = sparse_embeddings_bm25(processed_corpus, processed_queries)
print(f'Retriever evaluation for k in: {top_k_sparse}')
print_res(sparse_score_dict)

# Dense Embeddings with all-MiniLM-L6-v2

In [None]:
def dense_embeddings_sbert(corpus, queries):
  """Evaluate dense retrieval using the "all-MiniLM-L6-v2" SentenceBERT model.

  Args:
    corpus: Documents to search over.
    queries: Queries to run.

  Returns:
    The ``(k_values, scores)`` tuple produced by ``embeddings``.
  """
  encoder = models.SentenceBERT("all-MiniLM-L6-v2")
  exact_search = DRES(encoder, batch_size=16)
  return embeddings(exact_search, corpus, queries)

top_k_dense, dense_score_dict = dense_embeddings_sbert(corpus, queries)
# Report the cut-offs returned by the dense run (this used to print the sparse
# run's `top_k_sparse`).
print(f'Retriever evaluation for k in: {top_k_dense}')
print_res(dense_score_dict)

# Merging Sparse and Dense Embeddings

In [None]:
def merging(sparse_embeddings, dense_embeddings):
  """Unfinished stub for combining the sparse and dense retrieval approaches.

  NOTE(review): the body does not use `sparse_embeddings` or `dense_embeddings`.
  It retrieves with a `retriever` over the notebook-level `corpus` and `queries`
  and evaluates against the notebook-level `qrels`. No merging logic is present.

  Returns:
    ``(retriever.k_values, scores)`` where ``scores`` has the keys ``'ndcg'``,
    ``'map'``, ``'recall'`` and ``'precision'``.
  """


  # NOTE(review): `retriever` is not assigned in this function, and in the visible
  # notebook it only exists as a local inside `embeddings`. Unless a global
  # `retriever` is defined elsewhere, this line presumably raises NameError.
  results = retriever.retrieve(corpus, queries)

  ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
  return retriever.k_values, {'ndcg': ndcg, 'map': _map, 'recall': recall, 'precision': precision}