In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import time
from elasticsearch import Elasticsearch, helpers


In [3]:
es = Elasticsearch("http://localhost:9200")
es.info().body

{'name': 'e22daf50a33f',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'VVaP-BPiSyKO_yho-aX1Hg',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [4]:
eval_dataset = load_dataset("nlplabtdtu/xquad_benchmark",split='train')

In [5]:
eval_dataset

Dataset({
    features: ['question', 'contexts'],
    num_rows: 1190
})

In [6]:
unique_context = []
for row in eval_dataset:
    # print(row)
    for context in row['contexts']:
        unique_context.append(context)
print(len(unique_context))
unique_context = list(set(unique_context))
print(len(unique_context))

5950
240


In [None]:
distil_sbert_model = SentenceTransformer('nlplabtdtu/distil-sbert-base-uncased')
sbert_70M_model = SentenceTransformer('nlplabtdtu/sbert-70M-cased')
sbert_30M_model = SentenceTransformer('nlplabtdtu/sbert-30M-uncased')
miniLM_model = SentenceTransformer('nlplabtdtu/sbert-all-MiniLM-L6-v2')
gte_small_model = SentenceTransformer('nlplabtdtu/gte-small')

In [8]:
if not es.indices.exists(index="eval_data_index"):
    try:
      es_index = {
        "mappings": {
          "properties": {

            "body": {
              "type": "text"
            },
            "body_distil_vector": {
              "type": "dense_vector",
              "dims": 768
            },
            "body_70M_vector": {
              "type": "dense_vector",
              "dims": 768
            },
            "body_30M_vector": {
              "type": "dense_vector",
              "dims": 512
            },
            "body_mini_vector": {
              "type": "dense_vector",
              "dims": 768
            },
            "body_gte_vector": {
              "type": "dense_vector",
              "dims": 384
            },
          }
        }
      }

      es.indices.create(index='eval_data_index', body=es_index, ignore=[400])
      bulk_data = []
      for i in range(len(unique_context)):
        distil_embedding = distil_sbert_model.encode(unique_context[i], show_progress_bar=False)
        sbert_70M_embedding = sbert_70M_model.encode(unique_context[i], show_progress_bar=False)
        sbert_30M_embedding = sbert_30M_model.encode(unique_context[i], show_progress_bar=False)
        miniLM_embedding = miniLM_model.encode(unique_context[i], show_progress_bar=False)
        gte_small_embedding = gte_small_model.encode(unique_context[i], show_progress_bar=False)
        bulk_data.append({
                "_index": 'eval_data_index',
                "_source": {
                    "body": unique_context[i],
                    "body_distil_vector": distil_embedding,
                    "body_70M_vector": sbert_70M_embedding,
                    "body_30M_vector": sbert_30M_embedding,
                    "body_mini_vector": miniLM_embedding,
                    "body_gte_vector": gte_small_embedding
                }
            })
      # print(bulk_data[0])

      helpers.bulk(es, bulk_data)

    except:
        print("During index an exception occured. Continue\n\n")

  es.indices.create(index='eval_data_index', body=es_index, ignore=[400])
  es.indices.create(index='eval_data_index', body=es_index, ignore=[400])


In [9]:
es.indices.refresh(index="eval_data_index")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [10]:
distil_embedding_questions = []
sbert_70M_embedding_questions = []
sbert_30M_embedding_questions = []
miniLM_embedding_questions = []
gte_small_embedding_questions = []
for row in eval_dataset:
    distil_embedding_questions.append(distil_sbert_model.encode(row['question'], show_progress_bar=False))
    sbert_70M_embedding_questions.append(sbert_70M_model.encode(row['question'], show_progress_bar=False))
    sbert_30M_embedding_questions.append(sbert_30M_model.encode(row['question'], show_progress_bar=False))
    miniLM_embedding_questions.append(miniLM_model.encode(row['question'], show_progress_bar=False))
    gte_small_embedding_questions.append(gte_small_model.encode(row['question'], show_progress_bar=False))

In [12]:
#BM25
start_time = time.time()
count_true = 0
for row in eval_dataset:
    inp_question = row['question']
    bm25 = es.search(
        index="eval_data_index", 
        body={"query": 
            {"match": {"body": inp_question }}
        },
        size=10
    )
    for hit in bm25['hits']['hits']:
        if hit['_source']['body'] == row['contexts'][-1]:
            count_true += 1
            break
end_time = time.time()
print("Total time:", end_time - start_time)
print("Accuracy:", count_true/len(eval_dataset))

  bm25 = es.search(


Total time: 21.417574167251587
Accuracy: 0.9840336134453781


In [22]:
import time
start_time = time.time()
count_true = 0
for i in range(len(eval_dataset)):
    # inp_question = row[i]['question']
    gte_small_embedding_question = gte_small_embedding_questions[i]
    sem_search = es.search(index="eval_data_index", body=
                       {
                            "query": {
                                "script_score": {
                                    "query" : {
                                        "match_all": {},
                                    },
                                    "script": {
                                        "source": "cosineSimilarity(params.query_vector, 'body_gte_vector') + 1.0", 
                                        "params": {
                                            "query_vector": gte_small_embedding_question
                                        }
                                    }
                                }
                            }
                        },
                        size=1
    )
    for hit in sem_search['hits']['hits']:
        if hit['_source']['body'] == eval_dataset[i]['contexts'][-1]:
            count_true += 1
            break
end_time = time.time()
print("Total time:", end_time - start_time)
print("Accuracy:", count_true/len(eval_dataset))

  sem_search = es.search(index="eval_data_index", body=


Total time: 14.167500257492065
Accuracy: 0.8025210084033614


In [31]:
import torch
start_time = time.time()
count_true = 0
for i in range(len(eval_dataset)):
    inp_question = eval_dataset[i]['question']
    bm25 = es.search(
        index="eval_data_index", 
        body={"query": 
            {"match": {"body": inp_question }}
        },
        size=10
    )

    encoded_contexts = [hit['_source']['body_gte_vector'] for hit in bm25['hits']['hits']]
    encoded_contexts = torch.tensor(encoded_contexts)
    contexts = [hit['_source']['body'] for hit in bm25['hits']['hits']]
    result = util.semantic_search(torch.tensor(gte_small_embedding_questions[i]), encoded_contexts, top_k=1)
    # print(result)
    for hit in result[0]:
        if contexts[int(hit['corpus_id'])] == eval_dataset[i]['contexts'][-1]:
            count_true += 1
            break
    # if contexts[int(result[0][0]['corpus_id'])] == eval_dataset[i]['contexts'][-1]:
    #     count_true += 1
    # else:
    #     print(f"predict: {contexts[int(result[0][0]['corpus_id'])]}")
    #     print(contexts)
    #     print(inp_question)
    #     break
end_time = time.time()
print("Total time:", end_time - start_time)
print("Accuracy:", count_true/len(eval_dataset))

  bm25 = es.search(


Total time: 31.20016384124756
Accuracy: 0.838655462184874
