In [5]:
# Extend the module search path so that modules located under ../src and
# ../utils can be imported by the cells below.
import sys
sys.path.extend(['../src', '../utils'])

In [3]:
import numpy as np
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from typing import Callable, Any
import pandas as pd
import text_retrieval_metrics
import elastic_search_engine

In [10]:
# Connect to the Elasticsearch node on localhost:9200 and display its
# cluster info as a quick connectivity check.
es_client = Elasticsearch(hosts=['http://localhost:9200'])
es_client.info()

ObjectApiResponse({'name': '8e1ef4b96af3', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'JTKSnG6gQqiQoN9BWFaXBQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Evaluating retrieval

In [2]:
# Read the ground-truth CSV and convert it to a list with one dict per row,
# which is the form the evaluation loop iterates over.
df_ground_truth = pd.read_csv('../data/ground_truth_data.csv')
ground_truth = df_ground_truth.to_dict('records')

In [3]:
def evaluatin_retrieval(elastic_searcher: elastic_search_engine.ElasticSearcher,
                        function_applied_to_document: Callable[[dict], Any]):
    """Run every ground-truth entry through a searcher and print retrieval metrics.

    Iterates over the module-level ``ground_truth`` list. For each entry, the
    query is built by ``function_applied_to_document(entry)`` and passed to
    ``elastic_searcher.search(input_argument=...)``. Each returned document is
    marked relevant when its ``'id'`` equals the entry's ``'doc_id'``.

    Args:
        elastic_searcher: Searcher whose ``search`` method returns an iterable
            of documents that can be indexed with ``'id'``.
        function_applied_to_document: Maps a ground-truth entry to the value
            handed to the searcher (e.g. raw text or an embedding).

    Returns:
        None. The hit rate (labelled 'Total relevance') and the MRR, as
        computed by ``text_retrieval_metrics``, are printed.
    """
    per_query_relevance = []
    for record in tqdm(ground_truth):
        expected_id = record['doc_id']
        query_input = function_applied_to_document(record)
        hits = elastic_searcher.search(input_argument=query_input)
        # One boolean per returned document, in ranking order.
        per_query_relevance.append([hit['id'] == expected_id for hit in hits])

    hit_rate_value = text_retrieval_metrics.hit_rate(per_query_relevance)
    print(f'Total relevance: {hit_rate_value}')
    mrr_value = text_retrieval_metrics.mrr(per_query_relevance)
    print(f'MRR: {mrr_value}')
    

### Semantic search

In [25]:
# Semantic searcher bound to the 'vague-actual' index.
# NOTE(review): presumably this index stores embeddings from the same model used to encode queries — confirm.
semantic_searcher = elastic_search_engine.ElasticSemanticSearcher(index_name='vague-actual')

In [4]:
# Sentence-transformers model used to embed the query text for semantic search.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [34]:
def encode_vague_part_of_entry(entry: dict) -> np.ndarray:
    """Embed the 'vague' text of a ground-truth entry with the module-level ``model``.

    Args:
        entry: Ground-truth record; must contain a ``'vague'`` key.

    Returns:
        The embedding returned by ``model.encode`` (a NumPy array under the
        default ``encode`` settings).

    Raises:
        KeyError: If ``entry`` has no ``'vague'`` key.
    """
    # The annotation is np.ndarray (the array type); np.array is only the
    # factory function and is not a valid type.
    return model.encode(entry['vague'])

In [36]:
# Evaluate semantic search on 'vague-actual', querying with the embedding of each entry's 'vague' text.
evaluatin_retrieval(semantic_searcher, encode_vague_part_of_entry)

  0%|          | 0/4075 [00:00<?, ?it/s]

Total relevance: 0.8768098159509202
MRR: 0.8103517382413096


#### Semantic search with "all-mpnet-base-v2"

In [10]:
# Semantic searcher bound to the 'vague-actual-mpnet' index.
# NOTE(review): presumably this index stores 'all-mpnet-base-v2' embeddings — confirm.
semantic_searcher_mpnet = elastic_search_engine.ElasticSemanticSearcher(index_name='vague-actual-mpnet')

In [11]:
# Sentence-transformers model used to embed queries for the mpnet variant of the evaluation.
model_mpnet = SentenceTransformer('all-mpnet-base-v2')



In [12]:
def encode_mpnet__vague_part_of_entry(entry: dict) -> np.ndarray:
    """Embed the 'vague' text of a ground-truth entry with the module-level ``model_mpnet``.

    Args:
        entry: Ground-truth record; must contain a ``'vague'`` key.

    Returns:
        The embedding returned by ``model_mpnet.encode`` (a NumPy array under
        the default ``encode`` settings).

    Raises:
        KeyError: If ``entry`` has no ``'vague'`` key.
    """
    # The annotation is np.ndarray (the array type); np.array is only the
    # factory function and is not a valid type.
    return model_mpnet.encode(entry['vague'])

In [13]:
# Evaluate semantic search on 'vague-actual-mpnet', querying with the mpnet embedding of each entry's 'vague' text.
evaluatin_retrieval(semantic_searcher_mpnet, encode_mpnet__vague_part_of_entry)

  0%|          | 0/4075 [00:00<?, ?it/s]

Total relevance: 0.9715337423312883
MRR: 0.932838445807771


### Keyword search

In [4]:
# Keyword searcher bound to the 'vague_actual_keyword' index.
keyword_searcher = elastic_search_engine.ElasticKeywordSearcher(index_name='vague_actual_keyword')

In [5]:
def take_vague_part_of_entry(entry: dict) -> str:
    """Return the value stored under the ``'vague'`` key of a ground-truth entry.

    The value is passed through unchanged, so it can be used directly as the
    query for keyword search.

    Raises:
        KeyError: If ``entry`` has no ``'vague'`` key.
    """
    vague_text = entry['vague']
    return vague_text

In [6]:
# Evaluate keyword search on 'vague_actual_keyword', querying with each entry's raw 'vague' text.
evaluatin_retrieval(keyword_searcher, take_vague_part_of_entry)

  0%|          | 0/4075 [00:00<?, ?it/s]

Total relevance: 0.9550920245398773
MRR: 0.9002699386503068
