In [84]:
import json
import os
import re
from collections import Counter

import ir_datasets
import urllib3
from elasticsearch import Elasticsearch
from ir_datasets.formats.base import GenericQuery, GenericDoc, GenericQrel

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [7]:
# Connect to the local Elasticsearch node.
# The password is taken from the ELASTIC_PASSWORD environment variable so it
# never has to be written into the notebook; the original placeholder is kept
# as the fallback when the variable is not set.
es = Elasticsearch(
    ["https://localhost:9200"],
    http_auth=('elastic', os.environ.get('ELASTIC_PASSWORD', '<your password>')),
    use_ssl=True,
    verify_certs=False # Only set this to False for local testing
)

### Load the Dataset

In [10]:
# Load the base "beir/fiqa" dataset.
# NOTE(review): `dataset` is reassigned to "beir/fiqa/dev" in the following cell,
# so this handle is not the one the rest of the notebook works with.
dataset = ir_datasets.load("beir/fiqa")

In [20]:
# Work with the "dev" split of FiQA from here on
dataset = ir_datasets.load("beir/fiqa/dev")

# Materialise the iterators into lists so they can be indexed and reused.
#   docs    -> doc_id: document ID, text: document content
#   queries -> query_id: query ID, text: query content
#   qrels   -> query_id / doc_id pairs with a relevance label
#              (the iteration field is not needed for this analysis)
docs = [*dataset.docs_iter()]
queries = [*dataset.queries_iter()]
qrels = [*dataset.qrels_iter()]

In [22]:
# Inspect the record type produced by dataset.queries_iter()
type(queries[0])

ir_datasets.formats.base.GenericQuery

### Load all the Documents in the elastic search Index

In [None]:
# Create a new elastic search index

# es.indices.create(index = "fiqa")

In [None]:
# Add all the documents to the fiqa index. This takes a bit of time

# for doc in dataset.docs_iter():
#     res = es.index(index='fiqa', id=doc.doc_id, body={"text": doc})

### Baseline Performance of Default Elastic Search

We use Elasticsearch's Rank Eval API to benchmark the performance of our information retrieval system, taking Mean Reciprocal Rank @ 100 as the metric. For more information, refer to the [Rank Eval Documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-rank-eval.html) and [Mean Reciprocal Rank](https://en.wikipedia.org/wiki/Mean_reciprocal_rank).

In [24]:
def return_query_mapping(query_list: list[GenericQuery]) -> dict[str, str]:
    """Build a lookup table from query ID to query text.

    Args:
        query_list: query records exposing ``query_id`` and ``text`` attributes.

    Returns:
        Dictionary keyed by query ID, in the order the queries were given.
        If a query ID occurs more than once, the last text wins.
    """
    id_to_text = {}
    for query in query_list:
        id_to_text[query.query_id] = query.text
    return id_to_text

In [91]:
def create_evaluation_body(
    query_map: dict[str, str],
    qrel_map: dict[str, list[tuple[str, int]]],
    index_name: str = "fiqa",
    *,
    k: int = 100,
    relevant_rating_threshold: int = 1,
) -> dict:
    """Create the request body in the format required by the rank eval API.

    One request is generated per query: a ``match`` query on the ``text``
    field, together with the rated documents known for that query. The
    metric is mean reciprocal rank.

    Args:
        query_map: mapping between query ID and query content.
        qrel_map: mapping between query ID and a list of
            ``(doc id, doc relevance)`` tuples representing rated documents.
            Queries missing from this mapping get an empty ratings list.
        index_name: index that every rating's ``_index`` field refers to.
        k: value sent as the metric's ``k`` setting.
        relevant_rating_threshold: value sent as the metric's
            ``relevant_rating_threshold`` setting.

    Returns:
        Dictionary with the keys ``"requests"`` and ``"metric"``.
    """
    requests = []
    for query_id, query_text in query_map.items():
        ratings = [
            {"_index": index_name, "_id": doc_id, "rating": rating}
            for doc_id, rating in qrel_map.get(query_id, [])
        ]
        requests.append({
            "id": query_id,
            "request": {"query": {"match": {"text": query_text}}},
            "ratings": ratings,
        })

    return {
        "requests": requests,
        "metric": {
            "mean_reciprocal_rank": {
                "k": k,
                "relevant_rating_threshold": relevant_rating_threshold,
            }
        },
    }

In [54]:
# Query ID -> query text lookup used to build the rank-eval requests
query_map = return_query_mapping(queries)

In [55]:
# Group the relevance judgements by query:
# query ID -> list of (doc ID, relevance) tuples, in the order they appear in qrels
qrels_mapping = {}

for qrel in qrels:
    # setdefault creates the list the first time a query ID is seen
    qrels_mapping.setdefault(qrel.query_id, []).append((qrel.doc_id, qrel.relevance))

In [56]:
# Build the rank-eval body; ratings refer to the default "fiqa" index
eval_body = create_evaluation_body(query_map, qrels_mapping)

In [57]:
# Run the evaluation requests against the baseline "fiqa" index
response = es.rank_eval(index = "fiqa", body=eval_body, ignore_unavailable=True)

In [58]:
# Overall metric score returned by rank eval for the baseline index
response["metric_score"]

0.2946920634920635

### Strategy for Relevance Tuning

To improve the relevance of our information retrieval system, we apply the following standard techniques: stemming, lowercasing, and removing stop words from the index. The implementation can be found in `squirro_chatbot/elastic_search_configs/analyser_config.json`.

### Figure out Optimal Stop words

We look at the most common words appearing in the queries and remove them from the index, so that documents with many occurrences of these words are not retrieved merely because they contain them.

In [77]:
# Number of most frequent query words to display
TOP_N_WORDS = 35

queries_text = [q.text for q in queries]

# Tokenize and normalize the text: lowercase each query and split it into
# runs of word characters
words = [re.findall(r'\b\w+\b', text.lower()) for text in queries_text]

# Flatten the list of lists into a single list of words
flat_words = [word for sublist in words for word in sublist]

# Optional: words to exclude from the count. Left empty on purpose so that the
# raw frequencies below can be used to pick stop-word candidates.
# stop_words = set(["is", "are", "you", "your", "what", "how", "do", "this"])
stop_words = set()

# Filter out stop words
filtered_words = [word for word in flat_words if word not in stop_words]

# Count occurrences
word_counts = Counter(filtered_words)

# Display the TOP_N_WORDS most common words
most_common_words = word_counts.most_common(TOP_N_WORDS)

print("Most common words:", most_common_words)

Most common words: [('a', 205), ('to', 178), ('the', 151), ('in', 104), ('is', 101), ('how', 101), ('of', 98), ('what', 98), ('i', 97), ('for', 93), ('stock', 64), ('and', 60), ('can', 58), ('do', 57), ('on', 50), ('my', 48), ('are', 42), ('from', 39), ('it', 38), ('with', 36), ('or', 35), ('an', 34), ('tax', 34), ('does', 34), ('why', 33), ('s', 32), ('money', 30), ('if', 30), ('price', 29), ('when', 28), ('company', 26), ('interest', 24), ('be', 24), ('that', 23), ('credit', 23)]


### Reindex Elastic Search

In [86]:
# Load the new Elasticsearch index config (custom analyser).
# Please adjust the path below to the location of the repository on your machine.
ANALYSER_CONFIG_PATH = '/Users/tushargoel/Desktop/code/squirro/squirro_chatbot/squirro_chatbot/elastic_search_configs/analyser_config.json'

# Pass the encoding explicitly: open() otherwise falls back to the platform's
# locale encoding, which is not necessarily UTF-8.
with open(ANALYSER_CONFIG_PATH, encoding='utf-8') as f:
    config = json.load(f)

In [None]:
# new_index_name = 'fiqa_analyser'

# # Create the new index with settings and mappings from config.json
# es.indices.create(index=new_index_name, body=config)

In [None]:
# reindex_body = {
#   "source": {
#     "index": "fiqa"
#   },
#   "dest": {
#     "index": new_index_name
#   }
# }

# es.reindex(body=reindex_body, wait_for_completion=True)

### Results of Reindexed Elastic Search

In [92]:
# Same queries and judgements as before, but the ratings now refer to the "fiqa_analyser" index
eval_body = create_evaluation_body(query_map, qrels_mapping, index_name="fiqa_analyser")

In [93]:
# Run the evaluation requests against the "fiqa_analyser" index
response = es.rank_eval(index = "fiqa_analyser", body=eval_body, ignore_unavailable=True)

In [94]:
# Overall metric score returned by rank eval for the "fiqa_analyser" index
print(response["metric_score"])

0.33321182740227345
