In [1]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


### Import FAQ document with doc id
* Import
* Embed
* Index into ES Index

In [2]:
with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [3]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [4]:
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    doc['question_vector'] = model.encode(question)
    doc['text_vector'] = model.encode(text)
    doc['question_text_vector'] = model.encode(qt)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [02:38<00:00,  5.97it/s]


In [5]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 224.23it/s]


## Hybrid search example

### Hybrid search using Elasticsearch

Excerpts from [ES hybrid retrieval](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#_combine_approximate_knn_with_other_features):

You can perform hybrid retrieval by providing both the knn option and a query.

This search finds the global top ```k = 5``` vector matches, combines them with the matches from the match query, and finally returns the ```n``` top-scoring results. The knn and query matches are combined through a disjunction, as if you took a boolean or between them. The top k vector results represent the global nearest neighbors across all index shards.

The score of each hit is the sum of the knn and query scores. You can specify a ```boost``` value to give a weight to each score in the sum. In the example above, the scores will be calculated as:

score $= 0.9 *$ match_score $+ 0.1 *$ knn_score

In [7]:
course = "data-engineering-zoomcamp"

In [8]:
query = 'I just discovered the course. Can I still join it?'

In [9]:
v_q = model.encode(query)

Example using equal weights for both text and vector search

In [10]:
knn_query = {
    "field": "text_vector",
    "query_vector": v_q,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5,
    "filter": {
        "term": {
            "course": course
        }
    }
}

In [11]:
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question^3", "text", "section"],
                "type": "best_fields",
                "boost": 0.5,
            }
        },
        "filter": {
            "term": {
                "course": course
            }
        }
    }
}

In [12]:
response = es_client.search(
    index=index_name,
    query=keyword_query,
    knn=knn_query,
    size=5
)

In [13]:
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'Wd37b5EB0qdHcmM0gxPM',
  '_score': 36.424633,
  '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
   'section': 'General course-related questions',
   'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'id': '7842b56a',
   'question_vector': [0.0030358838848769665,
    -0.002387235639616847,
    0.03588167205452919,
    0.02099882997572422,
    -0.018282383680343628,
    0.06715091317892075,
    -0.10277319699525833,
    -0.11509542167186737,
    -0.06606754660606384,
    -0.00497332913801074,
    -0.002861765446141362,
    0.10543150454759598,
    -0.0008143145823851228,
    0.08418367803096771,
    0.02704714424908161,
    -0.03135376051068306,
    -0.0515432134270668,
    -0.04948992282152176,


## Hybrid search pipeline

Create a hybrid search pipeline that will loop through the ground-truth-data. 

For each record in ground truth data, it will:

* Create a vector version of the question.
* Pass the field for vector search, question, vectorised question, course into elastic_search_hybrid().
* The elastic_search_hybrid() does the following:
    * vector search (using knn) using vectorised question.
    * keyword search (using multi match) using question.
    * combines the results in search_query to return only the top 5 results using the hybrid scoring method. Note that both vector search and keyword search have equal weights of ```boost=0.5```.
    * combine the results into a list and return to caller.
* The document id from ground truth data and the search results returned will be compared and aggregated for relevancy evaluation later.

Once all records in ground truth data are processed, the relevancy scores are computed using Hit Rate and Mean Reciprocal Rank.

In [14]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [15]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [16]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [17]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [18]:
def elastic_search_hybrid(field, query, vector, course):
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "course": course
                }
            }
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": 5,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

### Hybrid search 1

For vector search, we specified `question_vector` as our vector search field.

In [19]:
def question_hybrid(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid('question_vector', question, v_q, course)

In [20]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [21]:
evaluate(ground_truth, question_hybrid)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [03:12<00:00, 24.09it/s]


{'hit_rate': 0.9234925437648585, 'mrr': 0.8481665586052878}

Comparing the Hit Rate and MRR scores from Hybrid search with the vector search result in Module 3:

ES knn on questions: `{'hit_rate': 0.773071104387292, 'mrr': 0.6666810748505158}`

Clearly there was a lot of improvement using Hybrid search!

### Hybrid search 2

For vector search, we specified `text_vector` as our vector search field.

In [22]:
def text_hybrid(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid('text_vector', question, v_q, course)

In [23]:
evaluate(ground_truth, text_hybrid)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [03:02<00:00, 25.34it/s]


{'hit_rate': 0.9234925437648585, 'mrr': 0.8461710251422809}

Again we compare the Hit Rate and MRR scores from Hybrid search with the vector search result in Module 3:

ES knn on texts: `{'hit_rate': 0.8286146531229739, 'mrr': 0.7062315395144454}`

Clearly there was a lot of improvement using Hybrid search, also notice that even with using a different vector search field `text_vector`, the hybrid search result do not differ much than using `question_vector`. This is different from module 3, where a vector search using different vector search fields produce rather different hit rates and MRR.

### Hybrid search 3

For vector search, we specified `question_text_vector` as our vector search field.

In [24]:
def question_text_hybrid(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid('question_text_vector', question, v_q, course)

evaluate(ground_truth, question_text_hybrid)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [03:03<00:00, 25.25it/s]


{'hit_rate': 0.9250054030689432, 'mrr': 0.8506231539514445}

Again we compare the Hit Rate and MRR scores from Hybrid search with the vector search result in Module 3:

ES knn on questions and answers: `{'hit_rate': 0.9172249837907932, 'mrr': 0.824306606152295}`

The `question_text_vector` produces the best results in both hybrid search and vector search from module 3.

## Reranking

By default, RRF isn't available in a free-tier subscription. But you can try to use 30-day trial or upgrade the subscription plan.

To use the Reciprocal rank fusion (RRF) score we need to pull the docker image with a more recent version of Elasticsearch:
```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.9.0
```

**NOTE**: I will skip the code chunks that uses Elasticsearch's own RRF as I'm not a subscriber

In [None]:
def elastic_search_hybrid_rrf(field, query, vector, course):
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "course": course
                }
            }
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": 5,
        "rank": {
            "rrf": {}
        },
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [None]:
course = "data-engineering-zoomcamp"

In [None]:
query = 'I just discovered the course. Can I still join it?'

In [None]:
v_q = model.encode(query)

In [None]:
elastic_search_hybrid_rrf('question_text_vector', query, v_q, course)

### RRF implementation

Another option to not use the Enterprise version is to create our own method that performs RRF. 

We will refer to the [RRF formula available here](https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html).
```python
score = 0.0
for q in queries:
    if d in result(q):
        score += 1.0 / ( k + rank( result(q), d ) )
return score

# where
# k is a ranking constant
# q is a query in the set of queries
# d is a document in the result set of q
# result(q) is the result set of q
# rank( result(q), d ) is d's rank within the result(q) starting from 1
```

The rank constant `k` determines how much influence documents in individual result sets per query have over the final ranked result set. A higher value indicates that lower ranked documents have more influence. This value must be greater than or equal to 1. Defaults to 60 in ES.

### Basic Example to compute RRF

Source: [RRF full example](https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html#rrf-full-example)

Say for text search, we have results:

|ranks | index id |
|------|----------|
|rank 1| _id 4 |
|rank 2| _id 3 |
|rank 3| _id 2 |
|rank 4| _id 1 |

and for vector search, we have results:
|ranks | index id |
|------|----------|
|rank 1| _id 3 |
|rank 2| _id 2 |
|rank 3| _id 1 |
|rank 4| _id 5 |

we set rank constant `k=1`, and compute the RRF for each index:
```python
# doc  | query     | knn       | score
_id: 1 = 1.0/(1+4) + 1.0/(1+3) = 0.4500
_id: 2 = 1.0/(1+3) + 1.0/(1+2) = 0.5833
_id: 3 = 1.0/(1+2) + 1.0/(1+1) = 0.8333
_id: 4 = 1.0/(1+1)             = 0.5000
_id: 5 =             1.0/(1+4) = 0.2000
```

Taking only top 3 results from RRF, we end with `_id: 3` as `_rank: 1`, `_id: 2` as `_rank: 2`, and `_id: 4` as `_rank: 3`. This ranking matches the result set from the original RRF search as expected.

## Hybrid search with Document Reranking using Reciprocal Rank Fusion (RRF)

We will be enhancing the hybrid search 3 method with RRF

### How our own RRF works in elastic_search_hybrid_rrf()

The RRF is used to rerank the search results for a **single query**:
* Retrieve vector-based results using ES search with knn_query
* Retrieve text-based results using ES search with keyword_query
* Initialise an empty dictonary `rrf_score` which stores key-value pairs of `_id`:`score`, `_id` is the unique ES index that is generated during indexing and not the doc_id.
* For each result in vector-based results (which is already sorted in top 10 results):
    * Compute the rrf score.
    * Write a key-value pair of `_id`:`score` into the dictonary `rrf_score`.
* For each result in text-based results (which is already sorted in top 10 results):
    * Compute the rrf score.
    * If `rrf_score[_id]` exists already, increment its rrf score, else write a key-value pair of `_id`:`score` into the dictonary `rrf_score`.
* Reorder the dictonary `rrf_score` based on `score` and return only the top 5 results' `doc_id` and `source` for this query.

In [25]:
def compute_rrf(rank, k=60):
    """ Our own implementation of the relevance score """
    return 1 / (k + rank)

def elastic_search_hybrid_rrf(field, query, vector, course, k=60):
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "course": course
                }
            }
        }
    }

    knn_results = es_client.search(
        index=index_name, 
        body={
            "knn": knn_query, 
            "size": 10
        }
    )['hits']['hits']
    
    keyword_results = es_client.search(
        index=index_name, 
        body={
            "query": keyword_query, 
            "size": 10
        }
    )['hits']['hits']
    
    rrf_scores = {}
    # Calculate RRF using vector search results
    for rank, hit in enumerate(knn_results):
        doc_id = hit['_id']
        rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    # Adding keyword search result scores
    for rank, hit in enumerate(keyword_results):
        doc_id = hit['_id']
        if doc_id in rrf_scores:
            rrf_scores[doc_id] += compute_rrf(rank + 1, k)
        else:
            rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    # Sort RRF scores in descending order
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Get top-K documents by the score
    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = es_client.get(index=index_name, id=doc_id)
        final_results.append(doc['_source'])
    
    return final_results

In [26]:
def question_text_hybrid_rrf(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid_rrf('question_text_vector', question, v_q, course)

evaluate(ground_truth, question_text_hybrid_rrf)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [04:08<00:00, 18.62it/s]


{'hit_rate': 0.9520207477847418, 'mrr': 0.8745911677833017}

We compare the Hit Rate and MRR scores from Hybrid search 3:

ES hybrid search scores: `{'hit_rate': 0.9250054030689432, 'mrr': 0.8506231539514445}`

Notice that both scores improved once RRF is implemented as part of Hybrid search

### Not part of original lesson in module 6.3

### Experimentation on RRF by reordering knn_search and keyword_search together

In [34]:
def compute_rrf(rank, k=60):
    """ Our own implementation of the relevance score """
    return 1 / (k + rank)

def elastic_search_hybrid_rrf(field, query, vector, course, k=60):
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": {
                "term": {
                    "course": course
                }
            }
        }
    }

    knn_results = es_client.search(
        index=index_name, 
        body={
            "knn": knn_query, 
            "size": 10
        }
    )['hits']['hits']
    
    keyword_results = es_client.search(
        index=index_name, 
        body={
            "query": keyword_query, 
            "size": 10
        }
    )['hits']['hits']

    # Reorder knn_results and keyword_results
    overall_dict = {}
    for r in knn_results:
        overall_dict[r['_id']] = r['_score']
    for r in keyword_results:
        overall_dict[r['_id']] = r['_score']
    overall_results = sorted(overall_dict.items(), key=lambda x: x[1], reverse=True)

    # print(overall_results[:3])
    
    rrf_scores = {}
    # Calculate RRF using vector search results
    # for rank, hit in enumerate(knn_results):
    #     doc_id = hit['_id']
    #     rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    # Adding keyword search result scores
    for rank, hit in enumerate(overall_results):
        # doc_id = hit['_id']
        doc_id = hit[0]
        if doc_id in rrf_scores:
            rrf_scores[doc_id] += compute_rrf(rank + 1, k)
        else:
            rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    # Sort RRF scores in descending order
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Get top-K documents by the score
    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = es_client.get(index=index_name, id=doc_id)
        final_results.append(doc['_source'])
    
    return final_results

In [35]:
def question_text_hybrid_rrf(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid_rrf('question_text_vector', question, v_q, course)

evaluate(ground_truth, question_text_hybrid_rrf)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [05:37<00:00, 13.71it/s]


{'hit_rate': 0.9152798789712556, 'mrr': 0.8371767163749005}

We compare this experimentation with Hit Rate and MRR scores from Hybrid search 3:

ES hybrid search scores: `{'hit_rate': 0.9250054030689432, 'mrr': 0.8506231539514445}`

Notice that both scores actually dropped slightly compared to the Hybrid search with RRF method.

**Conclusion: RRF without combining individual result sets + reordering them works better.**