In [None]:
#| default_exp evaluation

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#|hide
from nbdev import *
from fastcore.test import *
from fastcore.utils import *

# evaluation
> Reference API for the evaluation function and evaluation metrics

In [None]:
#|export
import math
from typing import Dict, List, Union
from fastcore.utils import patch, patch_to
from pandas import DataFrame
from vespa.io import VespaQueryResponse
from vespa.application import Vespa
from learntorank.query import QueryModel, send_query, send_query_batch

## Metrics

Abstract and concrete classes related to evaluation metrics.

In [None]:
#|export
# Common base type of every evaluation metric. Concrete metrics in this notebook
# set `self.name` in their own `__init__` and provide `evaluate_query`.
class EvalMetric:
    def __init__(self) -> None:
        "Abstract class for evaluation metric."

In [None]:
#|export 
@patch
def evaluate_query(
    self: EvalMetric,
    query_results: VespaQueryResponse,  # Raw query results returned by Vespa.
    relevant_docs: List[Dict],  # Each dict contains a doc id and optionally a doc score.
    id_field: str,  # The Vespa field representing the document id.
    default_score: int = 0,  # Score to assign to the additional documents that are not relevant.
    detailed_metrics=False,  # Return intermediate computations if available.
) -> Dict:  # Metric values.
    "Abstract method to be implemented by metrics inheriting from `EvalMetric` to evaluate query results."
    # The base class carries no metric logic; every concrete metric overrides this.
    raise NotImplementedError

In [None]:
#|export
class MatchRatio(EvalMetric):
    def __init__(self) -> None:
        "Computes the ratio of documents retrieved by the match phase."
        super().__init__()
        # Key (and key prefix for the detailed values) under which
        # `evaluate_query` reports this metric.
        self.name = "match_ratio"

Instantiate the metric:

In [None]:
metric = MatchRatio()

In [None]:
#|export 
@patch
def evaluate_query(
    self: MatchRatio,
    query_results: VespaQueryResponse,  # Raw query results returned by Vespa.
    relevant_docs: List[Dict],  # Each dict contains a doc id and optionally a doc score. Not used by this metric.
    id_field: str,  # The Vespa field representing the document id. Not used by this metric.
    default_score: int = 0,  # Score to assign to the additional documents that are not relevant. Not used by this metric.
    detailed_metrics=False,  # Return intermediate computations if available.
) -> Dict:  # Returns the match ratio. In addition, if `detailed_metrics=True`, returns the number of retrieved docs `_retrieved_docs` and the number of docs available in the corpus `_docs_available`.
    "Evaluate query results according to match ratio metric."

    retrieved_docs = query_results.number_documents_retrieved
    docs_available = query_results.number_documents_indexed

    # Report 0 rather than dividing by zero when the response says no documents are indexed.
    value = retrieved_docs / docs_available if docs_available > 0 else 0

    prefix = str(self.name)
    metrics = {prefix: value}
    if detailed_metrics:
        # Expose the numerator and denominator next to the ratio itself.
        metrics[prefix + "_retrieved_docs"] = retrieved_docs
        metrics[prefix + "_docs_available"] = docs_available
    return metrics

In [None]:
#|hide
query_results = VespaQueryResponse(
    {"root": {"fields": {"totalCount": 1083},
              "coverage": {"documents": 62529}}
    }, 
    status_code=None, 
    url=None
)

Compute match ratio:

In [None]:
evaluation = metric.evaluate_query(
    query_results=query_results, 
    relevant_docs=None,
    id_field="vespa_id_field",
    default_score=0,
)
evaluation

{'match_ratio': 0.01731996353691887}

In [None]:
#|hide
test_eq(evaluation,
    {
        "match_ratio": 1083 / 62529,
    },
)

Return detailed metrics, in addition to match ratio:

In [None]:
evaluation = metric.evaluate_query(
    query_results=query_results,
    relevant_docs=None,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)
evaluation

{'match_ratio': 0.01731996353691887,
 'match_ratio_retrieved_docs': 1083,
 'match_ratio_docs_available': 62529}

In [None]:
#|hide
test_eq(
    evaluation,
    {
        "match_ratio_retrieved_docs": 1083,
        "match_ratio_docs_available": 62529,
        "match_ratio": 1083 / 62529,
    }
)

In [None]:
#|hide
# case without 'totalCount'
query_results = VespaQueryResponse(
    {
        "root": {
            "coverage": {
                "documents": 62529,
            },
        }
    }, 
    status_code=None, 
    url=None
)

In [None]:
#|hide
# case without 'totalCount'
evaluation = metric.evaluate_query(
    query_results=query_results,
    relevant_docs=None,
    id_field="vespa_id_field",
    default_score=0,
)

test_eq(
    evaluation,
    {
        "match_ratio": 0 / 62529,
    },
)

In [None]:
#|hide
# case without 'totalCount'
evaluation = metric.evaluate_query(
    query_results=query_results,
    relevant_docs=None,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)

test_eq(
    evaluation,
    {
        "match_ratio_retrieved_docs": 0,
        "match_ratio_docs_available": 62529,
        "match_ratio": 0 / 62529,
    },
)

In [None]:
#|hide
# case without "coverage": {"documents": 62529}
query_results=VespaQueryResponse({
                "root": {
                    "id": "toplevel",
                    "relevance": 1.0,
                    "fields": {"totalCount": 1083},
                    "coverage": {
                        "coverage": 100,
                        "full": True,
                        "nodes": 2,
                        "results": 1,
                        "resultsFull": 1,
                    },
                }
            }, status_code=None, url=None)

In [None]:
#|hide
# case without "coverage": {"documents": 62529}
evaluation = metric.evaluate_query(
    query_results=query_results,
    relevant_docs=None,
    id_field="vespa_id_field",
    default_score=0,
)

test_eq(
    evaluation,
    {
        "match_ratio": 0,
    },
)

In [None]:
#|hide
# case without "coverage": {"documents": 62529}
evaluation = metric.evaluate_query(
    query_results=query_results,
    relevant_docs=None,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)

test_eq(
    evaluation,
    {
        "match_ratio_retrieved_docs": 1083,
        "match_ratio_docs_available": 0,
        "match_ratio": 0,
    },
)


In [None]:
#|export
class Recall(EvalMetric):
    def __init__(
        self,
        at: int  # Maximum position on the resulting list to look for relevant docs.
    ) -> None:
        "Compute the recall at position `at`."
        super().__init__()
        # Cut-off position, also embedded in the reported metric name (e.g. "recall_10").
        self.at = at
        self.name = f"recall_{at}"

Instantiate the metric:

In [None]:
recall_1 = Recall(at=1)
recall_2 = Recall(at=2)
recall_3 = Recall(at=3)

In [None]:
#|export
@patch
def evaluate_query(
    self: Recall,
    query_results: VespaQueryResponse,  # Raw query results returned by Vespa.
    relevant_docs: List[Dict],  # Each dict contains a doc id and optionally a doc score.
    id_field: str,  # The Vespa field representing the document id.
    default_score: int = 0,  # Score assumed for relevant docs that do not carry a "score" key.
    detailed_metrics=False,  # Return intermediate computations if available. Not used by this metric.
) -> Dict:  # Returns the recall value.
    """
    Evaluate query results according to recall metric.

    There is an assumption that only documents with score > 0 are relevant. Recall is equal to zero in case no
    relevant documents with score > 0 is provided.
    """

    # Ids are compared as strings so that integer and string ids match each other.
    relevant_ids = {
        str(doc["id"])
        for doc in relevant_docs
        if doc.get("score", default_score) > 0
    }

    try:
        retrieved_ids = {
            str(hit["fields"][id_field]) for hit in query_results.hits[: self.at]
        }
    except KeyError:
        # NOTE(review): one hit lacking "fields"/`id_field` discards *all* retrieved ids,
        # so recall becomes 0 even if other hits are relevant. `ReciprocalRank` and NDCG
        # skip only the malformed hit instead. The notebook tests pin the current
        # behaviour, so it is preserved here -- confirm whether this is intended.
        retrieved_ids = set()

    if relevant_ids:
        recall = len(relevant_ids & retrieved_ids) / len(relevant_ids)
    else:
        # No relevant document with a positive score: recall is defined as 0.
        recall = 0
    return {str(self.name): recall}

In [None]:
#|hide
query_results = VespaQueryResponse({
    "root": {
        "children": [
            {
                "fields": {
                    "vespa_id_field": "ghi",
                },
            },
            {
                "fields": {
                    "vespa_id_field": "def",
                },
            },
        ],
    }
}, status_code=None, url=None)

In [None]:
#|hide
relevant_docs = [{"id": "def", "score": 1}, {"id": "abc", "score": 1}]

Compute recall:

In [None]:
evaluation = recall_2.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)
evaluation

{'recall_2': 0.5}

In [None]:
#|hide
test_eq(
    evaluation,
    {
        "recall_2": 0.5,
    },
)

In [None]:
#|hide
# same data as above but with recall_1
evaluation = recall_1.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "recall_1": 0.0,
    },
)

In [None]:
#|hide
# There is an additional third hit when compared to 'query_results'
query_results2 = VespaQueryResponse({
    "root": {
        "children": [
            {
                "fields": {
                    "vespa_id_field": "ghi",
                },
            },
            {
                "fields": {
                    "vespa_id_field": "def",
                },
            },
            {
                "fields": {
                    "vespa_id_field": "abc",
                },
            },
        ],
    }
}, status_code=None, url=None)

In [None]:
#|hide
# different relevant scores, score != 1
relevant_docs2 = [{"id": "ghi", "score": 1}, {"id": "abc", "score": 2}]

In [None]:
#|hide
evaluation = recall_3.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs2,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "recall_3": 1,
    },
)

In [None]:
#|hide
# id field is an integer
query_results_int_id = VespaQueryResponse({
    "root": {
        "children": [
            {
                "fields": {
                    "vespa_id_field": 1,
                },
            },
            {
                "fields": {
                    "vespa_id_field": 2,
                },
            },
            {
                "fields": {
                    "vespa_id_field": 3,
                },
            },
        ],
    }
}, status_code=None, url=None)

In [None]:
#|hide
relevant_docs_int_id = [{"id": 1, "score": 1}, {"id": 3, "score": 2}]

In [None]:
#|hide
evaluation = recall_3.evaluate_query(
    query_results=query_results_int_id,
    relevant_docs=relevant_docs_int_id,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "recall_3": 1,
    },
)

In [None]:
#|hide
# relevant docs containing score = 0
relevant_docs_with_zero_score = [{"id": "ghi", "score": 0}, {"id": "abc", "score": 2}]

In [None]:
#|hide
# test recall metric in the presence of score = 0 in the relevant docs
evaluation = recall_1.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs_with_zero_score,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "recall_1": 0,
    },
)

evaluation = recall_2.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs_with_zero_score,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "recall_2": 0,
    },
)

evaluation = recall_3.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs_with_zero_score,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "recall_3": 1,
    },
)

In [None]:
#|hide
# test recall metric with relevant docs containing only score = 0
evaluation = recall_3.evaluate_query(
    query_results=query_results2,
    relevant_docs=[{"id": "ghi", "score": 0}, {"id": "abc", "score": 0}],
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "recall_3": 0,
    },
)

In [None]:
#|hide
# missing 'fields'
query_results_empty_field = VespaQueryResponse({
    "root": {
        "children": [
            {
                "id": "ghi"
            },
            {
                "fields": {
                    "vespa_id_field": "def",
                },
            },
        ],
    }
}, status_code=None, url=None)

In [None]:
#|hide
relevant_docs = [{"id": "def", "score": 1}, {"id": "abc", "score": 1}]

Compute recall:

In [None]:
#|hide
evaluation = recall_2.evaluate_query(
    query_results=query_results_empty_field,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)

test_eq(
    evaluation,
    {
        "recall_2": 0.0,
    },
)

In [None]:
#|export
class ReciprocalRank(EvalMetric):
    def __init__(
        self,
        at: int  # Maximum position on the resulting list to look for relevant docs.
    ):
        "Compute the reciprocal rank at position `at`"
        super().__init__()
        # Cut-off position, also embedded in the reported metric name (e.g. "reciprocal_rank_3").
        self.at = at
        self.name = f"reciprocal_rank_{at}"

Instantiate the metric:

In [None]:
rr_1 = ReciprocalRank(at=1)
rr_2 = ReciprocalRank(at=2)
rr_3 = ReciprocalRank(at=3)

In [None]:
#|export
@patch
def evaluate_query(
    self: ReciprocalRank,
    query_results: VespaQueryResponse,  # Raw query results returned by Vespa.
    relevant_docs: List[Dict],  # Each dict contains a doc id and optionally a doc score.
    id_field: str,  # The Vespa field representing the document id.
    default_score: int = 0,  # Score assumed for relevant docs that do not carry a "score" key.
    detailed_metrics=False,  # Return intermediate computations if available. Not used by this metric.
) -> Dict:  # Returns the reciprocal rank value.
    """
    Evaluate query results according to reciprocal rank metric.

    There is an assumption that only documents with score > 0 are relevant.
    """

    # Ids are compared as strings so that integer and string ids match each other.
    relevant_ids = {
        str(doc["id"])
        for doc in relevant_docs
        if doc.get("score", default_score) > 0
    }

    rr = 0
    for rank, hit in enumerate(query_results.hits[: self.at], start=1):
        try:
            doc_id = str(hit["fields"][id_field])
        except KeyError:
            # A hit without "fields"/`id_field` cannot be matched; skip it but keep its rank slot.
            continue
        if doc_id in relevant_ids:
            # Only the first relevant hit counts.
            rr = 1 / rank
            break

    return {str(self.name): rr}

Compute reciprocal rank:

In [None]:
evaluation = rr_2.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)
evaluation

{'reciprocal_rank_2': 0.5}

In [None]:
#|hide
test_eq(
    evaluation,
    {
        "reciprocal_rank_2": 0.5,
    },
)

In [None]:
#|hide
evaluation = rr_1.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "reciprocal_rank_1": 0.0,
    },
)

In [None]:
#|hide
evaluation = rr_3.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs2,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "reciprocal_rank_3": 1.0,
    },
)

In [None]:
#|hide
evaluation = rr_3.evaluate_query(
    query_results=query_results_int_id,
    relevant_docs=relevant_docs_int_id,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "reciprocal_rank_3": 1.0,
    },
)

In [None]:
#|hide
evaluation = rr_1.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs_with_zero_score,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "reciprocal_rank_1": 0.0,
    },
)

In [None]:
#|hide
metric = ReciprocalRank(at=2)
evaluation = rr_2.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs_with_zero_score,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "reciprocal_rank_2": 0.0,
    },
)

In [None]:
#|hide
evaluation = rr_3.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs_with_zero_score,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "reciprocal_rank_3": 1 / 3,
    },
)

In [None]:
#|hide
evaluation = rr_2.evaluate_query(
    query_results=query_results_empty_field,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)
test_eq(
    evaluation,
    {
        "reciprocal_rank_2": 0.5,
    },
)

In [None]:
#|export
class NormalizedDiscountedCumulativeGain(EvalMetric):
    def __init__(
        self,
        at: int  # Maximum position on the resulting list to look for relevant docs.
    ):
        "Compute the normalized discounted cumulative gain at position `at`."
        super().__init__()
        # Cut-off position, also embedded in the reported metric name (e.g. "ndcg_3").
        self.at = at
        self.name = f"ndcg_{at}"

    @staticmethod
    def _compute_dcg(scores: List[int]) -> float:
        # The score at 0-based position `idx` is discounted by log2(idx + 2),
        # so the first position has a discount of log2(2) == 1.
        return sum(
            score / math.log2(position)
            for position, score in enumerate(scores, start=2)
        )

Instantiate the metric:

In [None]:
ndcg_1 = NormalizedDiscountedCumulativeGain(at=1)
ndcg_2 = NormalizedDiscountedCumulativeGain(at=2)
ndcg_3 = NormalizedDiscountedCumulativeGain(at=3)

In [None]:
#|export
@patch
def evaluate_query(
    self: NormalizedDiscountedCumulativeGain,
    query_results: VespaQueryResponse,  # Raw query results returned by Vespa.
    relevant_docs: List[Dict],  # Each dict contains a doc id and optionally a doc score.
    id_field: str,  # The Vespa field representing the document id.
    default_score: int = 0,  # Score to assign to the additional documents that are not relevant. Must be 0 for NDCG.
    detailed_metrics=False,  # Return intermediate computations if available.
) -> Dict:  # Returns the normalized discounted cumulative gain. In addition, if `detailed_metrics=True`, returns the ideal discounted cumulative gain `_ideal_dcg`, the discounted cumulative gain `_dcg`.
    """
    Evaluate query results according to normalized discounted cumulative gain.

    There is an assumption that documents returned by the query that are not included in the set of relevant
    documents have score equal to zero. Similarly, if the query returns a number `N < at` documents, we will
    assume that those `at - N` missing scores are equal to zero.
    """

    assert default_score == 0, "NDCG default score should be zero."

    at = self.at
    # The score is optional in the labeled data; fall back to `default_score`, as the
    # recall and reciprocal rank metrics do, instead of failing with a KeyError.
    relevant_scores = {
        str(doc["id"]): doc.get("score", default_score) for doc in relevant_docs
    }

    # Scores in the order the hits were returned; slots stay at `default_score`
    # for missing hits, unlabeled documents and hits without the id field.
    search_scores = [default_score] * at
    for idx, hit in enumerate(query_results.hits[:at]):
        try:
            doc_id = str(hit["fields"][id_field])
        except KeyError:
            continue
        search_scores[idx] = relevant_scores.get(doc_id, default_score)

    # Best achievable ordering: the labeled scores sorted descending, padded to length `at`.
    ideal_scores = sorted(relevant_scores.values(), reverse=True)[:at]
    ideal_scores.extend([default_score] * (at - len(ideal_scores)))

    ideal_dcg = self._compute_dcg(ideal_scores)
    dcg = self._compute_dcg(search_scores)

    # Without any positive label the ideal DCG is 0; report 0 rather than dividing by zero.
    ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0

    prefix = str(self.name)
    metrics = {prefix: ndcg}
    if detailed_metrics:
        metrics[prefix + "_ideal_dcg"] = ideal_dcg
        metrics[prefix + "_dcg"] = dcg
    return metrics


Compute NDCG:

In [None]:
metric = NormalizedDiscountedCumulativeGain(at=2)
evaluation = ndcg_2.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)
evaluation

{'ndcg_2': 0.38685280723454163}

In [None]:
#|hide
expected_dcg = 0 / math.log2(2) + 1 / math.log2(3)
expected_ideal_dcg = 1 / math.log2(2) + 1 / math.log2(3)
expected_ndcg = expected_dcg / expected_ideal_dcg

test_eq(
    evaluation,
    {
        "ndcg_2": expected_ndcg,
    },
)

Return detailed metrics, in addition to NDCG:

In [None]:
evaluation = ndcg_2.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)
evaluation

{'ndcg_2': 0.38685280723454163,
 'ndcg_2_ideal_dcg': 1.6309297535714575,
 'ndcg_2_dcg': 0.6309297535714575}

In [None]:
#|hide
test_eq(
    evaluation,
    {
        "ndcg_2_ideal_dcg": expected_ideal_dcg,
        "ndcg_2_dcg": expected_dcg,
        "ndcg_2": expected_ndcg,
    },
)

In [None]:
#|hide
evaluation = ndcg_1.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
)
expected_dcg = 0 / math.log2(2)
expected_ideal_dcg = 1 / math.log2(2)
expected_ndcg = 0
test_eq(
    evaluation,
    {
        "ndcg_1": expected_ndcg,
    },
)

In [None]:
#|hide 
evaluation = ndcg_1.evaluate_query(
    query_results=query_results,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)
test_eq(
    evaluation,
    {
        "ndcg_1_ideal_dcg": expected_ideal_dcg,
        "ndcg_1_dcg": expected_dcg,
        "ndcg_1": expected_ndcg,
    },
)

In [None]:
#|hide
evaluation = ndcg_3.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs2,
    id_field="vespa_id_field",
    default_score=0,
)
expected_dcg = 1 / math.log2(2) + 0 / math.log2(3) + 2 / math.log2(4)
expected_ideal_dcg = 2 / math.log2(2) + 1 / math.log2(3) + 0 / math.log2(4)
expected_ndcg = expected_dcg / expected_ideal_dcg
test_eq(
    evaluation,
    {
        "ndcg_3": expected_ndcg,
    },
)

In [None]:
#|hide
evaluation = ndcg_3.evaluate_query(
    query_results=query_results2,
    relevant_docs=relevant_docs2,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)
test_eq(
    evaluation,
    {
        "ndcg_3_ideal_dcg": expected_ideal_dcg,
        "ndcg_3_dcg": expected_dcg,
        "ndcg_3": expected_ndcg,
    },
)

In [None]:
#|hide
evaluation = ndcg_3.evaluate_query(
    query_results=query_results_int_id,
    relevant_docs=relevant_docs_int_id,
    id_field="vespa_id_field",
    default_score=0,
)
expected_dcg = 1 / math.log2(2) + 0 / math.log2(3) + 2 / math.log2(4)
expected_ideal_dcg = 2 / math.log2(2) + 1 / math.log2(3) + 0 / math.log2(4)
expected_ndcg = expected_dcg / expected_ideal_dcg
test_eq(
    evaluation,
    {
        "ndcg_3": expected_ndcg,
    },
)

In [None]:
#|hide
evaluation = ndcg_3.evaluate_query(
    query_results=query_results_int_id,
    relevant_docs=relevant_docs_int_id,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)
test_eq(
    evaluation,
    {
        "ndcg_3_ideal_dcg": expected_ideal_dcg,
        "ndcg_3_dcg": expected_dcg,
        "ndcg_3": expected_ndcg,
    },
)

In [None]:
#|hide
evaluation = ndcg_2.evaluate_query(
    query_results=query_results_empty_field,
    relevant_docs=relevant_docs,
    id_field="vespa_id_field",
    default_score=0,
    detailed_metrics=True,
)
expected_dcg = 0 / math.log2(2) + 1 / math.log2(3)
expected_ideal_dcg = 1 / math.log2(2) + 1 / math.log2(3)
expected_ndcg = expected_dcg / expected_ideal_dcg

test_eq(
    evaluation,
    {
        "ndcg_2_ideal_dcg": expected_ideal_dcg,
        "ndcg_2_dcg": expected_dcg,
        "ndcg_2": expected_ndcg,
    },
)

## Evaluate queries in batch

In [None]:
#|export
def _parse_labeled_data(
    df: DataFrame  # DataFrame with the following required columns ["qid", "query", "doc_id", "relevance"].
) -> List[Dict]:  # Concise representation of the labeled data, grouped by query_id and query.
    "Convert a DataFrame with labeled data to format used internally"
    required_columns = ["qid", "query", "doc_id", "relevance"]
    assert all(
        [x in list(df.columns) for x in required_columns]
    ), "DataFrame needs at least the following columns: {}".format(required_columns)
    qid_query = (
        df[["qid", "query"]].drop_duplicates(["qid", "query"]).to_dict(orient="records")
    )
    labeled_data = []
    for q in qid_query:
        docid_relevance = df[(df["qid"] == q["qid"]) & (df["query"] == q["query"])][
            ["doc_id", "relevance"]
        ]
        relevant_docs = []
        for idx, row in docid_relevance.iterrows():
            relevant_docs.append({"id": row["doc_id"], "score": row["relevance"]})
        data_point = {
            "query_id": q["qid"],
            "query": q["query"],
            "relevant_docs": relevant_docs,
        }
        labeled_data.append(data_point)
    return labeled_data

In [None]:
#|hide
labeled_data_df = DataFrame(
    data={
        "qid": [0, 0, 1, 1],
        "query": ["Intrauterine virus infections and congenital heart disease"]
        * 2
        + [
            "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
        ]
        * 2,
        "doc_id": [0, 3, 1, 5],
        "relevance": [1, 1, 1, 1],
    }
)
labeled_data = _parse_labeled_data(df=labeled_data_df)
expected_labeled_data = [
    {
        "query_id": 0,
        "query": "Intrauterine virus infections and congenital heart disease",
        "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
    },
    {
        "query_id": 1,
        "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
    },
]
test_eq(labeled_data, expected_labeled_data)

In [None]:
#|hide
# parse_labeled_data_with_wrong_columns
labeled_data_df = DataFrame(
    data={
        "qid": [0, 0, 1, 1],
        "doc_id": [0, 3, 1, 5],
        "relevance": [1, 1, 1, 1],
    }
)
test_fail(
    _parse_labeled_data, 
    kwargs={"df":labeled_data_df}, 
    contains="DataFrame needs at least the following columns: ['qid', 'query', 'doc_id', 'relevance']"
)

In [None]:
#|export
def _evaluate_query_retry(app, flat_labeled_data, model, timeout, **kwargs):
    """Send every query in `flat_labeled_data` and retry the ones that fail.

    `flat_labeled_data` holds tuples whose first element is the query string.
    Returns one response per input tuple, in the same order. Responses that still
    have a non-200 status after the retries are returned as they are.
    """
    max_retries = 3  # Retry rounds after the initial attempt.

    def _send(queries):
        # First attempt and retries share one code path (`send_query_batch`), so both
        # are sent with the same query model and the same request parameters.
        return send_query_batch(
            app=app,
            query_batch=queries,
            query_model=model,
            **{"ranking.softtimeout.enable": "false", "timeout": timeout},
            **kwargs,
        )

    def _failed_positions(responses):
        return [idx for idx, x in enumerate(responses) if x.status_code != 200]

    query_responses = _send([x[0] for x in flat_labeled_data])
    failed_queries = _failed_positions(query_responses)

    for _ in range(max_retries):
        if not failed_queries:
            break
        retry_query_responses = _send(
            [flat_labeled_data[idx][0] for idx in failed_queries]
        )
        # Put every retried response back at the position of its original query.
        for query_response_idx, response in zip(failed_queries, retry_query_responses):
            query_responses[query_response_idx] = response
        failed_queries = _failed_positions(query_responses)

    return query_responses
    
def evaluate(
    app: Vespa,  # Connection to a Vespa application.
    labeled_data: Union[List[Dict], DataFrame],  # Data containing query, query_id and relevant docs. See examples below for format.
    eval_metrics: List[EvalMetric],  # Evaluation metrics
    query_model: Union[QueryModel, List[QueryModel]],  # Query models to be evaluated
    id_field: str,  # The Vespa field representing the document id.
    default_score: int = 0,  # Score to assign to the additional documents that are not relevant.
    detailed_metrics=False,  # Return intermediate computations if available. 
    per_query=False,  # Set to True to return evaluation metrics per query.
    aggregators=None,  # Used only if `per_query=False`. List of pandas friendly aggregators to summarize per model metrics. We use ["mean", "median", "std"] by default.
    timeout=1000,  # Vespa query timeout in ms.
    **kwargs,  # Extra keyword arguments to be included in the Vespa Query.
) -> DataFrame:  # Returns query_id and metrics according to the selected evaluation metrics.
    "Evaluate a `QueryModel` according to a list of `EvalMetric`."

    if isinstance(labeled_data, DataFrame):
        labeled_data = _parse_labeled_data(df=labeled_data)

    if isinstance(query_model, QueryModel):
        query_model = [query_model]

    # Model names become the grouping key / column labels of the result, so they must be unique.
    model_names = [model.name for model in query_model]
    assert len(model_names) == len(
        set(model_names)
    ), "Duplicate model names. Choose unique model names."

    # The labeled data is the same for every model, so flatten it once up front.
    flat_labeled_data = [
        (x["query"], x["query_id"], x["relevant_docs"]) for x in labeled_data
    ]

    evaluation = []
    for model in query_model:
        query_responses = _evaluate_query_retry(app, flat_labeled_data, model, timeout, **kwargs)

        # Queries that still fail after the retries are reported, and then evaluated like any other response.
        failed_queries = [idx for idx, x in enumerate(query_responses) if x.status_code != 200]
        if len(failed_queries) > 0:
            print(f"Failed queries for query model {model.name}: {len(failed_queries)}/{len(query_responses)}")
        # Any response carrying `root.errors` is reported as a timeout.
        timedout_queries = [
            idx
            for idx, x in enumerate(query_responses)
            if x.json.get("root", {}).get("errors", None) is not None
        ]
        if len(timedout_queries) > 0:
            print(f"Timeout queries for query model {model.name}: {len(timedout_queries)}/{len(query_responses)}")

        for (_, query_id, relevant_docs), query_response in zip(flat_labeled_data, query_responses):
            # One row per (model, query); every metric adds its own keys to the row.
            evaluation_query = {"model": model.name, "query_id": query_id}
            for evaluator in eval_metrics:
                evaluation_query.update(
                    evaluator.evaluate_query(
                        query_response,
                        relevant_docs,
                        id_field,
                        default_score,
                        detailed_metrics,
                    )
                )
            evaluation.append(evaluation_query)

    evaluation = DataFrame.from_records(evaluation)
    if not per_query:
        if not aggregators:
            aggregators = ["mean", "median", "std"]
        # Summarize each metric per model; after the transpose, models are columns
        # and (metric, aggregator) pairs are rows.
        evaluation = (
            evaluation[[x for x in evaluation.columns if x != "query_id"]]
            .groupby(by="model")
            .agg(aggregators)
            .T
        )
    return evaluation

Usage:

Setup and feed a Vespa application:

In [None]:
from learntorank.passage import create_basic_search_package
from learntorank.passage import PassageData
from vespa.deployment import VespaDocker

In [None]:
#|output: false
app_package = create_basic_search_package(name="EvaluationApp")
vespa_docker = VespaDocker(port=8082, cfgsrv_port=19072)
app = vespa_docker.deploy(application_package=app_package)
data = PassageData.load()
responses = app.feed_df(
    df=data.get_corpus(), 
    include_id=True, 
    id_field="doc_id"
)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Waiting for application status, 0/300 seconds...
Waiting for application status, 5/300 seconds...
Waiting for application status, 10/300 seconds...
Waiting for application status, 15/300 seconds...
Waiting for application status, 20/300 seconds...
Waiting for application status, 25/300 seconds...
Waiting for application status, 30/300 seconds...
Waiting for application status, 35/300 seconds...
Finished deployment.
Successful documents fed: 1000/1000.
Batch progress: 1/1.


Define query models to be evaluated:

In [None]:
from learntorank.query import OR, Ranking

In [None]:
bm25_query_model = QueryModel(
    name="bm25", 
    match_phase=OR(), 
    ranking=Ranking(name="bm25")
)
native_query_model = QueryModel(
    name="native_rank", 
    match_phase=OR(), 
    ranking=Ranking(name="native_rank")
)

Define metrics to compute during evaluation:

In [None]:
metrics = [
    Recall(at=10), 
    ReciprocalRank(at=3), 
    NormalizedDiscountedCumulativeGain(at=3)
]

Get labeled data:

In [None]:
labeled_data = data.get_labels(type="dev")
labeled_data[0:2]

[{'query_id': '1101971',
  'query': 'why say the sky is the limit',
  'relevant_docs': [{'id': '7407715', 'score': 1}]},
 {'query_id': '712898',
  'query': 'what is an cvc in radiology',
  'relevant_docs': [{'id': '7661336', 'score': 1}]}]

Evaluate:

In [None]:
evaluation = evaluate(
    app=app,
    labeled_data=labeled_data, 
    eval_metrics=metrics, 
    query_model=[native_query_model, bm25_query_model], 
    id_field="doc_id",
)
evaluation

Unnamed: 0,model,bm25,native_rank
recall_10,mean,0.935833,0.845833
recall_10,median,1.0,1.0
recall_10,std,0.215444,0.342749
reciprocal_rank_3,mean,0.935,0.746667
reciprocal_rank_3,median,1.0,1.0
reciprocal_rank_3,std,0.231977,0.399551
ndcg_3,mean,0.912839,0.740814
ndcg_3,median,1.0,1.0
ndcg_3,std,0.242272,0.387611


In [None]:
#|hide
test_eq(evaluation.shape, (9,2))

In [None]:
#|hide
# Flatten the labeled data into one row per (query, relevant document) pair.
labeled_df = DataFrame.from_records(
    [
        dict(
            qid=str(labeled_query["query_id"]),
            query=str(labeled_query["query"]),
            doc_id=str(relevant_doc["id"]),
            relevance=int(relevant_doc["score"]),
        )
        for labeled_query in labeled_data
        for relevant_doc in labeled_query["relevant_docs"]
    ]
)

The `evaluate` function also accepts labeled data as a data frame:

In [None]:
labeled_df.head()

Unnamed: 0,qid,query,doc_id,relevance
0,1101971,why say the sky is the limit,7407715,1
1,712898,what is an cvc in radiology,7661336,1
2,154469,dmv california how long does it take to get id,7914544,1
3,930015,what's an epigraph,7928705,1
4,860085,what is va tax,2915383,1


In [None]:
evaluation_df = evaluate(
    app=app,
    labeled_data=labeled_df, 
    eval_metrics=metrics, 
    query_model=[native_query_model, bm25_query_model], 
    id_field="doc_id",
)
evaluation_df

Unnamed: 0,model,bm25,native_rank
recall_10,mean,0.935833,0.845833
recall_10,median,1.0,1.0
recall_10,std,0.215444,0.342749
reciprocal_rank_3,mean,0.935,0.746667
reciprocal_rank_3,median,1.0,1.0
reciprocal_rank_3,std,0.231977,0.399551
ndcg_3,mean,0.912839,0.740814
ndcg_3,median,1.0,1.0
ndcg_3,std,0.242272,0.387611


In [None]:
#|hide
test_eq(evaluation, evaluation_df)

Control which aggregators are computed:

In [None]:
evaluation = evaluate(
    app=app,
    labeled_data=labeled_data, 
    eval_metrics=metrics, 
    query_model=[native_query_model, bm25_query_model], 
    id_field="doc_id",
    aggregators=["mean", "std"]
)
evaluation

Unnamed: 0,model,bm25,native_rank
recall_10,mean,0.935833,0.845833
recall_10,std,0.215444,0.342749
reciprocal_rank_3,mean,0.935,0.746667
reciprocal_rank_3,std,0.231977,0.399551
ndcg_3,mean,0.912839,0.740814
ndcg_3,std,0.242272,0.387611


In [None]:
#|hide
test_eq(evaluation.shape, (6,2))

Include detailed metrics when available. This adds the intermediate steps that some of the metrics expose:

In [None]:
evaluation = evaluate(
    app=app,
    labeled_data=labeled_data, 
    eval_metrics=metrics, 
    query_model=[native_query_model, bm25_query_model], 
    id_field="doc_id",
    aggregators=["mean", "std"],
    detailed_metrics=True
)
evaluation

Unnamed: 0,model,bm25,native_rank
recall_10,mean,0.935833,0.845833
recall_10,std,0.215444,0.342749
reciprocal_rank_3,mean,0.935,0.746667
reciprocal_rank_3,std,0.231977,0.399551
ndcg_3,mean,0.912839,0.740814
ndcg_3,std,0.242272,0.387611
ndcg_3_ideal_dcg,mean,1.054165,1.054165
ndcg_3_ideal_dcg,std,0.207315,0.207315
ndcg_3_dcg,mean,0.938928,0.765474
ndcg_3_dcg,std,0.225533,0.387161


In [None]:
#|hide
test_eq(evaluation.shape, (10,2))

Generate results per query:

In [None]:
evaluation = evaluate(
    app=app,
    labeled_data=labeled_data, 
    eval_metrics=metrics, 
    query_model=[native_query_model, bm25_query_model], 
    id_field="doc_id",
    per_query=True
)
evaluation.head()

Unnamed: 0,model,query_id,recall_10,reciprocal_rank_3,ndcg_3
0,native_rank,1101971,1.0,1.0,1.0
1,native_rank,712898,0.0,0.0,0.0
2,native_rank,154469,1.0,0.0,0.0
3,native_rank,930015,1.0,0.0,0.0
4,native_rank,860085,0.0,0.0,0.0


In [None]:
#|hide
test_eq(evaluation.shape, (200,5))

In [None]:
#|hide
vespa_docker.container.stop(timeout=600)
vespa_docker.container.remove()

In [None]:
#|hide
from learntorank.query import WeakAnd, ANN, Union, Ranking, QueryRankingFeature
from random import random

In [None]:
#|hide
#
# Connect to a running Vespa Application
#
app = Vespa(url="https://api.cord19.vespa.ai")
#
# Define a query model
#
match_phase = Union(
    WeakAnd(hits=10),
    ANN(
        doc_vector="title_embedding",
        query_vector="title_vector",
        hits=10,
        label="title",
    ),
)
ranking = Ranking(name="bm25", list_features=True)
query_model = QueryModel(
    name="ANN_bm25",
    match_phase=match_phase,
    ranking=ranking,
    query_properties=[
        QueryRankingFeature(
            name="title_vector",
            # The mapping ignores its argument and returns 768 random floats.
            mapping=lambda x: [random() for _ in range(768)],
        )
    ],
)
#
# Define labelled data
#
labeled_data = [
    {
        "query_id": 0,
        "query": "Intrauterine virus infections and congenital heart disease",
        "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
    },
    {
        "query_id": 1,
        "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
    },
]
# equivalent data in df format
labeled_data_df = DataFrame(
    data={
        "qid": [0, 0, 1, 1],
        "query": ["Intrauterine virus infections and congenital heart disease"]
        * 2
        + [
            "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
        ]
        * 2,
        "doc_id": [0, 3, 1, 5],
        "relevance": [1, 1, 1, 1],
    }
)

In [None]:
#|hide
eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
ltr_evaluation = evaluate(
    app=app,
    labeled_data=labeled_data,
    eval_metrics=eval_metrics,
    query_model=query_model,
    id_field="id",
)
# Three metrics times the three default aggregators (mean, median, std) give
# nine rows; the single query model gives one column.
test_eq(ltr_evaluation.shape, (9, 1))

In [None]:
#|hide
# `evaluate` is expected to raise for this input.
# NOTE(review): the two unnamed `QueryModel()` instances share the same default
# name, so this presumably exercises a duplicate-model-name check — confirm
# against `evaluate`.
test_fail(
    evaluate,
    kwargs=dict(
        app=app,
        labeled_data=labeled_data,
        eval_metrics=eval_metrics,
        query_model=[QueryModel(), QueryModel(), query_model],
        id_field="id",
    ),
)

In [None]:
#|hide
evaluation = evaluate(
    app=app,
    labeled_data=labeled_data,
    eval_metrics=eval_metrics,
    query_model=[QueryModel(), query_model],
    id_field="id",
)
test_eq(evaluation.shape, (9, 2))

In [None]:
#|hide
evaluation = evaluate(
    app=app,
    labeled_data=labeled_data_df,
    eval_metrics=eval_metrics,
    query_model=query_model,
    id_field="id",
    detailed_metrics=True,
)
test_eq(evaluation.shape, (15, 1))

In [None]:
#|hide
evaluation = evaluate(
    app=app,
    labeled_data=labeled_data_df,
    eval_metrics=eval_metrics,
    query_model=query_model,
    id_field="id",
    detailed_metrics=True,
    per_query=True,
)
test_eq(evaluation.shape, (2, 7))

## Evaluate specific query

In [None]:
#|export
def evaluate_query(
    app: Vespa,  # Connection to a Vespa application.
    eval_metrics: List[EvalMetric],  # Metrics used to score the query response.
    query_model: QueryModel,  # Query model to be evaluated.
    query_id: str,  # Query id represented as str.
    query: str,  # Query string.
    id_field: str,  # The Vespa field representing the document id.
    relevant_docs: List[Dict],  # Each dict contains a doc id and optionally a doc score.
    default_score: int = 0,  # Score to assign to the additional documents that are not relevant.
    detailed_metrics=False,  # Return intermediate computations if available.
    **kwargs,  # Extra keyword arguments to be included in the Vespa Query.
) -> Dict:  # Contains the model name, the query id and the values produced by each metric.
    "Send a single query to the app and score its response with every metric in `eval_metrics`."

    response = send_query(app=app, query=query, query_model=query_model, **kwargs)
    # Positional arguments shared by every metric's `evaluate_query` method.
    metric_args = (response, relevant_docs, id_field, default_score, detailed_metrics)
    result = {"model": query_model.name, "query_id": query_id}
    for metric in eval_metrics:
        # Each metric returns a dict that is merged into the flat result.
        result.update(metric.evaluate_query(*metric_args))
    return result


Usage:

In [None]:
app = Vespa(url = "https://api.cord19.vespa.ai")
query_model = QueryModel(
    match_phase = OR(),
    ranking = Ranking(name="bm25", list_features=True))

Evaluate a single query:

In [None]:
query_evaluation = evaluate_query(
    app=app,
    eval_metrics = eval_metrics, 
    query_model = bm25_query_model, 
    query_id = "0", 
    query = "Intrauterine virus infections and congenital heart disease", 
    id_field = "id",
    relevant_docs = [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
    default_score = 0
)
query_evaluation

{'model': 'bm25',
 'query_id': '0',
 'match_ratio': 0.814424921006077,
 'recall_10': 0.0,
 'reciprocal_rank_10': 0}

## Evaluate query under specific document ids

Use the `recall` keyword argument, which is forwarded to the Vespa query through `**kwargs`, to specify which documents should be included in the evaluation.

In the example below, we include documents with id equal to 0, 1 and 2. Since the relevant documents for this query are the documents with id 0 and 3, we should get recall equal to 0.5.

In [None]:
query_evaluation = evaluate_query(
    app=app,
    eval_metrics = eval_metrics, 
    query_model = query_model, 
    query_id = 0, 
    query = "Intrauterine virus infections and congenital heart disease", 
    id_field = "id",
    relevant_docs = [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
    default_score = 0,
    recall = ("id", [0, 1, 2])
)
query_evaluation

{'model': 'default_name',
 'query_id': 0,
 'match_ratio': 9.70242657688688e-06,
 'recall_10': 0.5,
 'reciprocal_rank_10': 1.0}

We now include documents with id equal to 0, 1, 2 and 3. This should give a recall equal to 1.

In [None]:
query_evaluation = evaluate_query(
    app=app,
    eval_metrics = eval_metrics, 
    query_model = query_model, 
    query_id = 0, 
    query = "Intrauterine virus infections and congenital heart disease", 
    id_field = "id",
    relevant_docs = [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
    default_score = 0,
    recall = ("id", [0, 1, 2, 3])
)
query_evaluation

{'model': 'default_name',
 'query_id': 0,
 'match_ratio': 1.2936568769182506e-05,
 'recall_10': 1.0,
 'reciprocal_rank_10': 1.0}

In [None]:
#|hide
test_eq(query_evaluation["recall_10"], 1.0)

In [None]:
#|hide
nbdev_export()