# Evaluation metrics for Retrieval and Recommendation Systems

In [38]:
!pip3 install pytrec_eval



In [None]:
import pytrec_eval
import json

# Precision and recall

Precision and recall are rank-unaware metrics: they depend only on which items appear in the top $K$ results, not on the order of those items.

$$\text{Precision@K} = \frac{TP}{TP+FP}= \frac{TP}{K}
=\frac{\text{Number of relevant items in K}}{\text{Total number of items in K}}$$

$$\text{Recall@K} = \frac{TP}{TP+FN}
=\frac{\text{Number of relevant items in K}}{\text{Total number of relevant items}}$$

In [39]:
# Relevance judgments: every query maps to the items judged relevant (grade 1).
qrel = {
    'sweet pastry': dict.fromkeys(['donut', 'muffin', 'scone'], 1),
    'suitable for lunch': dict.fromkeys(['sandwich', 'bagel', 'roll', 'pretzel'], 1),
    'goes well with jam': dict.fromkeys(['bagel', 'croissant', 'roll'], 1),
}

# System output: every query maps to the four retrieved items and their scores.
run = {
    'sweet pastry': dict(donut=0.95, bagel=0.9, muffin=0.8, croissant=0.7),
    'suitable for lunch': dict(muffin=0.95, donut=0.9, sandwich=0.85, bagel=0.82),
    'goes well with jam': dict(pretzel=0.9, bagel=0.85, muffin=0.7, donut=0.6),
}

# Request precision and recall, both at cutoff 4, for each query.
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'P.4', 'recall.4'})

print(json.dumps(evaluator.evaluate(run), indent=1))

{
 "sweet pastry": {
  "P_4": 0.5,
  "recall_4": 0.6666666666666666
 },
 "suitable for lunch": {
  "P_4": 0.5,
  "recall_4": 0.5
 },
 "goes well with jam": {
  "P_4": 0.25,
  "recall_4": 0.3333333333333333
 }
}


# Mean Average Precision (MAP) 

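$$\text{MAP@K} = \frac{1}{U}\sum_{u=1}^{U}\text{AP@K}_u$$

where $U$ is the number of queries (users) and $\text{AP@K}_u$ is the average precision at cutoff $K$ for query $u$.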


In [40]:
# Single query with two items judged relevant.
qrel = {'sweet pastry': dict.fromkeys(['donut', 'muffin'], 1)}

# Retrieved items with their scores; the relevant ones are 'donut' and 'muffin'.
run = {'sweet pastry': dict(donut=0.95, bagel=0.9, muffin=0.8, croissant=0.7)}

# Request average precision at cutoff 4.
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map_cut.4'})

print(json.dumps(evaluator.evaluate(run), indent=1))

{
 "sweet pastry": {
  "map_cut_4": 0.8333333333333333
 }
}


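Note that `pytrec_eval` appears to normalize average precision by the total number of relevant items in `qrel`, not by the number of relevant items actually retrieved in the top $K$. See how the `MAP@K` value drops from $(1 + 2/3)/2 \approx 0.83$ to $(1 + 2/3)/3 \approx 0.56$ when we add `'scone'`, which is never retrieved, to the list of relevant items.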

In [41]:
# Same query as before, but 'scone' is now also judged relevant.
qrel = {'sweet pastry': dict.fromkeys(['donut', 'muffin', 'scone'], 1)}

# Retrieved items and scores; 'scone' does not appear among them.
run = {'sweet pastry': dict(donut=0.95, bagel=0.9, muffin=0.8, croissant=0.7)}

# Request average precision at cutoff 4.
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map_cut.4'})

print(json.dumps(evaluator.evaluate(run), indent=1))

{
 "sweet pastry": {
  "map_cut_4": 0.5555555555555555
 }
}


# Normalized Discounted Cumulative Gain (NDCG)

$$\text{NDCG@K} = \frac{\text{DCG@K}}{\text{IDCG@K}}$$

In [43]:
# Graded judgments for one query: three items with grade 2, three with grade 1.
qrel = {
    'goes well with jam': dict(
        bagel=2, croissant=2, roll=2,
        scone=1, muffin=1, donut=1,
    ),
}

# Retrieved items and scores; 'pretzel' has no judgment in qrel.
run = {
    'goes well with jam': dict(pretzel=0.9, bagel=0.85, muffin=0.7, donut=0.6),
}

# Request NDCG at cutoff 4.
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut.4'})

print(json.dumps(evaluator.evaluate(run), indent=1))

{
 "goes well with jam": {
  "ndcg_cut_4": 0.4672390440360399
 }
}
