In [10]:
import pandas as pd

corpus = [
    "Apple Apple Banana",
    "Banana Mango Banana",
    "Cherry Cherry Strawberries",
    "Grapes Grapes Strawberries Grapes",
    "Apple Banana Mango",
    "Blueberries Strawberries Apple",
    "Apple Banana Mango",
    "Grapes Grapes Grapes",
    "Blueberries Apple Strawberries",
    "Apple Banana Apple",
    "Cherry Cherry Mango Cherry",
    "Blueberries Strawberries Cherry",
]

dataset = [
    {"query": "apple banana", "relevant_doc_ids": [0, 4, 6, 9]},
    {"query": "grapes", "relevant_doc_ids": [3, 7]},
    {"query": "banana mango", "relevant_doc_ids": [1, 4, 6, 10]},
    {"query": "Cherry", "relevant_doc_ids": [2, 10, 11]},
    {"query": "apple", "relevant_doc_ids": [0, 4, 6, 8, 9]},
    {"query": "Blueberries Strawberries", "relevant_doc_ids": [5, 8, 11]}
]

#simulation of retrieved documents
retrieved_docs = [
    [0, 9, 6, 11, 3],
    [2, 5, 4, 9, 10], # all wrong
    [2, 4, 1, 10, 9], # first and third one is incorrect
    [2, 5, 11, 10, 7], # middle one is incorrect
    [0, 6, 4, 8, 10], # last one is incorrect, 6 and 4 swapped
    [8, 5, 11, 7, 2] # all correct
]

df = pd.DataFrame(dataset)
df['retrieved_doc_ids'] = retrieved_docs
df

Unnamed: 0,query,relevant_doc_ids,retrieved_doc_ids
0,apple banana,"[0, 4, 6, 9]","[0, 9, 6, 11, 3]"
1,grapes,"[3, 7]","[2, 5, 4, 9, 10]"
2,banana mango,"[1, 4, 6, 10]","[2, 4, 1, 10, 9]"
3,Cherry,"[2, 10, 11]","[2, 5, 11, 10, 7]"
4,apple,"[0, 4, 6, 8, 9]","[0, 6, 4, 8, 10]"
5,Blueberries Strawberries,"[5, 8, 11]","[8, 5, 11, 7, 2]"


In [11]:
K = 3

def success(relevant_doc_ids: list[list[int]], retrieved_doc_ids: list[list[int]], K):
    """ 
    is there any doc of y_pred in y[:K]  
    """
    result = [0] * len(relevant_doc_ids)
    for i in range(len(relevant_doc_ids)):
        for j in retrieved_doc_ids[i]:
            if j in relevant_doc_ids[i][:K]:
                result[i] = 1
                break

    return result

result = success(df['relevant_doc_ids'], df['retrieved_doc_ids'], K)
print(f"Result: {result}")

df[f"success@{K}"] = result
df


Result: [1, 0, 1, 1, 1, 1]


Unnamed: 0,query,relevant_doc_ids,retrieved_doc_ids,success@3
0,apple banana,"[0, 4, 6, 9]","[0, 9, 6, 11, 3]",1
1,grapes,"[3, 7]","[2, 5, 4, 9, 10]",0
2,banana mango,"[1, 4, 6, 10]","[2, 4, 1, 10, 9]",1
3,Cherry,"[2, 10, 11]","[2, 5, 11, 10, 7]",1
4,apple,"[0, 4, 6, 8, 9]","[0, 6, 4, 8, 10]",1
5,Blueberries Strawberries,"[5, 8, 11]","[8, 5, 11, 7, 2]",1


In [14]:
from numpy import mean

def mean_reciprocal_rank(relevant_doc_ids: list[list[int]], retrieved_doc_ids: list[list[int]], K):
    """
    mean of (1 / rank of first relevant doc) 
    """
    result = [0] * len(relevant_doc_ids)
    for i in range(len(relevant_doc_ids)):
        first_doc_id = relevant_doc_ids[i][0]
        for j, doc_id in enumerate(retrieved_doc_ids[i][:K]):
            if doc_id == first_doc_id:
                result[i] = 1/(j+1)

    return result, round(float(mean(result)), 3)


result, mrr = mean_reciprocal_rank(df['relevant_doc_ids'], df['retrieved_doc_ids'], K)
print(f"Result: {result}\nMRR: {mrr}")

df[f"mrr@{K}"] = result
df

Result: [1.0, 0, 0.3333333333333333, 1.0, 1.0, 0.5]
MRR: 0.639


Unnamed: 0,query,relevant_doc_ids,retrieved_doc_ids,success@3,mrr@3
0,apple banana,"[0, 4, 6, 9]","[0, 9, 6, 11, 3]",1,1.0
1,grapes,"[3, 7]","[2, 5, 4, 9, 10]",0,0.0
2,banana mango,"[1, 4, 6, 10]","[2, 4, 1, 10, 9]",1,0.333333
3,Cherry,"[2, 10, 11]","[2, 5, 11, 10, 7]",1,1.0
4,apple,"[0, 4, 6, 8, 9]","[0, 6, 4, 8, 10]",1,1.0
5,Blueberries Strawberries,"[5, 8, 11]","[8, 5, 11, 7, 2]",1,0.5


In [None]:
def precision(relevant_doc_ids: list[list[int]], retrieved_doc_ids: list[list[int]], K):
    """ 
    no. of relevant docs in top K retrieved docs / K
    intuition: precision is high when all the retrieved docs are relevent
    """
    result = [0] * len(relevant_doc_ids)
    for i in range(len(relevant_doc_ids)):
        rel_docs_count = 0
        for doc_id in retrieved_doc_ids[i][:K]:
            if doc_id in relevant_doc_ids[i]:
                rel_docs_count += 1
        result[i] = rel_docs_count/K

    return result, round(float(mean(result)), 3)


result, prec = precision(df['relevant_doc_ids'], df['retrieved_doc_ids'], K)
print(f"Result: {result}\nPrecision: {prec}")

df[f"precision@{K}"] = result
df

Result: [1.0, 0.0, 0.6666666666666666, 0.6666666666666666, 1.0, 1.0]
Precision: 0.722


Unnamed: 0,query,relevant_doc_ids,retrieved_doc_ids,success@3,mrr@3,precision@3
0,apple banana,"[0, 4, 6, 9]","[0, 9, 6, 11, 3]",1,1.0,1.0
1,grapes,"[3, 7]","[2, 5, 4, 9, 10]",0,0.0,0.0
2,banana mango,"[1, 4, 6, 10]","[2, 4, 1, 10, 9]",1,0.333333,0.666667
3,Cherry,"[2, 10, 11]","[2, 5, 11, 10, 7]",1,1.0,0.666667
4,apple,"[0, 4, 6, 8, 9]","[0, 6, 4, 8, 10]",1,1.0,1.0
5,Blueberries Strawberries,"[5, 8, 11]","[8, 5, 11, 7, 2]",1,0.5,1.0
