
# Image Retrieval Performance Evaluation Notebook for BM25
__Author__: Vibhor Jain

__Description__: This notebook evaluates image-retrieval performance for the BM25 ranking function. More about BM25 [here](https://en.wikipedia.org/wiki/Okapi_BM25).

__Dated__: 29th Apr 2020

Version: 1.0

In [2]:
import numpy as np
from rank_bm25 import BM25Okapi
import pandas as pd

In [12]:
from utils import display_helper

In [13]:
def get_expected_indices(expected_idx):
    """Parse a bracketed, comma-separated string of indices into a list of ints.

    Parameters
    ----------
    expected_idx : str
        Text such as ``"[3, 17, 42]"``. Leading/trailing whitespace and any
        run of ``[`` / ``]`` characters at either end are ignored.

    Returns
    -------
    list of int
        The parsed indices in their original order. An empty list is returned
        for ``"[]"`` or a blank string.

    Raises
    ------
    ValueError
        If a non-blank entry is not a valid integer literal.
    """
    # Trim whitespace first so that strip('][') can actually reach the brackets.
    inner = expected_idx.strip().strip('][')
    # Skip blank entries: "[]" would otherwise produce [''] and int('') raises.
    return [int(token) for token in inner.split(',') if token.strip()]


def get_recall_list(expected_idx_list, received_idx_list):
    """Compute the recall of every query together with its relevant-item count.

    Parameters
    ----------
    expected_idx_list : list of list
        One entry per query: the indices that should be retrieved (y_true).
    received_idx_list : list of list
        One entry per query: the indices that were retrieved (y_pred).
        Entries beyond ``len(expected_idx_list)`` are ignored.

    Returns
    -------
    tuple of (list of float, list of int)
        ``recall_score_list[i]`` is the number of distinct indices shared by
        the i-th expected and received lists divided by
        ``len(expected_idx_list[i])``; ``relevant_image_count[i]`` is that
        length. A query with no expected indices gets a recall of ``0.0``.

    Raises
    ------
    IndexError
        If ``received_idx_list`` has fewer entries than ``expected_idx_list``.
    """
    # zip() would silently truncate; keep the failure the index-based loop had.
    if len(received_idx_list) < len(expected_idx_list):
        raise IndexError("received_idx_list has fewer entries than expected_idx_list")

    recall_score_list = []
    relevant_image_count = []
    for y_true, y_pred in zip(expected_idx_list, received_idx_list):
        n_relevant = len(y_true)
        n_hits = len(set(y_true) & set(y_pred))
        # Guard the division: an empty expectation list has nothing to recall.
        recall_score_list.append(n_hits / n_relevant if n_relevant else 0.0)
        relevant_image_count.append(n_relevant)

    return recall_score_list, relevant_image_count

In [6]:
# Read the captions table and the query mapping from disk.
# The captions file uses pandas' default separator; the mapping file is
# semicolon-separated.
captions_df = pd.read_csv(
    './data/captions.txt',
)
query_df = pd.read_csv(
    './data/query_2_img_idx_mapping.txt',
    sep=';',
)

In [14]:
# Build the BM25 model: every caption is tokenized by splitting on single
# spaces and the resulting token lists are handed to BM25Okapi.
corpus = captions_df['caption'].tolist()
tokenized_corpus = [
    caption.split(" ")
    for caption in corpus
]
bm25 = BM25Okapi(tokenized_corpus)

In [15]:
# For every query, retrieve as many captions as there are expected images and
# map each retrieved caption back to an image index.
predicted_idx_list, expected_idx_list = [], []

# Rank row positions rather than caption strings: each hit then maps to its
# own row. Looking hits up by caption text would send every duplicate caption
# to the first image carrying it and would rescan the DataFrame per hit.
caption_rows = list(range(len(corpus)))
image_files = captions_df['image_files'].tolist()

# Column 1 of query_df holds the expected indices as text. .iloc makes the
# positional access explicit instead of relying on an integer key into a
# label-indexed row.
for query, expected_raw in zip(query_df['query'], query_df.iloc[:, 1]):
    expected_idx = get_expected_indices(expected_raw)
    expected_idx_list.append(expected_idx)

    # Retrieve exactly as many results as there are expected images.
    n = len(expected_idx)
    tokenized_query = query.split(" ")
    top_rows = bm25.get_top_n(tokenized_query, caption_rows, n)

    predicted_idx = [display_helper.find_img_idx(image_files[row]) for row in top_rows]
    predicted_idx_list.append(predicted_idx)
    
    

In [16]:
# Per-query recall and the number of expected images for each query.
recall_bm25, image_count_bm25 = get_recall_list(
    expected_idx_list=expected_idx_list,
    received_idx_list=predicted_idx_list,
)

In [17]:
# Recover the per-query hit count as recall * relevant-count. The product is
# mathematically an integer, so it is rounded to remove floating-point drift
# (for example (1/49)*49 evaluates to 0.9999999999999999) and kept as a float
# so the values passed on have the same type as before.
prod_bm25 = [float(round(a * b)) for a, b in zip(recall_bm25, image_count_bm25)]

# getting the total number of expected images and of relevant images retrieved
# across all queries.
total_images_bm25 = np.sum(image_count_bm25)
total_relevant_images_retrieved_bm25 = np.sum(prod_bm25)

# printing all the metadata
display_helper.display_stats("BM25 pipeline", total_images_bm25, total_relevant_images_retrieved_bm25)

BM25 pipeline
------------------------------------------------

+-----------------------+---------------------------+----------------+
| Total relevant images | Relevant images retrieved | Average Recall |
+-----------------------+---------------------------+----------------+
|          106          |            33.0           |      0.31      |
+-----------------------+---------------------------+----------------+
