## Setup

In [None]:
!pip install llama-index -qq
!pip install -qq RAGatouille
!pip install ftfy -qq

import sqlite3
import json
import re
import os
import pandas as pd, numpy as np
import requests
from fastcore.test import is_close
from ftfy import fix_text
from ragatouille.data import CorpusProcessor
from llama_index.core.text_splitter import SentenceSplitter

corpus_processor = CorpusProcessor()

Make sure to download [utils.py](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/utils.py) and save it in the same directory as this notebook, so that the `from utils import *` cell below can import it.

In [None]:
from utils import *

## Load Data and Run Search

In [28]:
nbs = download_data()
nbs

{'1': '01_intro.ipynb',
 '2': '02_production.ipynb',
 '4': '04_mnist_basics.ipynb',
 '8': '08_collab.ipynb',
 '9': '09_tabular.ipynb',
 '10': '10_nlp.ipynb',
 '13': '13_convolutions.ipynb'}

In [29]:
# `data` is a mapping of chapter -> text; it is iterated with `.items()` when the
# database is populated further down.
data = get_data(nbs)
# `benchmark` is a dict whose 'questions' entry holds the question records that the
# retrieval metrics are scored against.
benchmark = load_benchmark()
# `kw_df` is the input handed to `full_text_search`; presumably one keyword query per
# benchmark question — confirm against utils.py.
kw_df = load_keywords()

In [32]:
# Passed to `load_data` as `db_path` when each chapter's documents are stored.
# NOTE(review): `delete_db()` and `full_text_search()` are called without a path, so
# they presumably default to this same file inside utils.py — confirm.
db_path = 'fastbook.db'
# Forwarded to `process_documents` as `chunk_size`; the unit (tokens vs. characters)
# is decided inside utils.py — TODO confirm.
chunk_size = 500

In [34]:
# Start from a clean slate: `delete_db()` (from utils.py) presumably removes any
# previously built database so re-running this cell does not accumulate rows.
delete_db()
for chapter, text in data.items():
    # Split the chapter text into documents of the configured chunk size.
    documents = process_documents(text, chunk_size=chunk_size)
    # Keep the side-effecting call outside the `assert`: assert statements are
    # stripped when Python runs with -O, which would silently skip the load.
    loaded = load_data(documents=documents, db_path=db_path, chapter=chapter)
    assert loaded, f"load_data failed for chapter {chapter}"

In [35]:
# Run the keyword search; `results[i]` is later scored against
# `benchmark['questions'][i]`, and each entry is a sequence of passage strings.
# `limit=10` equals the default `cutoff=10` used by the metric functions
# (presumably the maximum number of passages returned per query — confirm in utils.py).
results = full_text_search(kw_df, limit=10)
# 191 is presumably the number of benchmark questions: the validated sum/mean pairs
# asserted at the end of the notebook are consistent with 191 scored questions.
assert len(results) == 191

## Defining Successful Retrieval

In [36]:
question = benchmark['questions'][0]
question

{'chapter': 1,
 'question_number': 1,
 'question_text': 'Do you need these for deep learning?\n\n- Lots of math T / F\n   - Lots of data T / F\n   - Lots of expensive computers T / F\n   - A PhD T / F',
 'gold_standard_answer': '"Lots of math - False\nLots of data - False\nLots of expensive computers - False\nA PhD - False"',
 'answer_context': [{'answer_component': '"Lots of math - False\nLots of data - False\nLots of expensive computers - False\nA PhD - False"',
   'scoring_type': 'simple',
   'explicit_context': 'true',
   'extraneous_answer': 'false'}],
 'question_context': []}

In [37]:
question['answer_context'][0]

{'answer_component': '"Lots of math - False\nLots of data - False\nLots of expensive computers - False\nA PhD - False"',
 'scoring_type': 'simple',
 'explicit_context': 'true',
 'extraneous_answer': 'false'}

In [40]:
ctx = question['answer_context'][0]['context'][0]
ctx



In [39]:
passage = results[0][1]
passage



In [41]:
fix_text(ctx) in fix_text(passage)

True

In [42]:
fix_text(ctx) in fix_text(results[0][2])

False

## Modified MRR@k

In [48]:
def modified_mrr(question, results, cutoff=10):
    """Reciprocal of the rank at which every answer component has been retrieved.

    For each entry of ``question["answer_context"]`` the first passage (1-based
    rank) among the top ``cutoff`` ``results`` that contains any of the entry's
    ``"context"`` strings is located. Both sides are normalised with ``fix_text``
    before the substring comparison.

    Args:
        question: Benchmark question dict with an ``"answer_context"`` list; each
            item may carry a ``"context"`` list of strings (missing -> no contexts).
        results: Ranked sequence of retrieved passage strings for this question.
        cutoff: Number of top-ranked passages to consider.

    Returns:
        ``1.0 / r`` where ``r`` is the largest first-hit rank over all components,
        or ``0.0`` if any component has no hit within the cutoff or the question
        has no answer components.
    """
    top_k = results[:cutoff]
    deepest_first_hit = 0

    for component in question["answer_context"]:
        gold_contexts = component.get("context", [])
        # Rank of the earliest passage that contains any gold context, if one exists.
        first_hit = next(
            (
                position
                for position, candidate in enumerate(top_k, start=1)
                if any(fix_text(gold) in fix_text(candidate) for gold in gold_contexts)
            ),
            None,
        )
        # One unanswered component zeroes the whole question.
        if first_hit is None:
            return 0.0
        deepest_first_hit = max(deepest_first_hit, first_hit)

    if deepest_first_hit > 0:
        return 1.0 / deepest_first_hit
    return 0.0

In [49]:
assert modified_mrr(question, results[0]) == 0.5

In [50]:
assert modified_mrr(benchmark['questions'][1], results[0]) == 0

## Modified Recall@k

In [51]:
def modified_recall(question, results, cutoff=10):
    """Fraction of a question's answer components found in the top retrieved passages.

    A component counts as found when any of its ``"context"`` strings is a
    substring of any of the first ``cutoff`` passages in ``results``; both sides
    are normalised with ``fix_text`` before comparison.

    Args:
        question: Benchmark question dict with an ``"answer_context"`` list; each
            item may carry a ``"context"`` list of strings (missing -> no contexts).
        results: Ranked sequence of retrieved passage strings for this question.
        cutoff: Number of top-ranked passages to consider.

    Returns:
        A float in ``[0, 1]``: found components divided by total components.
        Returns ``0.0`` when the question has no answer components.
    """
    retrieved_passages = results[:cutoff]
    components = question["answer_context"]

    # Guard against division by zero for a question without answer components;
    # 0.0 is also what modified_mrr returns in that situation.
    if not components:
        return 0.0

    # any() stops at the first matching (passage, context) pair, so passages
    # are scanned in rank order and scanning ends on the first hit.
    components_found = [
        any(
            fix_text(ctx) in fix_text(passage)
            for passage in retrieved_passages
            for ctx in ac.get("context", [])
        )
        for ac in components
    ]

    return sum(components_found) / len(components_found)

In [52]:
assert modified_recall(question, results[0]) == 1

In [53]:
assert modified_recall(benchmark['questions'][1], results[0]) == 0

## Scoring Retrieval Results

In [54]:
def score_retrieval(benchmark, results):
    """Score every benchmark question with modified MRR and modified recall.

    Question ``i`` of ``benchmark['questions']`` is paired positionally with
    ``results[i]``, and both metrics are computed with their default cutoff.

    Args:
        benchmark: Dict whose ``'questions'`` entry is the list of question records.
        results: Indexable collection where ``results[i]`` holds the retrieved
            passages for question ``i``.

    Returns:
        A ``(mrrs, recalls)`` tuple of ``pd.Series``, one value per question in
        benchmark order.
    """
    questions = benchmark['questions']

    # Compute both metrics for a question before moving on to the next one.
    per_question = [
        (modified_mrr(item, results[idx]), modified_recall(item, results[idx]))
        for idx, item in enumerate(questions)
    ]
    mrr_scores = [pair[0] for pair in per_question]
    recall_scores = [pair[1] for pair in per_question]

    # Sanity check: exactly one score per question for each metric.
    assert len(mrr_scores) == len(questions)
    assert len(recall_scores) == len(questions)

    return pd.Series(mrr_scores), pd.Series(recall_scores)

In [56]:
mrrs, recalls = score_retrieval(benchmark, results)

The assertion values below come from [these manual validation results](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/2024-12-13-fastbook-benchmark-results-MANUAL%20VALIDATION.xlsx) obtained by running [this notebook](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/2024_12_13_fastbook_benchmark_results.ipynb).

In [57]:
# Regression checks: the aggregate scores must reproduce the manually validated
# values. Each sum/mean pair is consistent with 191 scored questions
# (96.54365 / 0.50546 ~= 191 and 163.94166 / 0.858333 ~= 191).
assert is_close(mrrs.sum(), 96.54365)
assert is_close(mrrs.mean(), 0.50546)
assert is_close(recalls.sum(), 163.94166)
assert is_close(recalls.mean(), 0.858333)