## Setup

In [None]:
!pip install -qq RAGatouille
!pip install ftfy -qq
!pip install llama-index -qq

In [2]:
import sqlite3
import json
import re
import os
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn.functional as F
from fastcore.test import is_close
from ftfy import fix_text
from ragatouille import RAGPretrainedModel
from ragatouille.data import CorpusProcessor

# Module-level RAGatouille CorpusProcessor instance.
# NOTE(review): no cell visible in this notebook references `corpus_processor`
# directly — confirm whether it is still needed.
corpus_processor = CorpusProcessor()

Make sure to download [utils.py](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/utils.py) and save it in the same directory as this notebook so that the `from utils import *` cell below can import it.

In [3]:
from utils import *

### Load Data

In [4]:
# Call the `download_data` helper from utils.py. As the displayed output shows,
# it returns a dict keyed by chapter number (as a string) whose values are the
# chapter notebook filenames.
nbs = download_data()
nbs

{'1': '01_intro.ipynb',
 '2': '02_production.ipynb',
 '4': '04_mnist_basics.ipynb',
 '8': '08_collab.ipynb',
 '9': '09_tabular.ipynb',
 '10': '10_nlp.ipynb',
 '13': '13_convolutions.ipynb'}

In [5]:
# Build the three inputs shared by every retrieval run in this notebook.
# `benchmark` is later passed to score_retrieval as the reference.
benchmark = load_benchmark()
# `data` is indexed by chapter key (see data[chapter] below).
data = get_data(nbs)
# `questions` is indexed by chapter key; each entry is iterated as query strings.
questions = prep_questions(benchmark)

## Index-Free Retrieval

In [6]:
# Settings for the single-chapter walkthrough that follows.
# Forwarded to process_documents as its chunk_size argument.
chunk_size = 500
# Key into `data` and `questions` ('1' maps to 01_intro.ipynb in `nbs`).
chapter = '1'

In [7]:
# Turn the selected chapter's data into the list of documents to encode
# (presumably chunks sized by chunk_size — see process_documents in utils.py).
documents = process_documents(data[chapter], chunk_size=chunk_size)
# Regression check: the expected document count for this chapter and chunk_size.
assert len(documents) == 57

In [None]:
# Load the pretrained ColBERT checkpoint through RAGatouille.
model_nm = "colbert-ir/colbertv2.0"
RAG = RAGPretrainedModel.from_pretrained(model_nm)

In [9]:
# Encode the documents with the loaded model so they can be queried through
# search_encoded_docs below; every document is tagged with its chapter key.
RAG.encode(documents, document_metadatas=[{"chapter": chapter} for _ in range(len(documents))])

Encoding 57 documents...


  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
100%|██████████| 2/2 [00:01<00:00,  1.35it/s]

Shapes:
encodings: torch.Size([57, 508, 128])
doc_masks: torch.Size([57, 508])
Documents encoded!





In [11]:
# Retrieve at most 10 passages per query, capped by the number of documents.
topk = min(10, len(documents))
topk

10

In [15]:
# For each question of the selected chapter (surrounding quote characters
# stripped), keep only the 'content' field of every retrieved hit.
results = [
    [hit['content'] for hit in RAG.search_encoded_docs(query=question.strip('"\''), k=topk)]
    for question in questions[chapter]
]

# Sanity check: one result list per question of the selected chapter.
assert len(results) == 30

  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


In [None]:
# Inspect the retrieved passages for the first question.
results[0]

In [16]:
def index_free_retrieval(nbs, data, questions, model_nm = "colbert-ir/colbertv2.0", chunk_size=500, topk=10):
    """Run index-free retrieval for every question of every chapter.

    For each chapter key in `nbs`, the chapter's data is converted to documents
    with `process_documents`, encoded by a freshly loaded RAGatouille model,
    and each of the chapter's questions is searched against those encodings.

    Args:
        nbs: Mapping whose keys are the chapter identifiers to process.
        data: Mapping from chapter identifier to the input of `process_documents`.
        questions: Mapping from chapter identifier to an iterable of question
            strings; surrounding single/double quote characters are stripped
            before querying.
        model_nm: Model name passed to `RAGPretrainedModel.from_pretrained`.
        chunk_size: Forwarded to `process_documents`.
        topk: Maximum number of passages requested per question. It is capped,
            per chapter, at that chapter's number of documents.

    Returns:
        A flat list with one entry per question, ordered by chapter (in `nbs`
        key order) and then by question; each entry is the list of retrieved
        passage contents for that question.

    Raises:
        AssertionError: If a search returns more passages than were requested
            for that chapter.
    """
    results = []
    for chapter in nbs.keys():
        # A new model instance is loaded for every chapter, so each chapter is
        # encoded and searched on its own instance.
        # NOTE(review): reusing one instance would carry encoder state across
        # chapters — confirm against RAGatouille before hoisting this load.
        RAG = RAGPretrainedModel.from_pretrained(model_nm)
        documents = process_documents(data[chapter], chunk_size=chunk_size)
        RAG.encode(documents, document_metadatas=[{"chapter": chapter} for _ in range(len(documents))])

        # The cap depends on this chapter's document count, so the size check
        # below must use this value rather than another chapter's.
        chapter_topk = min(topk, len(documents))
        for q in questions[chapter]:
            hits = RAG.search_encoded_docs(query = q.strip('"\''), k=chapter_topk)
            passages = [hit['content'] for hit in hits]
            assert len(passages) <= chapter_topk
            results.append(passages)

    return results

In [None]:
# Run retrieval over every chapter with the function's default model
# (colbert-ir/colbertv2.0) and default chunk size.
results = index_free_retrieval(nbs, data, questions)

In [18]:
# Score the retrieved passages against the benchmark. The two returned values
# are array-like (they are aggregated with .sum() / .mean() below).
mrrs, recalls = score_retrieval(benchmark, results)

The assertion values below come from [these manual validation results](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/2024-12-13-fastbook-benchmark-results-MANUAL%20VALIDATION.xlsx) obtained by running [this notebook](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/2024_12_13_fastbook_benchmark_results.ipynb).

In [19]:
# Regression check: aggregate MRR and recall must match the manually validated
# reference values linked above.
assert is_close(mrrs.sum(), 107.55119)
assert is_close(mrrs.mean(), 0.56309)
assert is_close(recalls.sum(), 166.78333)
assert is_close(recalls.mean(), 0.873211)

### answerai-colbert-small-v1

In [None]:
# Repeat the full retrieval and scoring run with the answerai-colbert-small-v1
# checkpoint. This overwrites `results`, `mrrs` and `recalls` from the previous run.
results = index_free_retrieval(nbs, data, questions, model_nm="answerdotai/answerai-colbert-small-v1")
mrrs, recalls = score_retrieval(benchmark, results)

In [21]:
# Regression check: expected aggregate MRR and recall for the
# answerai-colbert-small-v1 run.
assert is_close(mrrs.sum(), 109.4246)
assert is_close(mrrs.mean(), 0.5729)
assert is_close(recalls.sum(), 165.38333)
assert is_close(recalls.mean(), 0.86588)