## Setup

In [None]:
!pip install sentence-transformers -Uqq
!pip install -qq RAGatouille
!pip install ftfy -qq
!pip install llama-index -qq

In [None]:
import sqlite3
import json
import re
import os
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn.functional as F
from fastcore.test import is_close
from ftfy import fix_text
from sentence_transformers import SentenceTransformer
from ragatouille.data import CorpusProcessor
from llama_index.core.text_splitter import SentenceSplitter

# RAGatouille corpus processor built with its default arguments.
# NOTE(review): not referenced again in the cells shown here — confirm whether it is still needed.
corpus_processor = CorpusProcessor()
# Embedding model used below to encode both the questions and the document chunks.
emb_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

Before running the next cell, download [utils.py](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/utils.py) and save it in this notebook's working directory so that `from utils import *` can find it.

In [3]:
from utils import *

### Load Data

In [4]:
# `download_data` is not defined in this notebook (presumably it comes from the `utils` star import).
# The result maps chapter number (as a string) to notebook filename, as the next cell's output shows.
nbs = download_data()

In [5]:
nbs

{'1': '01_intro.ipynb',
 '2': '02_production.ipynb',
 '4': '04_mnist_basics.ipynb',
 '8': '08_collab.ipynb',
 '9': '09_tabular.ipynb',
 '10': '10_nlp.ipynb',
 '13': '13_convolutions.ipynb'}

In [6]:
# `benchmark` is indexed below as benchmark['questions'], a sequence of records
# with 'chapter' and 'question_text' fields.
benchmark = load_benchmark()
# `data` is keyed by chapter (e.g. data['1']); each value is later passed to process_documents.
data = get_data(nbs)

## Embed Questions and Documents

In [7]:
# Exploratory version of `prep_questions` (defined in the next cell):
# group question texts by chapter, keyed by str(chapter), with leading/trailing
# quote characters (" and ') stripped from each question.
questions = {}
for q in benchmark['questions']: questions.setdefault(str(q['chapter']), []).append(q["question_text"].strip('"\''))

In [8]:
def prep_questions(benchmark):
    """Group the benchmark's question texts by chapter.

    Args:
        benchmark: mapping with a 'questions' entry; each item provides
            'chapter' and 'question_text'.

    Returns:
        dict mapping str(chapter) to the list of that chapter's question
        texts, in benchmark order, with any leading/trailing double or
        single quote characters removed.
    """
    by_chapter = {}
    for item in benchmark['questions']:
        chapter_key = str(item['chapter'])
        # Create the chapter's list on first sight, then append to it.
        if chapter_key not in by_chapter:
            by_chapter[chapter_key] = []
        chapter_questions = by_chapter[chapter_key]
        chapter_questions.append(item["question_text"].strip('"\''))
    return by_chapter

In [9]:
# Rebuild `questions` through the function so the result matches the inline version above.
questions = prep_questions(benchmark)

In [10]:
# Sanity check: print the number of questions per chapter and verify the overall total.
qs_sum = 0
for c, qs in questions.items():
    print(c, len(qs))
    qs_sum += len(qs)
# 191 is the total number of questions expected across all chapters.
assert qs_sum == 191

1 30
2 26
4 31
8 23
9 27
10 20
13 34


In [11]:
# embed questions
qs = questions['1']
qs_embs = emb_model.encode(qs, convert_to_tensor=True)
qs_embs.shape

torch.Size([30, 384])

In [12]:
# prep data
chunk_size = 500
documents = process_documents(data['1'], chunk_size=chunk_size)
assert len(documents) == 57

In [13]:
# embed documents
data_embs = emb_model.encode(documents, convert_to_tensor=True)
data_embs.shape

torch.Size([57, 384])

## Top-k Documents by Cosine Similarity

In [14]:
# Insert singleton axes so questions and documents broadcast against each other pairwise.
qs_embs.unsqueeze(1).shape, data_embs.unsqueeze(0).shape

(torch.Size([30, 1, 384]), torch.Size([1, 57, 384]))

In [15]:
# compute cosine similarity
idxs = F.cosine_similarity(qs_embs.unsqueeze(1), data_embs.unsqueeze(0), dim=2).sort(descending=True)[1]
idxs.shape

torch.Size([30, 57])

In [16]:
# Number of highest-ranked chunks to keep per question.
topk = 10
topk

10

In [17]:
# Keep the first `topk` columns, i.e. the indices of the most similar chunks per question.
top_k_idxs = idxs[:, :topk]
top_k_idxs.shape

torch.Size([30, 10])

In [18]:
# Iterating the 2-D index tensor yields one row (one question's indices) at a time.
# NOTE(review): `_` is also IPython's "last output" variable; assigning to it here
# shadows that, and the next cell relies on this binding — consider a descriptive name.
_ = [row_idxs for row_idxs in top_k_idxs]
len(_)

30

In [19]:
_[0]

tensor([ 0, 41, 10, 13,  9,  5, 42, 11, 43,  6], device='cuda:0')

In [20]:
# Map each question's top-k document indices back to the chunk texts.
# tolist() converts the whole index tensor to nested Python ints in one call.
top_k_chunks = [
    [documents[doc_idx] for doc_idx in question_idxs]
    for question_idxs in top_k_idxs.tolist()
]
len(top_k_chunks)

30

In [22]:
len(top_k_chunks[0])

10

In [23]:
# No question may have more than `topk` retrieved chunks (fewer is allowed).
for o in top_k_chunks: assert len(o) <= topk

## Single Vector Search for all Chapters

In [24]:
# Start the all-chapters run from a fresh chapter -> questions mapping.
questions = prep_questions(benchmark)

In [25]:
# Embed each chapter's questions separately; `qs_embs` maps chapter key -> embedding tensor.
qs_embs = {}
for chapter, qs in questions.items():
    qs_embs[chapter] = emb_model.encode(qs, convert_to_tensor=True)
    # Print this iteration's own chapter key next to the shape so each line
    # is labelled with the chapter it belongs to.
    print(chapter, qs_embs[chapter].shape)

13 torch.Size([30, 384])
13 torch.Size([26, 384])
13 torch.Size([31, 384])
13 torch.Size([23, 384])
13 torch.Size([27, 384])
13 torch.Size([20, 384])
13 torch.Size([34, 384])


In [26]:
# Chunk and embed every chapter; both dicts are keyed by chapter.
# `chunk_size` is the value set in the earlier "prep data" cell.
data_embs = {}
all_docs = {}

for chapter, text in data.items():
    # NOTE: this rebinds the global `documents` from the chapter-'1' example above.
    documents = process_documents(text, chunk_size=chunk_size)
    embs = emb_model.encode(documents, convert_to_tensor=True)

    # Keep chunk texts and embeddings aligned by position so indices map back to text.
    all_docs[chapter] = documents
    data_embs[chapter] = embs

In [27]:
def single_vector_retrieval(nbs, all_docs, data_embs, qs_embs, topk=10, expected_total=191):
    """Retrieve the `topk` most similar chunks for every question, chapter by chapter.

    Args:
        nbs: mapping whose keys are the chapter identifiers to process; its
            iteration order determines the order of the returned results.
        all_docs: chapter -> list of chunk texts.
        data_embs: chapter -> 2-D tensor of chunk embeddings, row i belonging
            to all_docs[chapter][i].
        qs_embs: chapter -> 2-D tensor of question embeddings.
        topk: maximum number of chunks returned per question.
        expected_total: total number of questions the caller expects to have
            retrieved for. Defaults to 191 (the full benchmark, matching the
            previous hard-coded check); pass None to skip the check, e.g.
            when running on a subset of chapters.

    Returns:
        A flat list with one entry per question (chapters concatenated in
        `nbs` order); each entry is a list of at most `topk` chunk texts
        ordered from most to least cosine-similar.

    Raises:
        AssertionError: if the number of results differs from
            `expected_total`, or any result has more than `topk` chunks.
    """
    results = []
    for chapter in nbs.keys():
        # Broadcast (n_questions, 1, dim) against (1, n_docs, dim) to get an
        # (n_questions, n_docs) similarity matrix.
        sims = F.cosine_similarity(qs_embs[chapter].unsqueeze(1), data_embs[chapter].unsqueeze(0), dim=2)
        # sort(...)[1] holds the document indices ordered by descending similarity.
        idxs = sims.sort(descending=True)[1]
        # Slicing tolerates chapters with fewer than `topk` chunks.
        top_k_idxs = idxs[:, :topk]
        top_k_chunks = [[all_docs[chapter][idx.item()] for idx in row_idxs] for row_idxs in top_k_idxs]
        results.extend(top_k_chunks)

    if expected_total is not None:
        assert len(results) == expected_total
    for res in results: assert len(res) <= topk
    return results

In [28]:
# Run retrieval over all chapters with the default topk; one result list per benchmark question.
results = single_vector_retrieval(nbs, all_docs, data_embs, qs_embs)

In [29]:
# `score_retrieval` is not defined in this notebook (presumably it comes from `utils`).
# NOTE(review): appears to return per-question MRR and recall values; the next cell
# aggregates them with .sum() and .mean() — confirm against utils.py.
mrrs, recalls = score_retrieval(benchmark, results)

The assertion values below come from [these manual validation results](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/2024-12-13-fastbook-benchmark-results-MANUAL%20VALIDATION.xlsx) obtained by running [this notebook](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/2024_12_13_fastbook_benchmark_results.ipynb).

In [30]:
# Regression check: aggregated scores must match the manually validated reference
# values (approximate comparison via fastcore's is_close).
assert is_close(mrrs.sum(), 83.33968)
assert is_close(mrrs.mean(), 0.43633)
assert is_close(recalls.sum(), 154.38333)
assert is_close(recalls.mean(), 0.80828)