In [None]:
!pip install sentence-transformers -Uqq
!pip install -qq RAGatouille
!pip install ftfy -qq

import sqlite3
import json
import re
import os
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn.functional as F
from ftfy import fix_text
from sentence_transformers import SentenceTransformer
from ragatouille import RAGPretrainedModel
from ragatouille.data import CorpusProcessor

# Module-level singletons shared by the retrieval functions defined in this notebook.
# Its process_corpus() is called on each chapter's text with the requested chunk_size.
corpus_processor = CorpusProcessor()
# Embedding model used by single_vector_retrieval to encode both questions and chunks.
emb_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

In [None]:
# Maps the local filename to save each chapter notebook under -> its download URL.
urls = {
    '01_intro.ipynb': 'https://drive.google.com/uc?export=view&id=1mmBjFH_plndPBC4iRZHChfMazgBxKK4_',
    '02_production.ipynb': 'https://drive.google.com/uc?export=view&id=1Cf5QHthHy1z13H0iu3qrzAWgquCfqVHk',
    '04_mnist_basics.ipynb': 'https://drive.google.com/uc?export=view&id=113909_BNulzyLIKUNJHdya0Hhoqie30I',
    '08_collab.ipynb': 'https://drive.google.com/uc?export=view&id=1BtvStgFjUtvtqbSZNrL7Y2N-ey3seNZU',
    '09_tabular.ipynb': 'https://drive.google.com/uc?export=view&id=1rHFvwl_l-AJLg_auPjBpNrOgG9HDnfqg',
    '10_nlp.ipynb': 'https://drive.google.com/uc?export=view&id=1pg1pH7jMMElzrXS0kBBz14aAuDsi2DEP',
    '13_convolutions.ipynb': 'https://drive.google.com/uc?export=view&id=19P-eEHpAO3WrOvdxgXckyhHhfv_R-hnS'
}

def download_file(url, filename, timeout=60):
    """Download `url` and write the response body to `filename`.

    Args:
        url: address to fetch with an HTTP GET request.
        filename: path of the file to (over)write with the response bytes.
        timeout: seconds to wait for the server before giving up. Without an
            explicit timeout `requests` can wait indefinitely, which would
            hang the notebook on a stalled connection.

    Returns:
        None. A success or failure message is printed instead.

    Raises:
        requests.RequestException: if the request itself fails (connection
            error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)

    # Only a 200 response is saved; anything else is reported and skipped.
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f"File downloaded successfully: {filename}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

# Fetch every chapter notebook; filenames are relative, so files land in the working directory.
for fname, url in urls.items():
  download_file(url, fname)

File downloaded successfully: 01_intro.ipynb
File downloaded successfully: 02_production.ipynb
File downloaded successfully: 04_mnist_basics.ipynb
File downloaded successfully: 08_collab.ipynb
File downloaded successfully: 09_tabular.ipynb
File downloaded successfully: 10_nlp.ipynb
File downloaded successfully: 13_convolutions.ipynb


In [None]:
# Maps chapter number (as a string key) -> local notebook filename for that chapter.
nbs = {
    '1': '01_intro.ipynb',
    '2': '02_production.ipynb',
    '4': '04_mnist_basics.ipynb',
    '8': '08_collab.ipynb',
    '9': '09_tabular.ipynb',
    '10': '10_nlp.ipynb',
    '13': '13_convolutions.ipynb'
}

In [None]:
url = 'https://gist.githubusercontent.com/vishalbakshi/2c22ca69ac7bc4bc845052c1b9d949c8/raw/d498259f2fc75d27c485ddc73933f145987feef3/cs_bm25_baselines.csv'

# Keep only answerable questions and the columns used downstream.
# Leading/trailing single and double quotation marks are stripped from the question text
# as these affect embeddings/cosine similarity: https://vishalbakshi.github.io/blog/posts/2024-11-08-punctuation-cosine-similarity/
# The cleanup happens via .assign() inside the chain so the column is written to a
# new frame instead of to a slice of the filtered frame (avoids SettingWithCopyWarning).
questions = (
    pd.read_csv(url)
    .query("is_answerable == 1")
    [["chapter", "question_number", "question_text", "answer", "keywords"]]
    .assign(question_text=lambda df: df["question_text"].str.strip('"\''))
)
assert questions.shape == (191,5)

In [None]:
# Fetch the benchmark JSON into the working directory under the name load_benchmark() reads.
download_file(
    "https://gist.githubusercontent.com/vishalbakshi/a507b6e9e893475e93a4141e96b8947d/raw/e32835ba1dbf94384943ed5a65404112e1c89df2/fastbook-benchmark.json",
    "fastbook-benchmark.json"
    )

def load_benchmark(path='fastbook-benchmark.json'):
    """Read the benchmark JSON file and return the parsed object.

    Args:
        path: location of the benchmark file; defaults to the filename the
            notebook downloads it to.

    Returns:
        The decoded JSON content (a fresh object on every call).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # matching how notebook_to_string opens its input.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load the benchmark and confirm it contains the expected 191 questions.
benchmark = load_benchmark()
assert len(benchmark['questions']) == 191

File downloaded successfully: fastbook-benchmark.json


In [None]:
def load_data(chunks, db_path, chapter=1):
    """Insert `chunks` for one chapter into the `fastbook_text` FTS5 table.

    The table is created if it does not exist yet. After inserting, the number
    of rows stored for `chapter` is compared with `len(chunks)`.

    Args:
        chunks: sequence of text chunks to insert (its `len()` is used).
        db_path: path of the SQLite database file.
        chapter: value written to the `chapter` column for every chunk.

    Returns:
        True if the rows were inserted and the stored row count for `chapter`
        equals `len(chunks)`; False otherwise (the error is printed). Note the
        inserts are committed before the count is checked.
    """
    from contextlib import closing

    try:
        # `with sqlite3.connect(...)` only wraps a transaction and leaves the
        # connection open; closing() guarantees the handle is released.
        with closing(sqlite3.connect(db_path)) as conn:
            cur = conn.cursor()

            # Key creation on the table rather than on the file existing, so a
            # pre-existing database file without the table also works.
            cur.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS fastbook_text
            USING FTS5(chapter, text);
            """)

            cur.executemany(
                "INSERT INTO fastbook_text(chapter, text) VALUES (?, ?)",
                [(chapter, chunk) for chunk in chunks],
            )
            conn.commit()

            # Count in SQL instead of fetching every row back.
            n_rows = cur.execute(
                "SELECT COUNT(*) FROM fastbook_text WHERE chapter = ?", (chapter,)
            ).fetchone()[0]

        # make sure all the data was loaded into the database
        if n_rows != len(chunks):
            raise ValueError(f"Number of inserted chunks ({n_rows}) doesn't match input chunks ({len(chunks)})")

        return True

    except sqlite3.Error as e:
        print(f"An error occurred: {e}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False

In [None]:
def db_search(df, limit=1, db_path='fastbook.db'):
    """Run one FTS5 keyword query per row of `df` and return the matching chunk texts.

    Args:
        df: DataFrame with a `keywords` column (whitespace-separated keywords,
            optionally quoted / comma-terminated) and a `chapter` column.
        limit: maximum number of chunks returned per row.
        db_path: SQLite database containing the `fastbook_text` table.

    Returns:
        A list with one entry per row of `df`; each entry is the list of
        matching `text` values ordered by `rank`, restricted to the row's chapter.
    """
    from contextlib import closing

    # The statement is constant, so build it once outside the row loop.
    query = """
        SELECT text, rank
        FROM fastbook_text
        WHERE fastbook_text MATCH ?
        AND chapter = ?
        ORDER BY rank
        LIMIT ?
        """

    results = []
    # closing() releases the connection; sqlite3's own context manager only
    # manages the transaction.
    with closing(sqlite3.connect(db_path)) as conn:
        cur = conn.cursor()
        for _, row in df.iterrows():
            # Build the match expression: "keyword1" OR "keyword2" OR ...
            keywords = ' OR '.join(
                f'"{keyword.strip(",")}"'
                for keyword in row['keywords'].replace('"', '').split()
            )
            rows = cur.execute(query, (keywords, str(row['chapter']), limit)).fetchall()
            # keep only the chunk text of each hit; the hits stay a list per question
            results.append([item[0] for item in rows])

    return results

In [None]:
def fts_retrieval(data, df, chunk_size):
    """Rebuild the `fastbook.db` FTS index from `data` and run keyword retrieval for `df`.

    Args:
        data: dict mapping chapter -> list of chapter text pieces.
        df: questions DataFrame passed to db_search (needs `keywords` and `chapter`).
        chunk_size: chunk size forwarded to corpus_processor.process_corpus.

    Returns:
        (results, documents): `results` holds up to 10 retrieved chunks per row
        of `df`; `documents` holds the chunks of the LAST chapter processed only
        (it is reassigned on every iteration), or [] when `data` is empty.
    """
    # Start from a clean database so chunks from earlier runs are not mixed in.
    if os.path.exists("fastbook.db"):
        os.remove("fastbook.db")

    documents = []
    for chapter, chunks in data.items():
        documents = [
            doc['content']
            for doc in corpus_processor.process_corpus(chunks, chunk_size=chunk_size)
        ]
        # Call outside the assert: under `python -O` asserts are stripped, which
        # would otherwise silently skip loading the data.
        loaded = load_data(documents, 'fastbook.db', chapter)
        assert loaded, f"failed to load chapter {chapter} into fastbook.db"

    results = db_search(df, limit=10)
    # One result list per question row, whatever the size of `df`.
    assert len(results) == len(df)

    for res in results:
        assert len(res) <= 10

    return results, documents

In [None]:
def single_vector_retrieval(data, benchmark, chunk_size):
    """Retrieve the 10 most cosine-similar chunks for every benchmark question.

    Questions are only compared against chunks of their own chapter.

    Args:
        data: dict mapping chapter (string key) -> list of chapter text pieces.
        benchmark: dict with a "questions" list; each question provides
            "chapter" and "question_text".
        chunk_size: chunk size forwarded to corpus_processor.process_corpus.

    Returns:
        (results, documents): `results` has one list of up to 10 chunks per
        question, emitted chapter by chapter in `data` order; `documents` holds
        the chunks of the LAST chapter processed only (or [] for empty `data`).

    NOTE(review): score_retrieval pairs results[i] with benchmark["questions"][i],
    so this presumably relies on the benchmark being ordered by chapter in the
    same order as `data` — confirm.
    """
    # Group question texts by chapter (surrounding quotes stripped), keeping benchmark order.
    questions = {}
    for q in benchmark["questions"]:
        questions.setdefault(str(q["chapter"]), []).append(q['question_text'].strip('"\''))

    results = []
    documents = []
    for chapter, chunks in data.items():
        # Embed this chapter's questions
        q_embs = emb_model.encode(questions[chapter], convert_to_tensor=True)

        # chunk chapter text
        documents = [
            doc['content']
            for doc in corpus_processor.process_corpus(chunks, chunk_size=chunk_size)
        ]

        # Embed documents
        data_embs = emb_model.encode(documents, convert_to_tensor=True)

        # Pairwise cosine similarity (questions x chunks), then chunk indices
        # sorted from most to least similar for each question.
        idxs = F.cosine_similarity(q_embs.unsqueeze(1), data_embs.unsqueeze(0), dim=2).sort(descending=True)[1]
        top_10_idxs = idxs[:, :10]  # Get the top 10 indices for each row

        # Extract top 10 chunks for each row
        results.extend(
            [documents[idx.item()] for idx in row_idxs]
            for row_idxs in top_10_idxs
        )

    # Every benchmark question must have received a result list.
    assert len(results) == len(benchmark["questions"])

    for res in results:
        assert len(res) <= 10

    return results, documents

In [None]:
def index_free_retrieval(data, model_nm, chunk_size, benchmark):
    """Retrieve up to 10 chunks per benchmark question with a RAGatouille model.

    Each chapter's chunks are encoded in memory and searched with that
    chapter's questions only.

    Args:
        data: dict mapping chapter (string key) -> list of chapter text pieces.
        model_nm: model name passed to RAGPretrainedModel.from_pretrained.
        chunk_size: chunk size forwarded to corpus_processor.process_corpus.
        benchmark: dict with a "questions" list; each question provides
            "chapter" and "question_text".

    Returns:
        (final_results, documents): `final_results` has one list of retrieved
        chunk texts per question, emitted chapter by chapter in `data` order;
        `documents` is the process_corpus output (dicts with a 'content' key)
        of the LAST chapter processed only, or [] for empty `data`.
    """
    questions_by_chapter = {}
    for q in benchmark["questions"]:
        questions_by_chapter.setdefault(str(q["chapter"]), []).append(q)

    final_results = []
    documents = []

    # Iterate over the chapters actually passed in (not the global `nbs`), so the
    # function depends only on its arguments, like single_vector_retrieval.
    for chapter, chunks in data.items():
        # instantiate new RAG object for every chapter
        # NOTE(review): presumably so documents encoded for earlier chapters
        # are not searched — confirm against RAGatouille's encode() behaviour.
        RAG = RAGPretrainedModel.from_pretrained(model_nm)

        # encode chapter documents
        documents = corpus_processor.process_corpus(chunks, chunk_size=chunk_size)
        RAG.encode(
            [x['content'] for x in documents],
            document_metadatas=[{"chapter": chapter} for _ in documents],
        )

        # Never ask for more results than there are chunks.
        top_k = min(10, len(documents))

        # Perform retrieval for each question in this chapter
        for q in questions_by_chapter[chapter]:
            retrieved = RAG.search_encoded_docs(query=q["question_text"].strip('"\''), k=top_k)
            assert len(retrieved) <= 10
            final_results.append([r['content'] for r in retrieved])

    # Every benchmark question must have received a result list.
    assert len(final_results) == len(benchmark["questions"])
    return final_results, documents

In [None]:
def calculate_mrr(question, retrieved_passages, cutoff=10):
    """Reciprocal rank at which ALL answer components of `question` are covered.

    For every entry of question["answer_context"], the rank (1-based) of the
    first passage containing any of its "context" strings is found. The score
    is 1 / (largest of those ranks).

    Args:
        question: dict with an "answer_context" list; each entry may carry a
            "context" list of strings.
        retrieved_passages: ranked list of passage strings.
        cutoff: only the first `cutoff` passages are considered.

    Returns:
        1/rank as a float, or 0.0 if any component is not found within the
        cutoff (or if there are no components).
    """
    # Normalize each passage once instead of once per (component, context) pair.
    passages = [fix_text(passage) for passage in retrieved_passages[:cutoff]]
    highest_rank = 0

    for ans_comp in question["answer_context"]:
        contexts = [fix_text(context) for context in ans_comp.get("context", [])]

        # Rank of the first passage that contains any context of this component.
        first_hit = next(
            (
                rank
                for rank, passage in enumerate(passages, start=1)
                if any(context in passage for context in contexts)
            ),
            None,
        )

        # A single missing component zeroes the score.
        if first_hit is None:
            return 0.0
        highest_rank = max(highest_rank, first_hit)

    return 1.0/highest_rank if highest_rank > 0 else 0.0

In [None]:
def calculate_recall(question, retrieved_passages, cutoff=10):
    """Fraction of answer components found in the top `cutoff` passages.

    A component (entry of question["answer_context"]) counts as found when at
    least one of its "context" strings occurs in at least one passage.

    Args:
        question: dict with an "answer_context" list; each entry may carry a
            "context" list of strings.
        retrieved_passages: ranked list of passage strings.
        cutoff: only the first `cutoff` passages are considered.

    Returns:
        A float in [0, 1]; 0.0 when the question has no answer components
        (instead of dividing by zero).
    """
    components = question["answer_context"]
    if not components:
        return 0.0

    # Normalize each passage once instead of once per (component, context) pair.
    passages = [fix_text(passage) for passage in retrieved_passages[:cutoff]]

    n_found = 0
    for ans_comp in components:
        contexts = [fix_text(context) for context in ans_comp.get("context", [])]
        # Found if any context of this component appears in any passage.
        if any(context in passage for passage in passages for context in contexts):
            n_found += 1

    # Recall is ratio of answer components with at least one found context
    return n_found / len(components)

In [None]:
def score_retrieval(results, benchmark):
    """Score every benchmark question against its retrieved passages.

    `results[i]` is scored against `benchmark["questions"][i]`, so both must be
    in the same order.

    Returns:
        (q_mrr, q_recall): two lists with one MRR@10 / Recall@10 value per question.
    """
    all_questions = benchmark["questions"]
    n_questions = len(all_questions)

    q_mrr, q_recall = [], []
    for position, question in enumerate(all_questions):
        passages = results[position]
        q_mrr.append(calculate_mrr(question, passages, cutoff=10))
        q_recall.append(calculate_recall(question, passages, cutoff=10))

    # one score of each kind per question
    assert len(q_mrr) == n_questions
    assert len(q_recall) == n_questions

    return q_mrr, q_recall

In [None]:
def save_results(results, df, q_mrr, q_recall, name):
    """Add retrieval output and per-question scores to `df` as new columns.

    Three columns are written: `{name}_retrieval` (each question's passages
    joined by a blank line), `{name}_mrr10` and `{name}_recall10`.

    Args:
        results: list with one list of passage strings per row of `df`.
        df: DataFrame to extend; it is modified in place.
        q_mrr: per-question MRR values, one per row.
        q_recall: per-question recall values, one per row.
        name: prefix used for the new column names.

    Returns:
        The same `df` object, with the new columns added.
    """
    flat_results = ['\n\n'.join(res) for res in results]

    # One entry per row of the frame being extended (not a fixed benchmark size).
    assert len(flat_results) == len(df)

    df[f'{name}_retrieval'] = flat_results
    df[f'{name}_mrr10'] = q_mrr
    df[f'{name}_recall10'] = q_recall

    return df

In [None]:
def do_retrieval(method, chunk_size, data, benchmark, questions=None, benchmark_results=None):
    """Dispatch to the retrieval function for `method` and return its output.

    Scoring is intentionally not done here: the previous implementation
    computed MRR/recall and discarded them. Use score_retrieval on the
    returned results instead.

    Args:
        method: one of "bm25", "single_vector", "colbertv2", "answerai_colbert".
        chunk_size: chunk size forwarded to the retrieval function.
        data: dict mapping chapter -> list of chapter text pieces.
        benchmark: benchmark dict (used by every method except "bm25").
        questions: questions DataFrame; required for "bm25".
        benchmark_results: unused; kept so existing calls keep working.

    Returns:
        (results, docs) exactly as returned by the selected retrieval function.

    Raises:
        ValueError: for an unknown `method`, or "bm25" without `questions`.
    """
    colbert_models = {
        "colbertv2": "colbert-ir/colbertv2.0",
        "answerai_colbert": "answerdotai/answerai-colbert-small-v1",
    }

    if method == "bm25":
        if questions is None:
            raise ValueError('method "bm25" requires the `questions` DataFrame')
        return fts_retrieval(data, questions, chunk_size)

    if method == "single_vector":
        return single_vector_retrieval(data, benchmark, chunk_size)

    if method in colbert_models:
        return index_free_retrieval(
            data=data,
            model_nm=colbert_models[method],
            chunk_size=chunk_size,
            benchmark=benchmark,
        )

    # Previously an unknown method fell through to an unbound `results` variable.
    raise ValueError(f"Unknown retrieval method: {method!r}")

In [None]:
def notebook_to_string(path):
    """Concatenate a notebook's markdown and code cells up to the questionnaire.

    Cells are read in order; reading stops at the first markdown cell that has
    a line containing '## Questionnaire' (that cell is excluded). Each kept
    cell contributes its source followed by a newline.

    Args:
        path: path of the .ipynb file (JSON, read as UTF-8).

    Returns:
        The concatenated text of the kept cells.
    """
    with open(path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    parts = []
    for cell in notebook['cells']:
        source = cell['source']
        # The notebook format allows `source` to be one string or a list of
        # strings; normalize to lines so the marker check does not iterate
        # over single characters.
        lines = source.splitlines(keepends=True) if isinstance(source, str) else source

        if cell['cell_type'] == 'markdown' and any('## Questionnaire' in line for line in lines):
            break

        if cell['cell_type'] in ['markdown', 'code']:
            parts.append(''.join(lines) + '\n')

    # Join once at the end instead of growing a string inside the loop.
    return ''.join(parts)

In [None]:
def chunk_string(text, n):
    """Split text into at most n contiguous chunks of near-equal length.

    The chunk length is ceil(len(text) / n), so joining the chunks always
    reproduces `text` and no extra remainder chunk is produced. Fewer than n
    chunks are returned when the text is too short to fill them.

    Args:
        text: string to split.
        n: maximum number of chunks; must be at least 1.

    Returns:
        List of chunks in order; [] for empty text.

    Raises:
        ValueError: if n is less than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    if not text:
        return []

    # Ceiling division: flooring here would leave a remainder that becomes an
    # (n+1)-th chunk, and gives a zero step when len(text) < n.
    size = int(-(-len(text) // n))
    return [text[i:i + size] for i in range(0, len(text), size)]

In [None]:
def clean_html(text):
    """Remove HTML tags from `text` while keeping `<<...>>` references intact.

    Args:
        text: string that may contain HTML tags and double-bracketed
            references such as `<<chapter_intro>>`.

    Returns:
        `text` without HTML tags. Every `<<...>>` reference is restored to its
        own original content (a reference located inside a removed tag is
        removed together with that tag).
    """
    import uuid

    # Step 1: Swap each double-bracketed reference for its own numbered
    # placeholder so the tag regex below cannot match inside it. A shared
    # placeholder would make every reference come back as the first one.
    token = f"PLACEHOLDER_{uuid.uuid4()}"
    stashed = []

    def _stash(match):
        stashed.append(match.group(0))
        return f"{token}_{len(stashed) - 1}_"

    step1 = re.sub(r'<<[^>]*>>', _stash, text)

    # Step 2: Remove HTML tags
    step2 = re.sub(r'<[/]?[a-zA-Z][^>]*>', '', step1)

    # Step 3: Restore each reference by its index. A function is used as the
    # replacement so backslashes in the reference are not treated as escapes.
    return re.sub(
        re.escape(token) + r'_(\d+)_',
        lambda match: stashed[int(match.group(1))],
        step2,
    )

In [None]:
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` replaced by a space.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are kept.
    """
    import string
    # Translation table mapping each ASCII punctuation character to a single space.
    to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(to_space)

In [None]:
def process_contexts(data):
    """Strip punctuation from every answer-context string of the benchmark.

    Only the 'context' value of each 'answer_context' entry is touched; it may
    be a single string or a list of strings (empty/None list items are left
    as they are). `data` is modified in place and also returned.
    """
    for question in data['questions']:
        # questions without answer contexts have nothing to clean
        if 'answer_context' not in question:
            continue

        for entry in question['answer_context']:
            if 'context' not in entry:
                continue

            current = entry['context']
            if isinstance(current, str):
                # single string: clean it directly
                entry['context'] = remove_punctuation(current)
            elif isinstance(current, list):
                # list: clean each non-empty item
                cleaned = []
                for text in current:
                    cleaned.append(remove_punctuation(text) if text else text)
                entry['context'] = cleaned

    return data

## Background

In this notebook, I'm running all retrieval methods so that I can manually check their MRR@10 and Recall@10 and compare my manual values against the ones computed by my scoring functions. I'll start by performing retrieval on the chapter text as is (with punctuation) so that the retrieved passages are easier to read.

In [None]:
# Chapter text without preprocessing: each notebook is flattened to a string and
# split into pieces with chunk_string(..., 2), keyed by chapter.
data_no_pp = {
    chapter: chunk_string(notebook_to_string(nb), 2)
    for chapter, nb in nbs.items()
}

# Total number of characters across all pieces of all chapters.
n_chars = sum(len(piece) for pieces in data_no_pp.values() for piece in pieces)

assert n_chars == 503769

In [None]:
# Raw benchmark, left untouched.
benchmark = load_benchmark()
assert len(benchmark['questions']) == 191

# process_contexts modifies its argument in place and returns it, so hand it a
# freshly loaded benchmark; passing `benchmark` would make both names refer to
# the same (punctuation-stripped) object.
processed_benchmark = process_contexts(load_benchmark())
assert len(processed_benchmark['questions']) == 191

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no other visible cell reads from or writes to /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work on a copy so result columns are never added to `questions` itself.
benchmark_results = questions.copy()
benchmark_results.shape

(191, 5)

In [None]:
# Each list collects one (method, chunk_size, payload) tuple per retrieval run.
results, documents = [], []

for method in ["bm25", "single_vector", "colbertv2", "answerai_colbert"]:
    for chunk_size in [500]:
        # A freshly loaded benchmark is passed to every run.
        res, docs = do_retrieval(
            method=method,
            chunk_size=chunk_size,
            data=data_no_pp,
            benchmark=load_benchmark(),
            questions=questions,
        )

        results.append((method, chunk_size, res))
        documents.append((method, chunk_size, docs))

In [None]:
# One entry per (method, chunk_size) run.
len(results) # four methods, 1 chunk size

4

In [None]:
# Each entry is the tuple appended by the retrieval loop.
len(results[0]) # method, chunk_size, res

3

In [None]:
# Method name and chunk size of the fourth (last) run.
results[3][0], results[3][1]

('answerai_colbert', 500)

In [None]:
# Start again from a clean copy of `questions` before the score columns are added.
benchmark_results = questions.copy()
benchmark_results.shape

(191, 5)

In [None]:
# Score every run against a freshly loaded benchmark and store, per run, the
# retrieved passages plus the per-question MRR@10 and Recall@10 as new columns.
benchmark = load_benchmark()
for method, chunk_size, res in results:
    # only runs with the chunk size(s) of interest are scored
    if chunk_size not in [500]:
        continue

    q_mrr, q_recall = score_retrieval(res, benchmark)
    benchmark_results[f'{method}_{chunk_size}'] = [
        '\n\n================================================\n'.join(r)
        for r in res
    ]
    benchmark_results[f'{method}_{chunk_size}_mrr10'] = q_mrr
    benchmark_results[f'{method}_{chunk_size}_recall10'] = q_recall

In [None]:
# Write questions, retrieved passages, and scores to a CSV in the working directory (index column omitted).
benchmark_results.to_csv("2024-12-13-fastbook-benchmark-results.csv", index=False)