## Setup

In [None]:
!pip install ftfy -qq
!pip install llama-index -qq
!pip install -qq RAGatouille

import sqlite3
import json
import re
import os
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn.functional as F
from ftfy import fix_text
from ragatouille.data import CorpusProcessor
from llama_index.core.text_splitter import SentenceSplitter

corpus_processor = CorpusProcessor()

### Download Data

In [2]:
def download_file(url, fn):
    with open(fn, 'wb') as file: file.write(requests.get(url).content)

urls = {
    '1':  'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/data/01_intro.ipynb',
    '2':  'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/data/02_production.ipynb',
    '4':  'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/data/04_mnist_basics.ipynb',
    '8':  'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/data/08_collab.ipynb',
    '9':  'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/data/09_tabular.ipynb',
    '10': 'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/data/10_nlp.ipynb',
    '13': 'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/data/13_convolutions.ipynb'
}

nbs = {
    '1': '01_intro.ipynb',
    '2': '02_production.ipynb',
    '4': '04_mnist_basics.ipynb',
    '8': '08_collab.ipynb',
    '9': '09_tabular.ipynb',
    '10': '10_nlp.ipynb',
    '13': '13_convolutions.ipynb'
}

for chapter, nb in nbs.items(): download_file(urls[chapter], fn=nb)

### Helper Functions

In [4]:
def chunk_string(text, n_chunks):
    skip = int(len(text) / n_chunks)
    return [text[i:i + skip] for i in range(0, len(text), skip)]

In [5]:
def notebook_to_string(path):
    with open(path, 'r', encoding='utf-8') as f: notebook = json.load(f)

    all_text = ''

    for cell in notebook['cells']:
        if cell['cell_type'] == 'markdown' and any('## Questionnaire' in line for line in cell['source']): break
        if cell['cell_type'] in ['markdown', 'code']: all_text += ''.join(cell['source']) + '\n'

    return all_text

## Converting Notebook to Strings

In [6]:
data = {}
n_chars = 0

for chapter, nb in nbs.items():
    data[chapter] = chunk_string(notebook_to_string(nb), 2)
    for c in data[chapter]: n_chars += len(c)

assert n_chars == 503769

In [7]:
for chapter, nb in nbs.items():
    for c in data[chapter]: print(chapter, len(c))

1 55418
1 55418
2 37366
2 37366
2 1
4 45540
4 45540
4 1
8 20067
8 20067
8 1
9 44438
9 44438
10 22143
10 22143
13 26911
13 26911


## Chunking the Documents

In [8]:
chunk_size = 500
chunk_size

500

In [9]:
corpus_processor.process_corpus

In [10]:
documents = corpus_processor.process_corpus(data['1'], chunk_size=chunk_size)
assert len(documents) == 57

In [11]:
documents[0]

{'document_id': 'b031a720-b21a-4cd5-8c55-a8d99e50edcf',

In [13]:
def process_documents(text, chunk_size):
    documents = corpus_processor.process_corpus(text, chunk_size=chunk_size)
    documents = [doc['content'] for doc in documents]
    return documents

In [15]:
assert len(process_documents(data['1'], chunk_size)) == 57

### Checking Token Size

In [16]:
SentenceSplitter

In [17]:
chunk_overlap = min(chunk_size /4, min(chunk_size/2, 64))
chunk_overlap

64

In [18]:
node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
node_parser._token_size

In [19]:
doc = documents[0]
doc

{'document_id': 'b031a720-b21a-4cd5-8c55-a8d99e50edcf',

In [21]:
node_parser._token_size(doc['content'])

475

In [22]:
documents = process_documents(data['1'], chunk_size)
len(documents)

57

In [23]:
toks = []
for doc in documents: toks.append(node_parser._token_size(doc))

In [24]:
toks = pd.Series(toks)
toks.describe()

Unnamed: 0,0
count,57.0
mean,456.298246
std,63.525013
min,92.0
25%,457.0
50%,471.0
75%,482.0
max,495.0


### Chunking All Documents

In [25]:
def get_docs(data, chunk_size):
    all_docs = []
    for chapter, text in data.items():
        documents = process_documents(text, chunk_size=chunk_size)
        all_docs.extend(documents)
    return all_docs

In [27]:
all_docs = get_docs(data, 500)
assert len(all_docs) == 271

In [28]:
def tok_dist(data, chunk_size):
    toks = []
    documents = get_docs(data, chunk_size)
    for doc in documents: toks.append(node_parser._token_size(doc))
    toks = pd.Series(toks)
    return toks

In [29]:
tok_dist(data, chunk_size=100).describe()

Unnamed: 0,0
count,1558.0
mean,80.683569
std,15.52707
min,11.0
25%,73.0
50%,84.0
75%,92.0
max,121.0


In [30]:
tok_dist(data, chunk_size=256).describe()

Unnamed: 0,0
count,602.0
mean,223.538206
std,31.262019
min,43.0
25%,218.0
50%,233.0
75%,243.0
max,299.0
