## Setup

In [None]:
!pip install llama-index -qq
!pip install -qq RAGatouille
!pip install ftfy -qq

import sqlite3
import json
import re
import os
import pandas as pd, numpy as np
import requests
from ftfy import fix_text
from ragatouille.data import CorpusProcessor
from llama_index.core.text_splitter import SentenceSplitter

# Instantiate RAGatouille's CorpusProcessor with its default arguments.
# NOTE(review): `corpus_processor` is not referenced by any other cell shown in
# this notebook — confirm whether it is still needed.
corpus_processor = CorpusProcessor()

Make sure to download [utils.py](https://github.com/vishalbakshi/fastbook-benchmark/blob/main/examples/utils.py) and save it in the same directory as this notebook, so that the `from utils import *` cell below can find it.

In [2]:
from utils import *

### Download Data

In [19]:
# download_data() comes from utils.py (star-imported above). Its return value is
# displayed below and is passed straight to get_data() in the next cell.
nbs = download_data()
nbs

{'1': '01_intro.ipynb',
 '2': '02_production.ipynb',
 '4': '04_mnist_basics.ipynb',
 '8': '08_collab.ipynb',
 '9': '09_tabular.ipynb',
 '10': '10_nlp.ipynb',
 '13': '13_convolutions.ipynb'}

In [20]:
# get_data() also comes from utils.py. Later cells index `data` by chapter key
# (data['1']) and iterate data.items() as (chapter, text) pairs.
data = get_data(nbs)

## Load Database

In [21]:
# Notebook-wide settings used by the cells below.
chunk_size = 500  # forwarded to process_documents(..., chunk_size=chunk_size)
db_path = 'fastbook.db'  # SQLite database file created and queried by this notebook
chapter = '1'  # chapter key used for the first, manual load
chunk_size, db_path, chapter

(500, 'fastbook.db', '1')

In [25]:
# Create the FTS5 virtual table that holds one row per text chunk.
# `IF NOT EXISTS` makes the cell safe to re-run and also covers a database file
# that is already on disk but does not contain the table yet; a file-existence
# check alone would skip creation in that case.
conn = sqlite3.connect(db_path)
try:
    conn.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS fastbook_text
            USING FTS5(chapter, text);
            """)
    conn.commit()
finally:
    # `with sqlite3.connect(...)` only manages the transaction; close explicitly
    # so no handle to the file stays open when the file is deleted later.
    conn.close()

In [26]:
# Split chapter 1 into chunks with process_documents() (from utils.py).
# 57 is the chunk count expected here with chunk_size = 500; presumably it needs
# updating if chunk_size or the downloaded notebook changes — TODO confirm.
documents = process_documents(data['1'], chunk_size=chunk_size)
assert len(documents) == 57

In [27]:
# load in the chunks for each chapter
with sqlite3.connect(db_path) as conn:
    cur = conn.cursor()
    for doc in documents: cur.execute("INSERT INTO fastbook_text(chapter, text) VALUES (?, ?)", (chapter, doc))
    conn.commit()
    res = cur.execute("SELECT * FROM fastbook_text WHERE chapter = ?", (chapter,)).fetchall()
    assert len(res) == len(documents)

In [28]:
# Peek at the first stored row; rows come back as (chapter, text) tuples.
res[0]

('1',

In [29]:
# The first chunk returned by process_documents(), for comparison with the stored row.
documents[0]



In [30]:
def delete_db(db_path="fastbook.db"):
    """Delete the SQLite database file if it exists.

    Args:
        db_path: Path of the database file to remove. Defaults to
            ``"fastbook.db"`` so that existing ``delete_db()`` calls keep
            their behaviour.

    Returns:
        None. A path that does not exist is silently ignored.
    """
    if os.path.exists(db_path):
        os.remove(db_path)

In [31]:
def load_data(documents, db_path, chapter):
    """Insert the chunks of one chapter into the ``fastbook_text`` FTS5 table.

    The table is created on first use. All inserts and the row-count check run in
    a single transaction, so a failed check leaves the database unchanged.

    Args:
        documents: Sequence of text chunks to store (``len()`` is called on it).
        db_path: Path of the SQLite database file.
        chapter: Chapter identifier stored alongside every chunk.

    Returns:
        True when the chapter was loaded.

    Raises:
        AssertionError: If, after inserting, the number of rows stored for
            ``chapter`` differs from ``len(documents)`` — for example because
            the chapter had already been loaded.
    """
    insert_sql = "INSERT INTO fastbook_text(chapter, text) VALUES (?, ?)"
    count_sql = "SELECT count(*) FROM fastbook_text WHERE chapter = ?"

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back if anything below raises.
        with conn:
            cur = conn.cursor()
            # IF NOT EXISTS also covers a database file that is already on disk
            # but does not contain the table yet.
            cur.execute("""
                    CREATE VIRTUAL TABLE IF NOT EXISTS fastbook_text
                    USING FTS5(chapter, text);
                    """)
            cur.executemany(insert_sql, [(chapter, doc) for doc in documents])
            (stored,) = cur.execute(count_sql, (chapter,)).fetchone()
            # Verify before the commit so that a mismatch undoes the inserts.
            assert stored == len(documents), (
                f"chapter {chapter!r}: expected {len(documents)} rows, found {stored}"
            )
    finally:
        # The context manager above does not close the connection.
        conn.close()

    return True

In [32]:
# Rebuild the database from scratch: one load_data() call per chapter.
# Dedicated loop names keep the loop from overwriting the notebook-level
# `chapter` and `documents` variables assigned in earlier cells.
delete_db()
for chapter_key, chapter_text in data.items():
    chapter_docs = process_documents(chapter_text, chunk_size=chunk_size)
    assert load_data(chapter_docs, db_path, chapter_key)

## Full Text Search

### Load Keywords

In [33]:
# Keyword list for the benchmark questions, read straight from the
# fastbook-benchmark repository. The shape assertion fails loudly if the remote
# CSV no longer has the expected 191 rows and 4 columns.
url = 'https://raw.githubusercontent.com/vishalbakshi/fastbook-benchmark/refs/heads/main/examples/fts_keywords.csv'
kw_df = pd.read_csv(url)
assert kw_df.shape == (191, 4)
kw_df.head()

Unnamed: 0,chapter,question_number,question_text,keywords
0,1,1,"""Do you need these for deep learning?\n\n- Lot...","""deep learning, math, data, computers, PhD"""
1,1,2,"""Name five areas where deep learning is now th...","deep learning, areas, best, world"
2,1,3,"""What was the name of the first device that wa...","""neuron, neurons, device, artificial, principle"""
3,1,4,"""Based on the book of the same name, what are ...","""parallel, distributed, processing, PDP, requi..."
4,1,5,"""What were the two theoretical misunderstandin...","""neural, networks, theoretical, misunderstandi..."


### Prepare Match String

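Goal: turn the comma-separated keyword string into an FTS5 match expression in which every individual word is quoted and the words are joined with `OR`: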

> "deep" OR "learning" OR "math" OR "data" OR "computers" OR "PhD"

In [34]:
# Work through a single question first: take the first row of the keyword table.
row = kw_df.iloc[0]
row

Unnamed: 0,0
chapter,1
question_number,1
question_text,"""Do you need these for deep learning?\n\n- Lot..."
keywords,"""deep learning, math, data, computers, PhD"""


In [35]:
# Chapter of this question. It is a number here (see the output), whereas `data`
# is indexed with string keys such as '1'.
chapter = row['chapter']
chapter

1

In [36]:
# Question text, shown for context; the cells below build the search from
# the keywords only.
question_text = row['question_text']
question_text

'"Do you need these for deep learning?\\n\\n- Lots of math T / F\\n   - Lots of data T / F\\n   - Lots of expensive computers T / F\\n   - A PhD T / F"'

In [37]:
# Raw keyword string for this question; note that it still contains literal
# double-quote characters.
keywords = row['keywords']
keywords

'"deep learning, math, data, computers, PhD"'

In [38]:
# Drop the literal double quotes and split on whitespace, so multi-word keywords
# become separate terms (trailing commas are removed in the next cell).
# Reading from `row` instead of re-assigning `keywords` from itself keeps the
# cell re-runnable: a second run would otherwise call .replace() on a list.
keywords = row['keywords'].replace('"', '').split()
keywords

['deep', 'learning,', 'math,', 'data,', 'computers,', 'PhD']

In [39]:
# Strip surrounding commas from every term, wrap it in double quotes and join
# the quoted terms with OR to form the FTS5 match expression.
matchstr = " OR ".join('"%s"' % kw.strip(",") for kw in keywords)
matchstr

'"deep" OR "learning" OR "math" OR "data" OR "computers" OR "PhD"'

### Run Search

In [40]:
# Parameterised FTS5 query. The three `?` placeholders are bound, in order, to the
# match expression, the chapter and the row limit. A plain string is used because
# nothing is interpolated into the SQL text.
q = """
        SELECT text, rank
        FROM fastbook_text
        WHERE fastbook_text MATCH ?
        AND chapter = ?
        ORDER BY rank
        LIMIT ?
        """

In [41]:
# Maximum number of chunks to retrieve for a question.
limit = 10
limit

10

In [42]:
# Run the search for the single example question.
conn = sqlite3.connect(db_path)
try:
    cur = conn.cursor()
    # `chapter` was read from the DataFrame as a number, so it is converted to a
    # string before being compared with the stored chapter value.
    res = cur.execute(q, (matchstr, str(chapter), limit)).fetchall()
finally:
    # A `with sqlite3.connect(...)` block would leave the connection open.
    conn.close()
# Keep only the chunk text of every (text, rank) row.
res = [item[0] for item in res]

In [43]:
# Show the second chunk (index 1) returned for the example question.
print(res[1])

#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
#hide
from fastbook import *
# Your Deep Learning Journey
Hello, and thank you for letting us join you on your deep learning journey, however far along that you may be! In this chapter, we will tell you a little bit more about what to expect in this book, introduce the key concepts behind deep learning, and train our first models on different tasks. It doesn't matter if you don't come from a technical or a mathematical background (though it's okay if you do too!); we wrote this book to make deep learning accessible to as many people as possible.
## Deep Learning Is for Everyone
A lot of people assume that you need all kinds of hard-to-find stuff to get great results with deep learning, but as you'll see in this book, those people are wrong. <<myths>> is a list of a few thing you *absolutely don't need* to do world-class deep learning.

```asciidoc
[[myths]]
.What you don't need to do deep learnin

In [44]:
def full_text_search(kw_df, limit=10, db_path='fastbook.db'):
    """Run one FTS5 keyword search per row of ``kw_df``.

    For every row, the ``keywords`` string is stripped of double quotes and split
    on whitespace; each resulting term (minus surrounding commas) is quoted and
    the terms are joined with ``OR`` to form the MATCH expression. The search is
    restricted to rows whose ``chapter`` equals ``str(row['chapter'])``.

    Args:
        kw_df: DataFrame with ``chapter`` and ``keywords`` columns.
        limit: Maximum number of chunks returned per row.
        db_path: SQLite database file to query. Defaults to ``'fastbook.db'``,
            the path that was previously hard-coded.

    Returns:
        A list with one entry per row of ``kw_df``, in row order. Each entry is a
        list of at most ``limit`` chunk texts in ``ORDER BY rank`` order. Rows
        without any usable keyword yield an empty list.
    """
    # Built once: the SQL text is the same for every row, only the bound values differ.
    query = """
            SELECT text, rank
            FROM fastbook_text
            WHERE fastbook_text MATCH ?
            AND chapter = ?
            ORDER BY rank
            LIMIT ?
    """
    all_results = []
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        for _, row in kw_df.iterrows():
            raw_keywords = row['keywords']
            # A missing CSV cell is read by pandas as NaN (a float), not a string.
            words = raw_keywords.replace('"', '').split() if isinstance(raw_keywords, str) else []
            # Drop tokens that are empty once their commas are removed; an empty
            # match expression is not a valid FTS5 query.
            terms = [term for term in (word.strip(",") for word in words) if term]
            if not terms:
                all_results.append([])
                continue

            matchstr = ' OR '.join(f'"{term}"' for term in terms)
            rows = cur.execute(query, (matchstr, str(row['chapter']), limit)).fetchall()
            texts = [item[0] for item in rows]
            assert len(texts) <= limit
            all_results.append(texts)
    finally:
        # `with sqlite3.connect(...)` would not close the connection, so do it here.
        conn.close()

    assert len(all_results) == len(kw_df)
    return all_results

In [45]:
# Display the chunk size that the rebuild in the next cell will use.
chunk_size

500

In [46]:
# Rebuild the database from scratch before running the search over all questions.
# Dedicated loop names keep the loop from overwriting the notebook-level
# `chapter` and `documents` variables assigned in earlier cells.
delete_db()
for chapter_key, chapter_text in data.items():
    chapter_docs = process_documents(chapter_text, chunk_size=chunk_size)
    assert load_data(chapter_docs, db_path, chapter_key)

In [47]:
# Search for every benchmark question; full_text_search returns one result list
# per row of kw_df, hence 191 entries.
results = full_text_search(kw_df, limit=10)
assert len(results) == 191

In [48]:
# Second chunk (index 1) returned for the first question.
print(results[0][1])

#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
#hide
from fastbook import *
# Your Deep Learning Journey
Hello, and thank you for letting us join you on your deep learning journey, however far along that you may be! In this chapter, we will tell you a little bit more about what to expect in this book, introduce the key concepts behind deep learning, and train our first models on different tasks. It doesn't matter if you don't come from a technical or a mathematical background (though it's okay if you do too!); we wrote this book to make deep learning accessible to as many people as possible.
## Deep Learning Is for Everyone
A lot of people assume that you need all kinds of hard-to-find stuff to get great results with deep learning, but as you'll see in this book, those people are wrong. <<myths>> is a list of a few thing you *absolutely don't need* to do world-class deep learning.

```asciidoc
[[myths]]
.What you don't need to do deep learnin