In [1]:
import fitz
from anytree import Node, RenderTree
import re
import sqlite3

In [2]:
def extract_text(path):
    """Return the plain text of every page of the document at ``path``.

    Pages are read in document order and their text is concatenated with no
    separator added between pages.

    Args:
        path: Location of the document, passed straight to ``fitz.open``.

    Returns:
        A single string holding the text of all pages.
    """
    # The context manager closes the document even if a page fails to
    # extract; the original left the handle open.
    with fitz.open(path) as doc:
        # join() builds the result once instead of re-copying it per page.
        return "".join(page.get_text() for page in doc)

def extract_chapter(i):
    """Split a raw chapter match into ``(number, title)``.

    Args:
        i: Newline-separated match text. The second line is expected to end
            with the chapter number (after its last space); the last line is
            taken as the title.

    Returns:
        A ``(number, title)`` tuple of strings.
    """
    lines = i.split('\n')
    # Whatever follows the final space on the second line is the number.
    number = lines[1].rsplit(' ', 1)[-1]
    title = lines[-1]
    return number, title

def display_tree(root):
    """Print the tree under ``root``, one node per line.

    Each line is the branch prefix produced by ``RenderTree`` followed by the
    node's ``name``.
    """
    for prefix, _fill, item in RenderTree(root):
        print(f"{prefix}{item.name}")


In [3]:
# Source PDF, given relative to the working directory.
path = "data//UnderstandingDeepLearning.pdf"

# Heading patterns; each one starts at the newline before the heading.
# "Chapter <n>" on its own line, plus the line that follows it.
pattern_chap = r'\nChapter \d+\n[^\n]*'
# "<n>.<d>" on its own line, then non-digit text ending in a newline that is
# not followed by a digit or a dot.
pattern_sec = r'\n\d+\.\d\n\D+\n(?![\d\.])'
# "<n>.<m>.<d>" on its own line, plus the line that follows it.
pattern_seubsec = r'\n\d+\.\d+\.\d\n[^\n]*'
# Either heading kind; the subsection alternative is tried first.
pattern_sec_subsec = re.compile('|'.join((pattern_seubsec, pattern_sec)))

# Containers for the extracted headings (three separate lists).
matches_chap, matches_sec, matches_subsec = [], [], []

In [4]:
# Read the whole PDF once; every heading list below is derived from this string.
text = extract_text(path)

# Each list is rebuilt from the regex matches on every run. The previous
# extend-then-rebind form left tuples in the list, so running the cell a
# second time passed tuples to extract_chapter()/.split() and raised.

# (chapter number, chapter title)
matches_chap = [extract_chapter(m) for m in re.findall(pattern_chap, text)]

# (section number, first line of the section title)
matches_sec = [
    (parts[1], parts[2])
    for parts in (m.split('\n') for m in re.findall(pattern_sec, text))
]

# (subsection number, subsection title)
matches_subsec = [
    (parts[1], parts[2])
    for parts in (m.split('\n') for m in re.findall(pattern_seubsec, text))
]

In [5]:
def hierarchical_index():
    """Build the chapter -> section -> subsection tree and attach body text.

    Reads the module-level ``matches_chap``, ``matches_sec``,
    ``matches_subsec``, ``pattern_sec_subsec`` and ``text``.

    Every node's ``name`` is its ``(number, title)`` tuple, its ``identifier``
    is the number, and its ``text`` starts as ``None``. A section is attached
    to each chapter whose number equals the section number's first component;
    a subsection is attached to each section whose number equals the
    subsection number without its last component.

    Body text is the span between one section/subsection heading and the
    next, with newlines replaced by spaces. It is stored only on leaf nodes
    whose number equals the heading's number. The final heading has no
    following heading, so it receives at most the next 1000 characters.

    Returns:
        The root ``Node`` named "Understanding Deep Learning".
    """
    # Upper bound on the text kept for the last heading in the document.
    tail_chars = 1000

    def by_identifier(nodes):
        # Map number -> every node carrying it, preserving tree order, so a
        # repeated number still reaches all of its nodes.
        table = {}
        for n in nodes:
            table.setdefault(n.name[0], []).append(n)
        return table

    root = Node("Understanding Deep Learning")

    # Joining chapters to root
    for chap in matches_chap:
        Node(chap, parent=root, identifier=chap[0], text=None)

    # Joining sections to chapters
    chapters = by_identifier(root.children)
    for sec in matches_sec:
        chapter_id = sec[0].split('.')[0]
        for chapter in chapters.get(chapter_id, ()):
            Node(sec, parent=chapter, identifier=sec[0], text=None)

    # Joining subsections to sections. The section lookup is built once:
    # adding subsections never changes which sections exist.
    sections = by_identifier(
        section for chapter in root.children for section in chapter.children
    )
    for sub in matches_subsec:
        section_id = sub[0].rpartition('.')[0]
        for section in sections.get(section_id, ()):
            Node(sub, parent=section, identifier=sub[0], text=None)

    # Adding text between sections and subsections to the leaf nodes.
    # No nodes are created from here on, so the leaf lookup is built once.
    matches = list(re.finditer(pattern_sec_subsec, text))
    leaves = by_identifier(root.leaves)

    def attach(heading, body):
        # The heading number is the line right after the leading newline.
        heading_id = heading.group().split('\n')[1]
        flattened = body.replace('\n', ' ')
        for leaf in leaves.get(heading_id, ()):
            leaf.text = flattened

    for current, following in zip(matches, matches[1:]):
        attach(current, text[current.end():following.start()])

    if matches:
        # Slice from the END of the last heading, as for every other heading
        # (slicing from .start() put the heading itself into the text), and
        # attach by number instead of to whichever leaf happens to be last.
        last = matches[-1]
        attach(last, text[last.end():last.end() + tail_chars])

    return root

In [6]:
# Build the index tree from the module-level heading matches and extracted text.
htree = hierarchical_index()

In [7]:
# Print the tree, one node per line, to eyeball the extracted hierarchy.
display_tree(htree)

Understanding Deep Learning
├── ('1', 'Introduction')
│   ├── ('1.1', 'Supervised learning')
│   │   ├── ('1.1.1', 'Regression and classification problems')
│   │   ├── ('1.1.2', 'Inputs')
│   │   ├── ('1.1.3', 'Machine learning models')
│   │   ├── ('1.1.4', 'Deep neural networks')
│   │   └── ('1.1.5', 'Structured outputs')
│   ├── ('1.2', 'Unsupervised learning')
│   │   ├── ('1.2.1', 'Generative models')
│   │   ├── ('1.2.2', 'Latent variables')
│   │   └── ('1.2.3', 'Connecting supervised and unsupervised learning')
│   ├── ('1.3', 'Reinforcement learning')
│   │   └── ('1.3.1', 'Two examples')
│   ├── ('1.4', 'Ethics')
│   ├── ('1.5', 'Structure of book')
│   ├── ('1.6', 'Other books')
│   └── ('1.7', 'How to read this book')
├── ('2', 'Supervised learning')
│   ├── ('2.1', 'Supervised learning overview')
│   ├── ('2.2', 'Linear regression example')
│   │   ├── ('2.2.1', '1D linear regression model')
│   │   ├── ('2.2.2', 'Loss')
│   │   ├── ('2.2.3', 'Training')
│   │   └── ('2.

In [8]:
# Create SQLite database and table for storing the hierarchical index.
# The database path is relative to the working directory.
# NOTE(review): nothing here creates the `db` directory — presumably it must
# already exist; confirm.
conn = sqlite3.connect('db//UnderstandingDeepLearning.db')
c = conn.cursor()

# One row per tree node; parent_id references the id of another row in the
# same table. IF NOT EXISTS leaves an existing table, and any rows already in
# it, untouched.
# NOTE(review): because rows are only ever inserted, re-running this cell
# looks like it would append a second copy of the tree — confirm.
c.execute('''
CREATE TABLE IF NOT EXISTS understanding_deep_learning (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    parent_id INTEGER,
    identifier TEXT,
    title TEXT,
    text TEXT,
    FOREIGN KEY (parent_id) REFERENCES understanding_deep_learning (id)
)
''')

# Recursively insert nodes into the database
def tree_to_db(node, parent_id=None, cursor=None):
    """Insert ``node`` and all of its descendants into the index table.

    Rows are written depth-first, parent before children, so each child row
    can carry the row id of its parent.

    Args:
        node: Tree node with ``name`` and ``children``; ``identifier`` and
            ``text`` are optional attributes and are stored as NULL when
            missing.
        parent_id: Row id of the parent node, or ``None`` for the top node.
        cursor: Cursor to write through. Defaults to the module-level ``c``.
    """
    if cursor is None:
        cursor = c  # fall back to the notebook's shared cursor

    name = node.name
    # The root is named with a plain string; indexing it with [1] stored a
    # single character as its title. Tuple names are (number, title).
    title = name if isinstance(name, str) else name[1]

    cursor.execute(
        'INSERT INTO understanding_deep_learning (parent_id, identifier, title, text) VALUES (?, ?, ?, ?)',
        (parent_id, getattr(node, 'identifier', None), title, getattr(node, 'text', None)),
    )
    # Capture the new row id before recursing: each child insert overwrites
    # lastrowid.
    node_id = cursor.lastrowid
    for child in node.children:
        tree_to_db(child, node_id, cursor)

# Insert the whole tree and persist it. The connection is closed even when an
# insert fails, so an error does not leave the database file open.
try:
    tree_to_db(htree)
    conn.commit()
finally:
    conn.close()