In [1]:
import fitz
from anytree import Node, RenderTree
import re

In [2]:
def extract_text(path):
    """Return the concatenated plain text of every page of a document.

    Parameters
    ----------
    path
        Location of the document; passed unchanged to ``fitz.open``.

    Returns
    -------
    str
        The ``get_text()`` result of each page, joined in page order with
        no separator inserted between pages.
    """
    # The context manager closes the document even when a page fails to
    # extract; previously the handle was never released.
    with fitz.open(path) as doc:
        # Join once instead of growing a string with ``+=`` on every page.
        return "".join(page.get_text() for page in doc)

def extract_chapter(i):
    """Turn a raw chapter-heading match into a ``(number, title)`` pair.

    ``i`` is split on newlines. The number is the last space-separated token
    of the second line; the title is the final line.
    """
    lines = i.split('\n')
    heading_line = lines[1]
    number = heading_line.split(' ')[-1]
    title = lines[-1]
    return number, title

def display_tree(root):
    """Print the tree under ``root``, one node per line.

    Each line is the prefix supplied by ``RenderTree`` followed by the
    string form of the node's ``name``.
    """
    for prefix, _fill, item in RenderTree(root):
        print(str(prefix) + str(item.name))


In [3]:
# Relative location of the PDF whose headings are indexed.
path = "data//UnderstandingDeepLearning.pdf"

# Chapter heading: a line "Chapter <digits>" followed by one title line.
pattern_chap = r'\nChapter \d+\n[^\n]*'
# Section heading: "<digits>.<digit>" alone on a line, then a run of non-digit
# characters ending in a newline; the lookahead rejects a next line that
# starts with a digit or a dot.
# NOTE(review): only ONE digit is accepted after the dot, so a heading such as
# "12.10" cannot match -- confirm that this is intended for the target PDF.
pattern_sec = r'\n\d+\.\d\n\D+\n(?![\d\.])'
# Subsection heading: "<digits>.<digits>.<digit>" alone on a line, then one
# title line.
pattern_subsec = r'\n\d+\.\d+\.\d\n[^\n]*'
# Backward-compatible alias for the original, misspelled name.
pattern_seubsec = pattern_subsec
# Subsection-or-section pattern, derived from the two strings above so the
# combined form cannot drift from its parts. The subsection alternative comes
# first, exactly as in the previously hand-written literal.
pattern_sec_subsec = re.compile(pattern_subsec + '|' + pattern_sec)

# Containers for the heading matches.
matches_chap = []
matches_sec = []
matches_subsec = []

In [4]:
# Extract the document text once; every heading pattern is run against it.
text = extract_text(path)

# Each list is rebuilt from the regex results instead of extending the
# existing list and then rebinding it. The old form failed on a second run of
# this cell: the list already held tuples, and ``.split`` was then called on
# them.

# (chapter number, chapter title)
matches_chap = [extract_chapter(raw) for raw in re.findall(pattern_chap, text)]

# (section number, first title line): lines 1 and 2 of each raw match.
matches_sec = [
    (parts[1], parts[2])
    for parts in (raw.split('\n') for raw in re.findall(pattern_sec, text))
]

# (subsection number, title): lines 1 and 2 of each raw match.
matches_subsec = [
    (parts[1], parts[2])
    for parts in (raw.split('\n') for raw in re.findall(pattern_seubsec, text))
]

In [5]:
def hierarchical_index(tail_chars=1000):
    """Build the chapter -> section -> subsection -> text tree.

    Reads the notebook-level names ``matches_chap``, ``matches_sec``,
    ``matches_subsec``, ``text`` and ``pattern_sec_subsec``.

    Parameters
    ----------
    tail_chars : int, optional
        Number of characters of ``text`` attached for the last heading match,
        which has no following match to delimit it. Defaults to 1000, the
        value that used to be hard-coded.

    Returns
    -------
    Node
        The root node, named "Understanding Deep Learning". Heading nodes are
        named with ``(number, title)`` tuples; text nodes are named with the
        text itself.
    """
    root = Node("Understanding Deep Learning")

    # Joining chapters to root
    for chapter in matches_chap:
        Node(chapter, parent=root)

    # Joining sections to chapters
    chapters_by_num = {}
    for chapter_node in root.children:
        chapters_by_num.setdefault(chapter_node.name[0], []).append(chapter_node)
    for section in matches_sec:
        chapter_num = section[0].split('.')[0]
        # Every chapter carrying this number receives the section.
        for chapter_node in chapters_by_num.get(chapter_num, ()):
            Node(section, parent=chapter_node)

    # Joining subsections to sections
    # The section lookup is built once: attaching subsections never changes
    # which nodes are sections.
    sections_by_num = {}
    for chapter_node in root.children:
        for section_node in chapter_node.children:
            sections_by_num.setdefault(section_node.name[0], []).append(section_node)
    for subsection in matches_subsec:
        # "1.2.3" -> "1.2": everything before the last dot.
        parent_num = subsection[0].rpartition('.')[0]
        for section_node in sections_by_num.get(parent_num, ()):
            Node(subsection, parent=section_node)

    # Joining text between sections and subsections
    matches = list(re.finditer(pattern_sec_subsec, text))

    # Leaves that exist before any text is attached, keyed by the first item
    # of their name. Tracking them here means text nodes are never inspected:
    # re-reading ``name[0]`` from every current leaf raised IndexError as soon
    # as an empty text node had been added.
    open_headings = {}
    for leaf in root.leaves:
        open_headings.setdefault(leaf.name[0], []).append(leaf)

    for current, following in zip(matches, matches[1:]):
        heading_num = current.group().split('\n')[1]
        between_text = text[current.end():following.start()].replace('\n', ' ')
        # ``pop``: a heading stops being a leaf once it receives text, so a
        # later match with the same number must not reach it again.
        for heading_node in open_headings.pop(heading_num, ()):
            Node(between_text, parent=heading_node)

    if matches:
        tail_start = matches[-1].start()
        # NOTE(review): unlike the loop above, this slice begins at the
        # heading itself (start(), not end()), keeps its newlines, and is
        # attached to whichever leaf is last in the tree, which may be a text
        # node. Behaviour kept as-is; confirm the intent.
        Node(text[tail_start:tail_start + tail_chars], root.leaves[-1])

    return root

In [6]:
# Build the heading/text tree from the matches and text gathered in the cells above.
htree = hierarchical_index()

In [7]:
# Print the tree, one node per line, each preceded by its rendering prefix.
display_tree(htree)

Understanding Deep Learning
├── ('1', 'Introduction')
│   ├── ('1.1', 'Supervised learning')
│   │   ├── ('1.1.1', 'Regression and classification problems')
│   │   │   └──  Figure 1.2 depicts several regression and classification problems. In each case, there is a meaningful real-world input (a sentence, a sound file, an image, etc.), and this is encoded as a vector of numbers. This vector forms the model input. The model maps the input to an output vector which is then “translated” back to a meaningful real-world prediction. For now, we focus on the inputs and outputs and treat the model as a black box that ingests a vector of numbers and returns another vector of numbers. The model in figure 1.2a predicts the price of a house based on input characteristics such as the square footage and the number of bedrooms. This is a regression problem because the model returns a continuous number (rather than a category assignment). In contrast, the model in 1.2b takes the chemical structure of 