In [1]:
import fitz
from anytree import Node, RenderTree
import re
import sqlite3
import nltk
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from gensim.utils import simple_preprocess

In [2]:
def extract_text(path):
    doc = fitz.open(path)
    text = ""
    for i in range(len(doc)):
        page = doc.load_page(i)
        text += page.get_text()
    return text

def extract_chapter(i):
    x = i.split('\n')
    return x[1].split(' ')[-1], x[-1]

def display_tree(root):
    for pre, _, node in RenderTree(root):
        print("%s%s" % (pre, node.name))


In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

In [4]:
path = "data//UnderstandingDeepLearning.pdf"

pattern_chap = r'\nChapter \d+\n[^\n]*'
pattern_sec = r'\n\d+\.\d\n\D+\n(?![\d\.])'
pattern_seubsec = r'\n\d+\.\d+\.\d\n[^\n]*'
pattern_sec_subsec = re.compile(r'\n\d+\.\d+\.\d\n[^\n]*|\n\d+\.\d\n\D+\n(?![\d\.])')

matches_chap = []
matches_sec = []
matches_subsec = []

In [5]:
text = extract_text(path)

matches_chap.extend(re.findall(pattern_chap, text))
matches_chap = [extract_chapter(i) for i in matches_chap]

matches_sec.extend(re.findall(pattern_sec, text))
matches_sec = [(i.split('\n')[1],i.split('\n')[2]) for i in matches_sec]

matches_subsec.extend(re.findall(pattern_seubsec, text))
matches_subsec = [(i.split('\n')[1],i.split('\n')[2]) for i in matches_subsec]

In [6]:
def hierarchical_index():

    root = Node("Understanding Deep Learning")

    # Joining chapters to root
    for i in matches_chap:
        Node(i[1], parent=root, identifier=i[0], text=None)

    # Joining sections to chapters
    for i in matches_sec:
        for j in [(chap.identifier, chap) for chap in root.children]:
            if i[0].split('.')[0] == j[0]:
                Node(i[1], parent=j[1], identifier=i[0], text=None)

    # Joining subsections to sections
    for i in matches_subsec:
        all_sections = [i.children for i in root.children]
        all_sections = [node for sublist in all_sections for node in sublist]
        all_sections = [(sec.identifier,sec) for sec in all_sections]

        for j in all_sections:
            if '.'.join(i[0].split('.')[:-1]) == j[0]:
                Node(i[1], parent=j[1], identifier=i[0], text=None)

    # Adding text between sections and subsections to subsections node
    matches = list(re.finditer(pattern_sec_subsec, text))

    for i in range(len(matches) - 1):
        start = matches[i].end()
        end = matches[i + 1].start()
        subsection_num = matches[i].group().split('\n')[1]
        between_text = text[start:end]
        between_text = ' '.join(between_text.split('\n'))

        leaves = [(l.identifier,l) for l in root.leaves]
        for j in leaves:
            if subsection_num == j[0]:
                j[1].text = between_text
    if matches:
        between_text = ' '.join(text[matches[-1].start():matches[-1].start()+1000].split('\n'))
        root.leaves[-1].text = between_text
        
    return root

In [7]:
htree = hierarchical_index()

In [None]:
display_tree(htree)

In [11]:
# Create SQLite database and table for storing the hierarchical index
conn = sqlite3.connect('db//UnderstandingDeepLearning.db')
c = conn.cursor()

c.execute('''
CREATE TABLE IF NOT EXISTS understanding_deep_learning (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    parent_id INTEGER,
    identifier TEXT,
    title TEXT,
    text TEXT,
    FOREIGN KEY (parent_id) REFERENCES understanding_deep_learning (id)
)
''')

# Recursively insert nodes into the database
def tree_to_db(node, parent_id=None):
    identifier = getattr(node, 'identifier', None)
    title = node.name
    text = getattr(node, 'text', None)
    c.execute('INSERT INTO understanding_deep_learning (parent_id, identifier, title, text) VALUES (?, ?, ?, ?)', 
              (parent_id, identifier, title, text))
    node_id = c.lastrowid
    for child in node.children:
        tree_to_db(child, node_id)

tree_to_db(htree)

conn.commit()
conn.close()

In [9]:
def expand_query(query):
    words = simple_preprocess(query)
    expanded_query = set(words)
    stemmer = PorterStemmer()

    for word in words:
        # Add synonyms
        synonyms = wordnet.synsets(word)
        for syn in synonyms:
            for lemma in syn.lemmas():
                expanded_query.add(lemma.name())
        
        # Add stemmed words
        expanded_query.add(stemmer.stem(word))

    return list(expanded_query)

In [10]:
query = "deep learning"
expanded_query = expand_query(query)
print(expanded_query)

['deep', 'get_a_line', 'cryptic', 'mystifying', 'acquisition', 'cryptical', 'acquire', 'rich', 'ascertain', 'learn', 'hear', 'memorise', 'teach', 'bass', 'encyclopaedism', 'scholarship', 'larn', 'trench', 'get_wind', 'mysterious', 'check', 'encyclopedism', 'find_out', 'abstruse', 'study', 'see', 'instruct', 'con', 'get_word', 'inscrutable', 'discover', 'thick', 'late', 'erudition', 'watch', 'learning', 'eruditeness', 'pick_up', 'read', 'recondite', 'memorize', 'take', 'determine', 'learnedness', 'oceanic_abyss', 'deeply']
