# Topic Extraction Pipeline

## Importing Libraries

In [1]:
from operator import itemgetter
import fitz
import pandas as pd 
import re

## Loading Document PDF (DBC)

In [78]:
doc = fitz.open('Abu Dhabi International Building Code (ADIBC) (1).pdf')

In [83]:
page1 = doc[100]
words = page1.get_text("words")

In [84]:
dict1 = page1.get_text("dict")
print(dict1)

{'width': 595.44, 'height': 841.68, 'blocks': [{'number': 0, 'type': 0, 'bbox': (291.7699890136719, 776.6103515625, 306.6400146484375, 792.458251953125), 'lines': [{'spans': [{'size': 11.880000114440918, 'flags': 4, 'font': 'Times New Roman', 'color': 0, 'ascender': 1.0529999732971191, 'descender': -0.2809999883174896, 'text': '88 ', 'origin': (291.7699890136719, 789.1199951171875), 'bbox': (291.7699890136719, 776.6103515625, 306.6400146484375, 792.458251953125)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (291.7699890136719, 776.6103515625, 306.6400146484375, 792.458251953125)}]}, {'number': 1, 'type': 0, 'bbox': (72.0479965209961, 790.6503295898438, 75.01799774169922, 806.4982299804688), 'lines': [{'spans': [{'size': 11.880000114440918, 'flags': 4, 'font': 'Times New Roman', 'color': 0, 'ascender': 1.0529999732971191, 'descender': -0.2809999883174896, 'text': ' ', 'origin': (72.0479965209961, 803.1599731445312), 'bbox': (72.0479965209961, 790.6503295898438, 75.01799774169922, 806.498229

In [85]:
blocks = dict1["blocks"]

In [86]:
blocks

[{'number': 0,
  'type': 0,
  'bbox': (291.7699890136719,
   776.6103515625,
   306.6400146484375,
   792.458251953125),
  'lines': [{'spans': [{'size': 11.880000114440918,
      'flags': 4,
      'font': 'Times New Roman',
      'color': 0,
      'ascender': 1.0529999732971191,
      'descender': -0.2809999883174896,
      'text': '88 ',
      'origin': (291.7699890136719, 789.1199951171875),
      'bbox': (291.7699890136719,
       776.6103515625,
       306.6400146484375,
       792.458251953125)}],
    'wmode': 0,
    'dir': (1.0, 0.0),
    'bbox': (291.7699890136719,
     776.6103515625,
     306.6400146484375,
     792.458251953125)}]},
 {'number': 1,
  'type': 0,
  'bbox': (72.0479965209961,
   790.6503295898438,
   75.01799774169922,
   806.4982299804688),
  'lines': [{'spans': [{'size': 11.880000114440918,
      'flags': 4,
      'font': 'Times New Roman',
      'color': 0,
      'ascender': 1.0529999732971191,
      'descender': -0.2809999883174896,
      'text': ' ',
      '

In [92]:
l5 = []
for page in doc:
    blocks = page.get_text("dict")["blocks"]
    for b in blocks:  # iterate through the text blocks
        if b['type'] == 0:  # block contains text
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if s['size'] == 11.880000114440918 and s['font'] == 'Times New Roman,Bold':
                        l5.append(s['text'])

In [98]:
l5

[' ',
 'يلودلا دوكلا لامعلأ  دوكلاو هايملا تاكبشو ةيحصلا تاديدمتلا',
 'قئارحلل يلودلا',
 'CHAPTER 1 ',
 ' ',
 ' ',
 ' ',
 'PART 1-SCOPE AND APPLICATION',
 ' ',
 'SECTION 101 GENERAL',
 ' ',
 '101.1 Title.',
 '101.2 Scope.',
 '101.2.1 Appendices.',
 '101.3 Intent.',
 ' ',
 ' ',
 ' ',
 '101.4 Referenced codes.',
 ' ',
 '101.4.1 Gas.',
 '101.4.2 Mechanical.',
 '101.4.3 Plumbing.',
 '101.4.4 Property maintenance.',
 '101.4.5 Fire prevention.',
 '101.4.6 Energy.',
 ' ',
 '101.5 Required Signage.',
 'SECTION 102 APPLICABILITY',
 '102.1 General.',
 '102.2 Other laws.',
 '102.3 Application of references.',
 ' ',
 '102.4 Referenced codes and standards.',
 ' ',
 ' ',
 '102.5 Partial invalidity.',
 '102.6 Existing structures.',
 'PART 2-ADMINISTRATION AND ENFORCEMENT',
 'SECTION 103 BUILDING PERMITS DIRECTORATE.',
 '103.1 Creation of enforcement agency.',
 '103.2 Appointment.',
 '103.3 Deputies.',
 'SECTION 104 DUTIES AND POWERS OF BUILDING OFFICIAL. ',
 '104.1 General.',
 '104.2 Applications and

In [97]:
b

['402.5 Mall width.',
 '402.5.1 Minimum width.',
 '402.5.2 Minimum width open mall.']

## Identify paragraphs, headers, and subscripts

In [67]:
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                        'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [68]:
font_counts, styles = fonts(doc)

In [69]:
font_counts

[('11.880000114440918', 71350),
 ('10.079999923706055', 32196),
 ('6.480000019073486', 3815),
 ('7.920000076293945', 1678),
 ('9.0', 113),
 ('24.1200008392334', 103),
 ('11.15999984741211', 70),
 ('13.680000305175781', 60),
 ('2.880000114440918', 39),
 ('14.039999961853027', 25),
 ('3.9600000381469727', 23),
 ('9.359999656677246', 18),
 ('0.7200000286102295', 14),
 ('15.84000015258789', 4),
 ('6.840000152587891', 3),
 ('18.0', 2),
 ('6.119999885559082', 2),
 ('21.959999084472656', 1),
 ('20.15999984741211', 1),
 ('7.559999942779541', 1)]

In [70]:
styles

{'11.880000114440918': {'size': 11.880000114440918, 'font': 'Times New Roman'},
 '14.039999961853027': {'size': 14.039999961853027,
  'font': 'Times New Roman,Bold'},
 '11.15999984741211': {'size': 11.15999984741211, 'font': 'Times New Roman'},
 '18.0': {'size': 18.0, 'font': 'Arial,Bold'},
 '21.959999084472656': {'size': 21.959999084472656, 'font': 'Arial,Bold'},
 '20.15999984741211': {'size': 20.15999984741211, 'font': 'Arial'},
 '7.920000076293945': {'size': 7.920000076293945, 'font': 'Times New Roman'},
 '15.84000015258789': {'size': 15.84000015258789, 'font': 'Arial,Bold'},
 '6.119999885559082': {'size': 6.119999885559082, 'font': 'Arial'},
 '10.079999923706055': {'size': 10.079999923706055, 'font': 'Times New Roman'},
 '13.680000305175781': {'size': 13.680000305175781,
  'font': 'Times New Roman,Bold'},
 '6.480000019073486': {'size': 6.480000019073486, 'font': 'Times New Roman'},
 '2.880000114440918': {'size': 2.880000114440918, 'font': 'Times New Roman'},
 '9.359999656677246': {

## Creating Element Tag Dictionary

In [71]:

def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [72]:
size_tag = font_tags(font_counts, styles)
print(size_tag)

{24.1200008392334: '<h1>', 21.959999084472656: '<h2>', 20.15999984741211: '<h3>', 18.0: '<h4>', 15.84000015258789: '<h5>', 14.039999961853027: '<h6>', 13.680000305175781: '<h7>', 11.880000114440918: '<p>', 11.15999984741211: '<s1>', 10.079999923706055: '<s2>', 9.359999656677246: '<s3>', 9.0: '<s4>', 7.920000076293945: '<s5>', 7.559999942779541: '<s6>', 6.840000152587891: '<s7>', 6.480000019073486: '<s8>', 6.119999885559082: '<s9>', 3.9600000381469727: '<s10>', 2.880000114440918: '<s11>', 0.7200000286102295: '<s12>'}


## Extracting headers and paragraphs

In [87]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if s['size'] == 11.880000114440918 and s['font'] == 'Times New Roman,Bold':
                                if first:
                                    previous_s = s
                                    first = False
                                    block_string = size_tag[s['size']] + s['text']
                                else:
                                    if s['size'] == previous_s['size']:

                                        if block_string and all((c == "|") for c in block_string):
                                            # block_string only contains pipes
                                            block_string = size_tag[s['size']] + s['text']
                                        if block_string == "":
                                            # new block has started, so append size tag
                                            block_string = size_tag[s['size']] + s['text']
                                        else:  # in the same block, so concatenate strings
                                            block_string += " " + s['text']

                                    else:
                                        header_para.append(block_string)
    #                                        print(size_tag[s['size']])
                                        block_string = size_tag[s['size']] + s['text']

                                    previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para

In [88]:
list1 = headers_para(doc, size_tag)

In [89]:
print(list1)



## Text Cleaning

In [49]:
subheadings = []
subheadings = [i for i in list1 if i.startswith('<p>')]



In [52]:
subheadings = [i[3:] for i in subheadings]

In [53]:
subheadings

['يلودلا دوكلا لامعلأ  دوكلاو هايملا تاكبشو ةيحصلا تاديدمتلا|',
 'قئارحلل يلودلا||',
 'CHAPTER 1 |',
 'PART 1-SCOPE AND APPLICATION||| SECTION 101 GENERAL||| 101.1 Title.|||||||',
 '101.2 Scope.||||||||',
 '101.2.1 Appendices.||||',
 '101.3 Intent.||||||||||||||||| 101.4 Referenced codes.|||||',
 '101.4.1 Gas.|||||||||| 101.4.2 Mechanical.||||||||||',
 '101.4.3 Plumbing. 101.4.3 Plumbing.|||||||||||||||||||||||',
 '101.4.4 Property maintenance.|||||||||',
 '101.4.5 Fire prevention.||||||||||||||||||||| 101.4.6 Energy.|||||||',
 '101.5 Required Signage. 101.5 Required Signage.|||||| SECTION 102 APPLICABILITY||',
 '102.1 General.||||||',
 '102.2 Other laws.|||',
 '102.3 Application of references.||||| 102.4 Referenced codes and standards.||||||||||||||||||||| 102.5 Partial invalidity.||||| 102.6 Existing structures.||||||||||||||',
 'PART 2-ADMINISTRATION AND ENFORCEMENT PART 2-ADMINISTRATION AND ENFORCEMENT||| SECTION 103 BUILDING PERMITS DIRECTORATE.|||| 103.1 Creation of enforcement a

In [None]:
r = re.compile("[a-zA-Z]\.([0-9]+(\.[0-9]+)+)\s+\|\s+[a-zA-Z]+(.*)")
subheadings = [i for i in subheadings if r.match(i)]
subheadings = [i[:-1] for i in subheadings]

In [13]:
list_of_i = []
list_of_h = []
for i in subheadings:
    w = i.split('|')
    list_of_i.append(w[0].strip())
    list_of_h.append(w[1].strip())

In [14]:
list_of_i

[]

In [15]:
list_of_h

[]

In [34]:
df = pd.DataFrame(list(zip(list_of_i, list_of_h)),
               columns =['Index', 'Heading'])

In [36]:
df.to_csv('Index_heading.csv') 