# Topic Extraction Pipeline

## Importing Libraries

In [2]:
from operator import itemgetter
import fitz
import pandas as pd 
import re

## Loading Document PDF (DBC)

In [3]:
# Open the source PDF with PyMuPDF (imported as fitz); the path is relative
# to the current working directory.
doc = fitz.open('Dubai Building Code_English_2021 Edition_compressed.pdf')

## Identify paragraphs, headers, and subscripts

In [4]:
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    Every text span whose ``color`` is not 20 is recorded in ``styles`` and
    counted once under its identifier.  Spans with ``color == 20`` are skipped
    completely (neither recorded nor counted).

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: identifiers sorted by usage count (most used first), and the style
        information stored for each identifier.  The identifier is the span
        size as a string, or "size_flags_font_color" when granularity is on.
    :raises ValueError: if not a single span was counted
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict", flags=20)["blocks"]
        for block in blocks:  # iterate through the text blocks
            if block['type'] != 0:  # only type-0 blocks contain text
                continue
            for line in block["lines"]:  # iterate through the text lines
                for span in line["spans"]:  # iterate through the text spans
                    if span['color'] == 20:
                        # Filtered span: skip it *before* counting, otherwise it
                        # would be counted under the previous span's identifier
                        # (or fail if no identifier has been seen yet).
                        continue

                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        styles[identifier] = {'size': span['size'], 'flags': span['flags'],
                                              'font': span['font'], 'color': span['color']}
                    else:
                        identifier = "{0}".format(span['size'])
                        # Without granularity the stored font is simply the one
                        # of the most recently seen span of this size.
                        styles[identifier] = {'size': span['size'], 'font': span['font']}

                    # count the fonts usage
                    font_counts[identifier] = font_counts.get(identifier, 0) + 1

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [5]:
# Collect per-identifier span counts and style info for the whole document
# (granularity is left at its default of False, so identifiers are font sizes).
font_counts, styles = fonts(doc)

In [6]:
font_counts

[('10.5', 28686),
 ('8.5', 12054),
 ('8.0', 4586),
 ('12.0', 919),
 ('4.95550012588501', 284),
 ('6.121500015258789', 259),
 ('13.0', 254),
 ('7.800000190734863', 243),
 ('9.0', 210),
 ('9.779999732971191', 206),
 ('18.0', 203),
 ('9.779500007629395', 161),
 ('4.07480001449585', 83),
 ('8.884200096130371', 76),
 ('8.757599830627441', 70),
 ('7.0', 68),
 ('8.639100074768066', 63),
 ('6.826000213623047', 62),
 ('6.061299800872803', 59),
 ('7.795300006866455', 57),
 ('14.0', 54),
 ('6.014599800109863', 44),
 ('6.0808000564575195', 44),
 ('6.879700183868408', 42),
 ('7.679200172424316', 42),
 ('5.807499885559082', 39),
 ('8.936400413513184', 39),
 ('8.285900115966797', 36),
 ('7.160900115966797', 36),
 ('8.96679973602295', 33),
 ('7.124899864196777', 30),
 ('4.405200004577637', 29),
 ('9.820734024047852', 28),
 ('6.417399883270264', 28),
 ('9.325900077819824', 27),
 ('9.291000366210938', 25),
 ('5.5', 25),
 ('9.335399627685547', 24),
 ('8.788299560546875', 24),
 ('9.434599876403809', 23),


In [7]:
styles

{'48.0': {'size': 48.0, 'font': 'Dubai-Bold'},
 '12.0': {'size': 12.0, 'font': 'Dubai-Medium'},
 '10.5': {'size': 10.5, 'font': 'Dubai-Regular'},
 '18.0': {'size': 18.0, 'font': 'Dubai-Regular'},
 '13.0': {'size': 13.0, 'font': 'Dubai-Regular'},
 '14.0': {'size': 14.0, 'font': 'Wingdings2'},
 '36.0': {'size': 36.0, 'font': 'Dubai-Regular'},
 '8.5': {'size': 8.5, 'font': 'Dubai-Regular'},
 '8.0': {'size': 8.0, 'font': 'Dubai-Regular'},
 '7.795300006866455': {'size': 7.795300006866455,
  'font': 'GothamOffice-Regular'},
 '7.800000190734863': {'size': 7.800000190734863,
  'font': 'GothamOffice-Regular'},
 '4.95550012588501': {'size': 4.95550012588501, 'font': 'Dubai-Medium'},
 '6.854100227355957': {'size': 6.854100227355957,
  'font': 'GothamOffice-Regular'},
 '7.75439977645874': {'size': 7.75439977645874,
  'font': 'GothamOffice-Regular'},
 '9.722299575805664': {'size': 9.722299575805664,
  'font': 'GothamOffice-Regular'},
 '9.437000274658203': {'size': 9.437000274658203,
  'font': 'Goth

## Creating Element Tag Dictionary

In [8]:

def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The size of the most used identifier (``font_counts[0]``) is treated as the
    paragraph size and tagged ``<p>``.  Larger sizes become ``<h1>``, ``<h2>``,
    ... numbered from the largest size downwards; smaller sizes become
    ``<s1>``, ``<s2>``, ... numbered from the size just below the paragraph
    size downwards.

    :param font_counts: (identifier, count) for all fonts occurring in document,
        most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    def _size_of(identifier):
        # Prefer the recorded style: granular identifiers look like
        # "size_flags_font_color" and cannot be parsed with float().
        style = styles.get(identifier)
        return style['size'] if style is not None else float(identifier)

    p_size = styles[font_counts[0][0]]['size']  # size of the most used font (paragraph)

    # Unique sizes, high to low, so each tag gets the right running number even
    # when several identifiers share one size.
    font_sizes = sorted({_size_of(identifier) for identifier, _ in font_counts},
                        reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [9]:
# Build the font-size -> element-tag mapping from the font statistics and
# print it for inspection.
size_tag = font_tags(font_counts, styles)
print(size_tag)

{48.0: '<h1>', 40.0: '<h2>', 36.0: '<h3>', 18.0: '<h4>', 14.777600288391113: '<h5>', 14.0: '<h6>', 13.894518852233887: '<h7>', 13.894499778747559: '<h8>', 13.894481658935547: '<h9>', 13.240099906921387: '<h10>', 13.0: '<h11>', 12.514846801757812: '<h12>', 12.225199699401855: '<h13>', 12.0: '<h14>', 11.938801765441895: '<h15>', 11.169095993041992: '<h16>', 11.16909122467041: '<h17>', 11.032400131225586: '<h18>', 10.994600296020508: '<h19>', 10.991999626159668: '<h20>', 10.979999542236328: '<h21>', 10.799099922180176: '<h22>', 10.642600059509277: '<h23>', 10.5615873336792: '<h24>', 10.561562538146973: '<h25>', 10.549200057983398: '<h26>', 10.5: '<p>', 10.182958602905273: '<s1>', 10.182930946350098: '<s2>', 10.139100074768066: '<s3>', 10.01729965209961: '<s4>', 10.0: '<s5>', 9.967100143432617: '<s6>', 9.921299934387207: '<s7>', 9.89109992980957: '<s8>', 9.834295272827148: '<s9>', 9.834272384643555: '<s10>', 9.834266662597656: '<s11>', 9.834251403808594: '<s12>', 9.83423137664795: '<s13>',

## Extracting headers and paragraphs

In [10]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Each returned string starts with the tag of its font size (when it contains
    any text) and has a ``|`` appended at the end of every text line of the
    block.  A string is emitted whenever the font size changes between two
    consecutive accepted spans and once more at the end of every text block,
    so empty or pipes-only strings can occur.

    Spans are ignored when their text is only whitespace, when their size has
    no entry in ``size_tag``, or when their ``color`` equals 3488054.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with prepended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # True until the first accepted span has been seen
    previous_s = {}  # previous accepted span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:  # iterate through the text blocks
            if block['type'] != 0:  # only type-0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text found in block
            for line in block["lines"]:  # iterate through the text lines
                for span in line["spans"]:  # iterate through the text spans
                    if not span['text'].strip():  # whitespace-only span
                        continue
                    if span['size'] not in size_tag or span['color'] == 3488054:
                        continue

                    tagged = size_tag[span['size']] + span['text']
                    if first:
                        first = False
                        block_string = tagged
                    elif span['size'] != previous_s['size']:
                        # font size changed: flush what we have, start anew
                        header_para.append(block_string)
                        block_string = tagged
                    elif all(c == "|" for c in block_string):
                        # Nothing but line markers so far (this also covers the
                        # empty string of a fresh block): start with the tag.
                        # These cases are mutually exclusive with the
                        # concatenation below, so the text is added only once.
                        block_string = tagged
                    else:  # in the same block, so concatenate strings
                        block_string += " " + span['text']

                    previous_s = span

                # end of a text line, indicated with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para

In [11]:
# Extract the tagged text strings from the document ('|' marks the end of
# each text line inside a string).
list1 = headers_para(doc, size_tag)

## Text Cleaning

In [12]:
# Keep only the paragraph-tagged entries and drop their leading '<p>' tag.
subheadings = [i[3:] for i in list1 if i.startswith('<p>')]

# Entries shaped like "A.5.2.1 | Heading text...": a letter, a dotted number
# with at least two numeric parts, a pipe surrounded by whitespace, then text.
# Raw string so that \. \s and \| reach the regex engine without triggering
# Python's invalid-escape-sequence warning; the pattern itself is unchanged.
r = re.compile(r"[a-zA-Z]\.([0-9]+(\.[0-9]+)+)\s+\|\s+[a-zA-Z]+(.*)")

# match() anchors at the start of the entry; the final character is cut off
# (presumably the trailing '|' that headers_para appends - verify).
subheadings = [i[:-1] for i in subheadings if r.match(i)]

In [13]:
# Split every "index | heading | ..." entry on the pipe: the first field is
# the section index, the second field the heading text (both stripped).
list_of_i = [entry.split('|')[0].strip() for entry in subheadings]
list_of_h = [entry.split('|')[1].strip() for entry in subheadings]

In [14]:
list_of_i

['A.5.2.1',
 'A.5.2.2',
 'A.5.2.3',
 'A.5.2.4',
 'B.4.2.1',
 'B.4.2.2',
 'B.4.2.3',
 'B.4.2.4',
 'B.4.2.4.1',
 'B.4.2.4.2',
 'B.4.2.5',
 'B.4.2.5.1',
 'B.4.2.5.2',
 'B.4.2.6',
 'B.4.2.5.3',
 'B.4.3.1',
 'B.4.3.2',
 'B.4.3.3',
 'B.4.3.4',
 'B.6.2.1',
 'B.6.2.2',
 'B.6.2.3',
 'B.6.4.1',
 'B.6.4.1.1',
 'B.6.4.1.2',
 'B.6.4.1.3',
 'B.6.4.1.4',
 'B.6.4.1.5',
 'B.6.4.1.6',
 'B.6.4.1.7',
 'B.6.4.1.8',
 'B.6.4.2',
 'B.6.4.3',
 'B.6.5.1',
 'B.6.5.2',
 'B.6.5.2.1',
 'B.6.5.2.2',
 'B.6.5.2.3',
 'B.6.5.2.4',
 'B.6.6.1',
 'B.6.6.2',
 'B.6.6.3',
 'B.6.6.4',
 'B.7.2.1',
 'B.7.2.2',
 'B.7.2.3',
 'B.7.2.3.1',
 'B.7.2.3.2',
 'B.7.2.3.3',
 'B.7.2.4',
 'B.7.2.5',
 'B.7.2.6',
 'B.7.2.6.1',
 'B.7.2.6.2',
 'B.7.2.6.3',
 'B.7.2.7',
 'B.7.3.1',
 'B.7.3.1.1',
 'B.7.3.1.2',
 'B.7.3.1.3',
 'B.7.3.1.4',
 'B.7.3.2',
 'B.7.3.3',
 'B.7.3.4',
 'B.8.1.1',
 'B.8.1.2',
 'B.8.1.3',
 'B.8.1.3.1',
 'B.8.1.3.2',
 'B.8.1.3.3',
 'B.8.1.4',
 'B.8.1.4.1',
 'B.8.1.4.2',
 'B.8.1.5',
 'B.8.1.6',
 'B.8.2.1',
 'B.8.2.2',
 'B.8.2.3',


In [15]:
list_of_h

['Gross floor area (GFA)',
 'Built-up area',
 'Gross area and net area',
 'Building height',
 'Gate level',
 'Building height',
 'Building setbacks',
 'Building areas',
 'Gross area (GA)',
 'Net area (NA)',
 'Balconies, building projections and terraces',
 'Limitations',
 'Guardrails for fall protection',
 'Plot coverage',
 'Access doors',
 'Ground floor',
 'Basements',
 'Podium',
 'Roof',
 'Minimum clear widths',
 'Fire resistance rating',
 'Kiosks in mall pedestrian ways',
 'Stairways',
 'General',
 'Exit staircase construction',
 'Stairway width',
 'Stair risers and treads',
 'Landings',
 'Regular use and external stairs',
 'Handrails',
 'Guardrails',
 'Ramps',
 'Elevators',
 'Doors',
 'Windows and daylighting',
 'Minimum area of windows',
 'Daylighting',
 'Access to views',
 'Safety of windows',
 'Convenience openings and communicating spaces',
 'Atria',
 'Courtyards',
 'Shafts',
 'General requirements',
 'Vehicular ramps',
 'Parking floors and structures',
 'General',
 'Open versu

In [16]:
# Pair each section index with its heading: one DataFrame row per pair.
df = pd.DataFrame(
    data=[(index, heading) for index, heading in zip(list_of_i, list_of_h)],
    columns=['Index', 'Heading'],
)

In [19]:
df.head()

Unnamed: 0,Index,Heading
0,A.5.2.1,Gross floor area (GFA)
1,A.5.2.2,Built-up area
2,A.5.2.3,Gross area and net area
3,A.5.2.4,Building height
4,B.4.2.1,Gate level


In [20]:
# Sort the rows by the 'Heading' column (in place).
df.sort_values("Heading", inplace = True)

# keep=False drops EVERY row whose heading occurs more than once, so no copy
# of a repeated heading survives; only headings that are unique remain.
df.drop_duplicates(subset ="Heading",
                     keep = False, inplace = True)

In [23]:
# Write the index/heading table to a CSV file in the working directory; the
# DataFrame index is written as well because index=False is not passed.
df.to_csv('Index_heading.csv') 

In [24]:
df.head()

Unnamed: 0,Index,Heading
107,B.9.4.7,Ablution spaces
288,D.9.8.2,Acceleration and jerk
141,C.5.4.3,Access control barriers
14,B.4.2.5.3,Access doors
623,H.6.3.2,Access to drainage systems
