# Text Extraction from PDF using PyMuPDF

In [185]:
import fitz
import pandas as pd 
# Open the source PDF; the path is relative to the notebook's working directory.
doc = fitz.open('Dubai Building Code_English_2021 Edition_compressed.pdf')
# Pages are 0-indexed, so doc[3] is the fourth page of the file.
page1 = doc[3]
# "words" mode yields one tuple per word; the sample shown by the next cell has
# four bounding-box floats, the word string, then three integer indices.
words = page1.get_text("words")

In [186]:
words[9]

(107.722900390625,
 78.3617172241211,
 196.03089904785156,
 108.74571228027344,
 'Introduction',
 2,
 1,
 0)

## Extracting Coordinates of the first object

In [187]:
def make_text(words):
    """Rebuild line-oriented text from PyMuPDF word tuples.

    :param words: word tuples as returned by ``page.get_text("words")``; only
        index 0 (left x), index 3 (bottom y) and index 4 (the word) are read
    :type words: list
    :rtype: str
    :return: one text line per distinct (rounded) bottom coordinate, ordered by
        ascending y; the words of each line are ordered by ascending x and
        joined by single spaces; lines are separated by a newline character.
        An empty input gives an empty string.
    """
    line_dict = {}  # rounded bottom y -> words on that line, left to right

    # sorted() rather than list.sort() so the caller's list is not reordered.
    for w in sorted(words, key=lambda w: w[0]):
        # Round to one decimal so words whose bottom edges differ only by
        # float noise end up under the same key.
        y1 = round(w[3], 1)
        line_dict.setdefault(y1, []).append(w[4])

    # Keys are unique, so ordering by key alone is the same as sorting the items.
    # The separator must be a real newline; the previous "n" glued lines
    # together with the letter n.
    return "\n".join(" ".join(line_dict[y1]) for y1 in sorted(line_dict))

In [188]:
dict1 = page1.get_text("dict")
print(dict1)

{'width': 841.89, 'height': 595.276, 'blocks': [{'number': 0, 'type': 0, 'bbox': (56.692901611328125, 13.385185241699219, 785.1934204101562, 31.10918426513672), 'lines': [{'spans': [{'size': 10.5, 'flags': 4, 'font': 'Dubai-Regular', 'color': 16777215, 'ascender': 1.128999948501587, 'descender': -0.5590000152587891, 'text': 'Dubai Building Code', 'origin': (56.692901611328125, 25.23968505859375), 'bbox': (56.692901611328125, 13.385185241699219, 141.16539001464844, 31.10918426513672)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (56.692901611328125, 13.385185241699219, 141.16539001464844, 31.10918426513672)}, {'spans': [{'size': 10.5, 'flags': 4, 'font': 'Dubai-Regular', 'color': 16777215, 'ascender': 1.128999948501587, 'descender': -0.5590000152587891, 'text': 'Part A: General', 'origin': (721.6158447265625, 25.23968505859375), 'bbox': (721.6158447265625, 13.385185241699219, 785.1934204101562, 31.10918426513672)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (721.6158447265625, 13.3851852416992

In [189]:
blocks = dict1["blocks"]

In [190]:
print(blocks)

[{'number': 0, 'type': 0, 'bbox': (56.692901611328125, 13.385185241699219, 785.1934204101562, 31.10918426513672), 'lines': [{'spans': [{'size': 10.5, 'flags': 4, 'font': 'Dubai-Regular', 'color': 16777215, 'ascender': 1.128999948501587, 'descender': -0.5590000152587891, 'text': 'Dubai Building Code', 'origin': (56.692901611328125, 25.23968505859375), 'bbox': (56.692901611328125, 13.385185241699219, 141.16539001464844, 31.10918426513672)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (56.692901611328125, 13.385185241699219, 141.16539001464844, 31.10918426513672)}, {'spans': [{'size': 10.5, 'flags': 4, 'font': 'Dubai-Regular', 'color': 16777215, 'ascender': 1.128999948501587, 'descender': -0.5590000152587891, 'text': 'Part A: General', 'origin': (721.6158447265625, 25.23968505859375), 'bbox': (721.6158447265625, 13.385185241699219, 785.1934204101562, 31.10918426513672)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (721.6158447265625, 13.385185241699219, 785.1934204101562, 31.10918426513672)}]}, {

In [191]:
# styles = {}
# font_counts = {}

# for b in blocks:  # iterate through the text blocks
#             if b['type'] == 0:  # block contains text
#                 for l in b["lines"]:  # iterate through the text lines
#                     for s in l["spans"]:  # iterate through the text spans
#                         identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
#                         styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
#                                                   'color': s['color']
                                              
#                         print(font_counts.get(identifier)) # count the fonts usage

In [192]:
all_annots = []

In [193]:
# Collect the text lying underneath every annotation in the document.
# Iterating the document directly visits every page; the previous
# range(0, len(doc)-1) stopped one page early and never looked at the last page.
# NOTE: results are appended to the existing all_annots list, so re-run the
# cell that resets it before re-running this one to avoid duplicates.
for page in doc:
    words = page.get_text("words")

    for annot in page.annots():
        if annot is None:
            continue

        rec = annot.rect

        # keep only the words whose bounding box is contained in the annotation rectangle
        mywords = [w for w in words if fitz.Rect(w[:4]) in rec]

        all_annots.append(make_text(mywords))


In [194]:
from operator import itemgetter

In [309]:
def fonts(doc, granularity=False, excluded_color=3488054):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :param excluded_color: spans whose 'color' equals this value are not counted;
        pass None to count every span. The default, 3488054 (0x353936), is the
        value that used to be hard-coded here.
    :type excluded_color: int or None
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by usage count (most used first), and a
        dict mapping every identifier to its style information
    :raises ValueError: if not a single span was counted
    """
    styles = {}
    font_counts = {}

    for page in doc:
        for b in page.get_text("dict")["blocks"]:  # iterate through the text blocks
            if b['type'] != 0:  # only type-0 blocks contain text
                continue
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if excluded_color is not None and s['color'] == excluded_color:
                        continue

                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                        styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                              'color': s['color']}
                    else:
                        identifier = "{0}".format(s['size'])
                        # several fonts can share one size; the last span seen wins
                        styles[identifier] = {'size': s['size'], 'font': s['font']}

                    font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [312]:
font_counts, styles = fonts(doc)

In [313]:
font_counts

[('10.5', 7050),
 ('8.5', 2467),
 ('8.0', 1569),
 ('13.0', 254),
 ('7.800000190734863', 243),
 ('9.779999732971191', 206),
 ('18.0', 203),
 ('9.0', 198),
 ('9.779500007629395', 161),
 ('4.95550012588501', 145),
 ('4.07480001449585', 83),
 ('8.884200096130371', 76),
 ('8.757599830627441', 70),
 ('8.639100074768066', 63),
 ('7.0', 59),
 ('6.061299800872803', 59),
 ('7.795300006866455', 57),
 ('14.0', 54),
 ('6.014599800109863', 44),
 ('6.0808000564575195', 44),
 ('6.879700183868408', 42),
 ('7.679200172424316', 42),
 ('5.807499885559082', 39),
 ('8.936400413513184', 39),
 ('8.285900115966797', 36),
 ('7.160900115966797', 36),
 ('6.121500015258789', 35),
 ('8.96679973602295', 33),
 ('7.124899864196777', 30),
 ('4.405200004577637', 29),
 ('9.820734024047852', 28),
 ('6.417399883270264', 28),
 ('9.325900077819824', 27),
 ('9.291000366210938', 25),
 ('5.5', 25),
 ('9.335399627685547', 24),
 ('8.788299560546875', 24),
 ('9.434599876403809', 23),
 ('8.235199928283691', 23),
 ('7.09000015258789

In [315]:
styles

{'48.0': {'size': 48.0, 'font': 'Dubai-Bold'},
 '12.0': {'size': 12.0, 'font': 'GothamOffice-Regular'},
 '10.5': {'size': 10.5, 'font': 'Dubai-Bold'},
 '18.0': {'size': 18.0, 'font': 'Dubai-Regular'},
 '13.0': {'size': 13.0, 'font': 'Dubai-Regular'},
 '14.0': {'size': 14.0, 'font': 'Wingdings2'},
 '36.0': {'size': 36.0, 'font': 'Dubai-Regular'},
 '8.5': {'size': 8.5, 'font': 'Dubai-Medium'},
 '8.0': {'size': 8.0, 'font': 'Dubai-Regular'},
 '7.795300006866455': {'size': 7.795300006866455,
  'font': 'GothamOffice-Regular'},
 '7.800000190734863': {'size': 7.800000190734863,
  'font': 'GothamOffice-Regular'},
 '6.854100227355957': {'size': 6.854100227355957,
  'font': 'GothamOffice-Regular'},
 '7.75439977645874': {'size': 7.75439977645874,
  'font': 'GothamOffice-Regular'},
 '9.722299575805664': {'size': 9.722299575805664,
  'font': 'GothamOffice-Regular'},
 '9.437000274658203': {'size': 9.437000274658203,
  'font': 'GothamOffice-Regular'},
 '7.526500225067139': {'size': 7.526500225067139,

In [316]:
p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
print(p_style['size'])

10.5


In [317]:

def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (identifier, count) for all styles occurring in document,
        most used first; every identifier must be a key of ``styles``
    :type font_counts: list
    :param styles: all styles found in the document, each with a 'size' entry
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes: '<p>' for the most used
        style's size, '<h1>', '<h2>', ... for larger sizes (largest first) and
        '<s1>', '<s2>', ... for smaller sizes (largest first)
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # Take the sizes from the style records instead of float(identifier): the
    # identifier is a bare size string only in non-granular mode, while granular
    # identifiers ("size_flags_font_color") cannot be parsed as a float.
    # The set removes sizes shared by several granular styles.
    font_sizes = sorted({styles[identifier]['size'] for identifier, _ in font_counts}, reverse=True)

    # aggregating the tags for each font size, walking from the largest size down
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart the numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [318]:
size_tag = font_tags(font_counts, styles)

In [319]:
print(size_tag)

{48.0: '<h1>', 40.0: '<h2>', 36.0: '<h3>', 18.0: '<h4>', 14.777600288391113: '<h5>', 14.0: '<h6>', 13.894518852233887: '<h7>', 13.894499778747559: '<h8>', 13.894481658935547: '<h9>', 13.240099906921387: '<h10>', 13.0: '<h11>', 12.514846801757812: '<h12>', 12.225199699401855: '<h13>', 12.0: '<h14>', 11.938801765441895: '<h15>', 11.169095993041992: '<h16>', 11.16909122467041: '<h17>', 11.032400131225586: '<h18>', 10.994600296020508: '<h19>', 10.991999626159668: '<h20>', 10.979999542236328: '<h21>', 10.799099922180176: '<h22>', 10.642600059509277: '<h23>', 10.5615873336792: '<h24>', 10.561562538146973: '<h25>', 10.549200057983398: '<h26>', 10.5: '<p>', 10.182958602905273: '<s1>', 10.182930946350098: '<s2>', 10.139100074768066: '<s3>', 10.01729965209961: '<s4>', 10.0: '<s5>', 9.967100143432617: '<s6>', 9.921299934387207: '<s7>', 9.89109992980957: '<s8>', 9.834295272827148: '<s9>', 9.834272384643555: '<s10>', 9.834266662597656: '<s11>', 9.834251403808594: '<s12>', 9.83423137664795: '<s13>',

In [320]:
type(size_tag)

dict

In [321]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with prepended element tags; a "|" is appended after every
        text line, and a new entry is started whenever the span size changes
        or a text block ends. Whitespace-only spans and spans whose size is
        not a key of ``size_tag`` are skipped.
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] != 0:  # only type-0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block

            block_string = ""  # text found in block
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # ignore whitespace-only spans
                        continue
                    if s['size'] not in size_tag:
                        continue

                    tagged_text = size_tag[s['size']] + s['text']

                    if first:
                        previous_s = s
                        first = False
                        block_string = tagged_text
                        continue

                    if s['size'] == previous_s['size']:
                        # True for an empty buffer as well as for one that only
                        # holds line-end pipes: in both cases no text has been
                        # collected for this block yet, so start it with the tag.
                        # These cases used to be two separate `if`s, which made
                        # the pipes-only case fall through to the `else` and
                        # append the same text a second time.
                        if all(c == "|" for c in block_string):
                            block_string = tagged_text
                        else:  # in the same block, so concatenate strings
                            block_string += " " + s['text']
                    else:
                        # size changed: flush what was collected (may be empty)
                        # and start a new entry
                        header_para.append(block_string)
                        block_string = tagged_text

                    previous_s = s

                # end of a text line, indicating with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para

In [322]:
print(headers_para(doc, size_tag))



In [323]:
type(headers_para(doc, size_tag))

list

In [324]:
list1 = headers_para(doc, size_tag)

In [333]:
subheadings = []
subheadings = [i for i in list1 if i.startswith('<p>')]

In [334]:
subheadings

['<p>Dubai Building Code|',
 '<p>ii|',
 '<p>Dubai Building Code|',
 '<p>A 1|',
 '<p>Dubai Building Code| Part A: General|',
 '<p>A 2|',
 '<p>The objective of the Dubai Building Code (DBC) is to unify building design across | Dubai, and to create a building code that is easy to use and clearly mandates the | minimum requirements for:|',
 '<p>a) the health, safety, welfare and convenience of people in and around buildings; |',
 '<p>b) the health, safety, welfare and convenience of people who might be affected by | buildings; |',
 '<p>c) building design to reduce the impact on the surrounding environment; and|',
 '<p>d) the sustainable development of buildings.|',
 '<p>The content of the DBC is based on the following inputs:|',
 '<p>1) existing regulations and technical requirements produced by the various | Authorities and Service Providers;|',
 '<p>2) interviews with Government Authorities, Service Providers, Master Developers, | Consultants and other stakeholders to understand gaps or 

In [335]:
subheadings = [i[3:] for i in subheadings]

In [336]:
subheadings

['Dubai Building Code|',
 'ii|',
 'Dubai Building Code|',
 'A 1|',
 'Dubai Building Code| Part A: General|',
 'A 2|',
 'The objective of the Dubai Building Code (DBC) is to unify building design across | Dubai, and to create a building code that is easy to use and clearly mandates the | minimum requirements for:|',
 'a) the health, safety, welfare and convenience of people in and around buildings; |',
 'b) the health, safety, welfare and convenience of people who might be affected by | buildings; |',
 'c) building design to reduce the impact on the surrounding environment; and|',
 'd) the sustainable development of buildings.|',
 'The content of the DBC is based on the following inputs:|',
 '1) existing regulations and technical requirements produced by the various | Authorities and Service Providers;|',
 '2) interviews with Government Authorities, Service Providers, Master Developers, | Consultants and other stakeholders to understand gaps or inconsistencies | between existing regulat

In [337]:
import re
# Keep only numbered clause headings such as 'A.5.2.1 | Gross floor area (GFA)|':
# a letter, a dot, a dotted number with at least two components, a pipe, then a title.
# Raw string so that \. \s and \| reach the regex engine as written instead of
# being parsed as (invalid) Python string escapes; the pattern itself is unchanged.
r = re.compile(r"[a-zA-Z]\.([0-9]+(\.[0-9]+)+)\s+\|\s+[a-zA-Z]+(.*)")
subheadings = [i for i in subheadings if r.match(i)]
# Drop the trailing line-end pipe; an entry without one is left intact rather
# than losing its last real character.
subheadings = [i[:-1] if i.endswith("|") else i for i in subheadings]

In [339]:
subheadings

['A.5.2.1 | Gross floor area (GFA)',
 'A.5.2.2 | Built-up area ',
 'A.5.2.3 | Gross area and net area',
 'A.5.2.4 | Building height',
 'B.4.2.1 | Gate level',
 'B.4.2.2 | Building height',
 'B.4.2.3 | Building setbacks ',
 'B.4.2.4 | Building areas ',
 'B.4.2.4.1  | Gross area (GA)',
 'B.4.2.4.2  | Net area (NA)',
 'B.4.2.5 | Balconies, building projections and terraces',
 'B.4.2.5.1  | Limitations',
 'B.4.2.5.2  | Guardrails for fall protection',
 'B.4.2.6 | Plot coverage',
 'B.4.2.5.3  | Access doors',
 'B.4.3.1 | Ground floor',
 'B.4.3.2 | Basements',
 'B.4.3.3 | Podium',
 'B.4.3.4 | Roof ',
 'B.6.2.1 | Minimum clear widths',
 'B.6.2.2 | Fire resistance rating',
 'B.6.2.3 | Kiosks in mall pedestrian ways',
 'B.6.4.1 | Stairways',
 'B.6.4.1.1  | General',
 'B.6.4.1.2  | Exit staircase construction',
 'B.6.4.1.3 | Stairway width',
 'B.6.4.1.4  | Stair risers and treads',
 'B.6.4.1.5  | Landings',
 'B.6.4.1.6  | Regular use and external stairs',
 'B.6.4.1.7  | Handrails',
 'B.6.4.1.8  

In [340]:
# l1 = ['F.6.3.1 | Design basis',
#  'F.6.3.2 | Additional design requirements for post-tensioned concrete',
#  'F.6.3.3 | Concrete',
#  'F.6.3.4 | Concrete mixes ']


In [341]:
# l2 = []
# l3 = []

In [342]:
# for i in l1:
#     w = i.split('|')
#     l2.append(w[0].strip())
#     l3.append(w[1].strip())

In [343]:
# l2

In [344]:
# l3

In [345]:
# Split every 'number | title' entry on the pipe character:
# l1 receives the stripped text before the first pipe (the clause number),
# l2 the stripped text between the first and second pipe (the title).
pieces = [entry.split('|') for entry in subheadings]
l1 = [piece[0].strip() for piece in pieces]
l2 = [piece[1].strip() for piece in pieces]