# PDF Text Extraction

In [1]:
import pandas as pd
import json
import re
from pandas.io.json import json_normalize
from os import listdir

## Create JSON File with raw text content

### Convert PDF to JSON using Cloud Vision API

In [2]:
%%bash

# Run Cloud Vision text detection on the PDF stored in the bucket. The command prints an
# operation name; the JSON result is expected under the destination prefix given last.
gcloud ml vision detect-text-pdf "gs://pdf-text-extraction/1/40df331_0.pdf" gs://pdf-text-extraction/1/
    
    

{
  "name": "projects/springboard-aic/operations/3217b91bdcb794a1"
}


### Copy JSON to local drive

In [3]:
# Check if documents exist
!gsutil ls -r gs://pdf-text-extraction/1/**

gs://pdf-text-extraction/1/
gs://pdf-text-extraction/1/40df331_0.pdf
gs://pdf-text-extraction/1/output-1-to-14.json


In [4]:
# Copy every object under the bucket prefix (the source PDF and the OCR JSON) into ../files
!gsutil cp -r gs://pdf-text-extraction/1/* ../files

Copying gs://pdf-text-extraction/1/40df331_0.pdf...
Copying gs://pdf-text-extraction/1/output-1-to-14.json...
/ [2 files][  3.8 MiB/  3.8 MiB]                                                
Operation completed over 2 objects/3.8 MiB.                                      


### Load JSON Object into memory

In [5]:
# List the download directory once and split the entries by extension.
files = listdir('../files')
json_files = sorted(name for name in files if name.endswith(".json"))
# Only the first PDF (in listdir order) is kept.
pdf_files = [name for name in files if name.endswith(".pdf")][0]

In [6]:
# Parse the alphabetically first OCR result file. The context manager closes the
# file handle deterministically, and the explicit encoding keeps non-ASCII
# characters intact regardless of the platform default.
# NOTE(review): only json_files[0] is read; any further JSON files in ../files are ignored.
with open('../files/' + json_files[0], encoding='utf-8') as json_file:
    pdf_content_json = json.load(json_file)

## Processing JSON

### Dimensions of pages in document

In [7]:
# How many pages? The cells below index 'responses' by page number,
# i.e. every entry of 'responses' is treated as one page.
page_count = len(pdf_content_json['responses'])

In [8]:
page_count

14

In [9]:
# Collect the height and width reported for every page.
data = []

for page in range(page_count):
    page_annotations = pdf_content_json['responses'][page]['fullTextAnnotation']['pages']
    # json_normalize yields one row per annotated page entry; only the first row is used.
    height, width = json_normalize(page_annotations)[['height', 'width']].values.tolist()[0]
    data.append({'height': height, 'width': width, 'page': page})

page_dimensions_df = pd.DataFrame(data)
page_dimensions_df

Unnamed: 0,height,page,width
0,792,0,612
1,792,1,612
2,792,2,612
3,792,3,612
4,792,4,612
5,792,5,612
6,792,6,612
7,792,7,612
8,792,8,612
9,792,9,612


## Explore pages for text elements

Data Structure:

Page

 |-- Block
 
 |--- Paragraph
 
 |---- Word
 
 |----- Symbol
 

### How many blocks per page?

In [10]:
# Number of blocks found on each page.
data = [
    {
        'block_count': len(pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks']),
        'page': page,
    }
    for page in range(page_count)
]

block_counts_df = pd.DataFrame(data)
block_counts_df

Unnamed: 0,block_count,page
0,8,0
1,13,1
2,12,2
3,36,3
4,25,4
5,23,5
6,17,6
7,16,7
8,10,8
9,13,9


### How many paragraphs per block per page?

In [11]:
# Number of paragraphs in every block of every page.
# Note: the 'block_count' column of this frame holds the block's index on its page
# (later cells filter on it under that name), not a count.
data = []

for page in range(page_count):
    blocks_on_page = block_counts_df.query('page == ' + str(page))['block_count'].values[0]
    for block in range(blocks_on_page):
        paragraphs = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]['paragraphs']
        data.append({'paragraph_count': len(paragraphs), 'block_count': block, 'page': page})

paragraph_counts_df = pd.DataFrame(data)
paragraph_counts_df

Unnamed: 0,block_count,page,paragraph_count
0,0,0,1
1,1,0,1
2,2,0,1
3,3,0,1
4,4,0,1
5,5,0,1
6,6,0,1
7,7,0,1
8,0,1,1
9,1,1,1


### How many words per paragraph per block per page?

In [12]:
# Number of words in every paragraph of every block of every page.
data = []

for page in range(page_count):
    for block in range(block_counts_df.loc[page, 'block_count']):
        # Look up how many paragraphs this block has (block index is stored as 'block_count').
        block_filter = 'page == ' + str(page) + ' & block_count == ' + str(block)
        paragraph_total = paragraph_counts_df.query(block_filter)['paragraph_count'].values[0]
        for paragraph in range(paragraph_total):
            words = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]['paragraphs'][paragraph]['words']
            data.append({
                'word_count': len(words),
                'paragraph': paragraph,
                'block': block,
                'page': page,
            })

word_counts_df = pd.DataFrame(data)
word_counts_df

Unnamed: 0,block,page,paragraph,word_count
0,0,0,0,1
1,1,0,0,2
2,2,0,0,3
3,3,0,0,7
4,4,0,0,10
5,5,0,0,5
6,6,0,0,3
7,7,0,0,11
8,0,1,0,1
9,1,1,0,76


## Properties of text blocks

### Get confidence and coordinates of text block elements

In [13]:
# Worked example: second block (index 1) on the first page (index 0).
page, block = 0, 1
example_block = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]
# One row per bounding-box vertex, with the block's confidence repeated on each row.
path_normalized = json_normalize(
    example_block,
    ['boundingBox', 'normalizedVertices'],
    meta='confidence',
)
path_normalized

Unnamed: 0,x,y,confidence
0,0.254902,0.054293,0.56
1,0.392157,0.066919,0.56
2,0.382353,0.130051,0.56
3,0.245098,0.117424,0.56


In [14]:
# Example for one block on one page (flattened):
# Reuses `page` and `block` as set in the previous cell.

# One row per bounding-box vertex (columns x and y) plus the block's confidence on every row
path = json_normalize(pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block], ['boundingBox', 'normalizedVertices'], meta = 'confidence')

# Collapse the repeated confidence to a scalar; .item() raises unless exactly one distinct value is left
confidence = path['confidence'].drop_duplicates().item()
data = path.reset_index(drop = True).drop('confidence', axis = 1)

# Number the vertices from 1, reshape to long format and build labels such as 'x1-block' / 'y1-block'
data['position'] = data.index+1
data = data.melt(id_vars=['position'])
data['position'] = data['variable'] + data['position'].map(str) + '-block'

data['block'] = block

# Pivot back to one wide row, indexed by the block number, and append the confidence
data = data.pivot(index = 'block', columns = 'position', values = 'value')
data['confidence-block'] = confidence

data

position,x1-block,x2-block,x3-block,x4-block,y1-block,y2-block,y3-block,y4-block,confidence-block
block,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.254902,0.392157,0.382353,0.245098,0.054293,0.066919,0.130051,0.117424,0.56


In [15]:
# Iterate over all blocks in the document and record the bounding-box coordinates
# and the confidence of each block.
#
# The vertices are read straight from the JSON and every value is stored under an
# explicit key. Pairing pivoted values with a fixed column list by position would
# silently put values into the wrong columns whenever a coordinate key is absent
# from every vertex of a block; building the record by key cannot misalign.
block_records = []

for page in range(page_count):
    page_blocks = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks']
    for block in range(block_counts_df.loc[page, 'block_count']):
        block_json = page_blocks[block]
        vertices = block_json['boundingBox']['normalizedVertices']

        record = {}
        # Vertices are numbered from 1 in the order they appear in the JSON, giving
        # the keys 'x1-block', 'x2-block', ..., 'y1-block', ...
        # A coordinate that is absent from a vertex is stored as None (NaN in the frame).
        for axis in ('x', 'y'):
            for position, vertex in enumerate(vertices, start=1):
                record[axis + str(position) + '-block'] = vertex.get(axis)
        record['confidence'] = block_json['confidence']
        record['block'] = block
        record['page'] = page
        block_records.append(record)

blocks_on_pages_df = pd.DataFrame(block_records)
blocks_on_pages_df

Unnamed: 0,block,confidence,page,x1-block,x2-block,x3-block,x4-block,y1-block,y2-block,y3-block,y4-block
0,0,0.96,0,0.338235,0.343137,0.343137,0.338235,0.079545,0.079545,0.084596,0.084596
1,1,0.56,0,0.254902,0.392157,0.382353,0.245098,0.054293,0.066919,0.130051,0.117424
2,2,0.99,0,0.361111,0.506536,0.506536,0.361111,0.090909,0.089646,0.099747,0.101010
3,3,0.99,0,0.235294,0.771242,0.771242,0.235294,0.224747,0.224747,0.273990,0.273990
4,4,0.96,0,0.227124,0.834967,0.834967,0.227124,0.304293,0.304293,0.571970,0.571970
5,5,0.99,0,0.235294,0.606209,0.606209,0.235294,0.910354,0.910354,0.930556,0.930556
6,6,0.99,0,0.235294,0.338235,0.338235,0.235294,0.943182,0.943182,0.953283,0.953283
7,7,0.99,0,0.357843,0.696078,0.696078,0.357843,0.943182,0.943182,0.953283,0.953283
8,0,0.92,1,0.163399,0.173203,0.173203,0.163399,0.079545,0.079545,0.087121,0.087121
9,1,0.99,1,0.233660,0.890523,0.890523,0.233660,0.098485,0.098485,0.188131,0.188131


### Extract Words from paragraphs

In [16]:
# (pattern, replacement) pairs, applied in this order, that re-attach punctuation
# left surrounded by spaces after the OCR word tokens are joined with ' '.
_PUNCTUATION_FIXES = [
    (r" \. ", ". "),
    (r" \.", "."),
    (r" , ", ", "),
    (r" ! ", "! "),
    (r" \? ", "? "),
    (r" ' ", "`"),  # NOTE(review): replaces a spaced apostrophe with a backtick - confirm this is intended
    (r" - ", "-"),
    (r" \( ", " ("),
    (r" \) ", ") "),
    # Make sure a period is followed by a space, but leave periods that are directly
    # followed by a digit alone so that numbers such as "4.0" are not split into "4. 0".
    (r"\.(?!\d)", ". "),
    # The rule above doubles the space after periods that already had one.
    (r"\.  ", ". "),
]


def _clean_paragraph_text(paragraph_text):
    """Apply the punctuation spacing rules to a space-joined paragraph string."""
    for pattern, replacement in _PUNCTUATION_FIXES:
        paragraph_text = re.sub(pattern, replacement, paragraph_text)
    return paragraph_text


def text_from_paragraph(page, block, paragraph, word_count):
    """Build the text of one paragraph from its OCR symbols.

    The symbols of each word are concatenated, the words are joined with single
    spaces, and the punctuation spacing rules are applied to the result.

    Parameters:
        page: index into pdf_content_json['responses'].
        block: index of the block on that page.
        paragraph: index of the paragraph within the block.
        word_count: number of leading words of the paragraph to include.

    Returns:
        The cleaned paragraph text. A period at the very end of the text is
        followed by a single space, as before, which keeps callers that
        concatenate paragraphs without a separator working.

    Raises:
        IndexError / KeyError: if the indices do not exist in pdf_content_json
        or word_count exceeds the number of words in the paragraph.
    """
    words = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]['paragraphs'][paragraph]['words']

    # Plain string joins replace building a DataFrame for every single word.
    # Symbols without a 'text' entry contribute nothing.
    word_strings = [
        ''.join(symbol.get('text', '') for symbol in words[word]['symbols'])
        for word in range(word_count)
    ]

    return _clean_paragraph_text(' '.join(word_strings))


In [17]:
# Build one record per paragraph holding its cleaned text and its position in the document.
data = []

for page in range(page_count):
    for block in range(block_counts_df.loc[page, 'block_count']):
        block_filter = 'page == ' + str(page) + ' & block_count == ' + str(block)
        paragraph_total = paragraph_counts_df.query(block_filter)['paragraph_count'].values[0]
        for paragraph in range(paragraph_total):
            # Number of words previously counted for this paragraph
            word_filter = 'page == ' + str(page) + ' & block == ' + str(block) + ' & paragraph == ' + str(paragraph)
            word_count = word_counts_df.query(word_filter)['word_count'].values[0]
            data.append({
                'text': text_from_paragraph(page, block, paragraph, word_count),
                'paragraph': paragraph,
                'block': block,
                'page': page,
            })

text_df = pd.DataFrame(data)


In [18]:
text_df

Unnamed: 0,block,page,paragraph,text
0,0,0,0,TM
1,1,0,0,ist -
2,2,0,0,imagine your future
3,3,0,0,BUILDING THE DIGITAL FACTORY VALUE CHAIN :
4,4,0,0,How to Maximize the Value of IIoT and IoT Stra...
5,5,0,0,Mike Harmon and Jim Routzong
6,6,0,0,ISG WHITE PAPER
7,7,0,0,"© 2018 Information Services Group, Inc. All Ri..."
8,0,1,0,TM
9,1,1,0,"Over the past half-decade, companies in manufa..."


### Join everything together

In [19]:
# Attach the block coordinates/confidence to every paragraph, then the page dimensions.
document_text = (
    text_df
    .merge(blocks_on_pages_df, on=['page', 'block'])
    .merge(page_dimensions_df, on='page')
)

## Feature Engineering

In [20]:
# Page margins in which content generally does not appear
def is_content_area(x1, x2, x3, x4, y1, y2, y3, y4):
    """Tell whether a block's normalized vertex coordinates fall inside the content area.

    Returns True only if x1 and x4 are both greater than 0.05, y1 is greater
    than 0.05 and y4 is less than 0.9; otherwise False.
    x2, x3, y2 and y3 are accepted but not used.
    """
    x_ok = (x1 > 0.05) and (x4 > 0.05)
    y_ok = (y1 > 0.05) and (y4 < 0.9)
    return bool(x_ok and y_ok)
    

In [21]:
page_height = document_text['height']
page_width = document_text['width']

# Block extent in page units: normalized coordinates scaled by the page size, rounded to 2 decimals
document_text['block_height'] = ((document_text['y4-block'] * page_height) - (document_text['y1-block'] * page_height)).round(2)
document_text['block_width'] = ((document_text['x2-block'] * page_width) - (document_text['x1-block'] * page_width)).round(2)
# Block area as a percentage of the page area
document_text['block_area'] = (document_text['block_width'] * document_text['block_height']) / (page_width * page_height) * 100

# Number of characters in the paragraph text
document_text['text_length'] = document_text['text'].map(len)

# Heuristic: area available per character
document_text['font_size'] = document_text['block_area'] / document_text['text_length']

vertex_columns = ['x1-block', 'x2-block', 'x3-block', 'x4-block', 'y1-block', 'y2-block', 'y3-block', 'y4-block']
document_text['block_is_content'] = document_text[vertex_columns].apply(lambda row: is_content_area(*row), axis=1)

In [22]:
document_text.head(10)

Unnamed: 0,block,page,paragraph,text,confidence,x1-block,x2-block,x3-block,x4-block,y1-block,...,y3-block,y4-block,height,width,block_height,block_width,block_area,text_length,font_size,block_is_content
0,0,0,0,TM,0.96,0.338235,0.343137,0.343137,0.338235,0.079545,...,0.084596,0.084596,792,612,4.0,3.0,0.002476,2,0.001238,True
1,1,0,0,ist -,0.56,0.254902,0.392157,0.382353,0.245098,0.054293,...,0.130051,0.117424,792,612,50.0,84.0,0.866508,5,0.173302,True
2,2,0,0,imagine your future,0.99,0.361111,0.506536,0.506536,0.361111,0.090909,...,0.099747,0.10101,792,612,8.0,89.0,0.146894,19,0.007731,True
3,3,0,0,BUILDING THE DIGITAL FACTORY VALUE CHAIN :,0.99,0.235294,0.771242,0.771242,0.235294,0.224747,...,0.27399,0.27399,792,612,39.0,328.0,2.639136,42,0.062837,True
4,4,0,0,How to Maximize the Value of IIoT and IoT Stra...,0.96,0.227124,0.834967,0.834967,0.227124,0.304293,...,0.57197,0.57197,792,612,212.0,372.0,16.270549,50,0.325411,True
5,5,0,0,Mike Harmon and Jim Routzong,0.99,0.235294,0.606209,0.606209,0.235294,0.910354,...,0.930556,0.930556,792,612,16.0,227.0,0.749323,28,0.026762,False
6,6,0,0,ISG WHITE PAPER,0.99,0.235294,0.338235,0.338235,0.235294,0.943182,...,0.953283,0.953283,792,612,8.0,63.0,0.103981,15,0.006932,False
7,7,0,0,"© 2018 Information Services Group, Inc. All Ri...",0.99,0.357843,0.696078,0.696078,0.357843,0.943182,...,0.953283,0.953283,792,612,8.0,207.0,0.341652,59,0.005791,False
8,0,1,0,TM,0.92,0.163399,0.173203,0.173203,0.163399,0.079545,...,0.087121,0.087121,792,612,6.0,6.0,0.007427,2,0.003714,True
9,1,1,0,"Over the past half-decade, companies in manufa...",0.99,0.23366,0.890523,0.890523,0.23366,0.098485,...,0.188131,0.188131,792,612,71.0,402.0,5.888542,458,0.012857,True


### Classify pages

Classify pages into categories: Title, content, disclaimer, table of contents

Aggregate all text paragraphs on each page and analyze them individually

In [23]:
# One row per page: highest block index, summed text length, and all paragraph texts joined by spaces
page_aggregations = {
    'block': 'max',
    'text_length': 'sum',
    'text': lambda texts: texts.str.cat(sep=' '),
}
document_pages_df = document_text.groupby(['page']).agg(page_aggregations)

Text length per page as a share of the document's total text length (a fraction between 0 and 1)

In [24]:
# Each page's share of the total text length (a fraction, not a percentage)
total_text_length = document_pages_df['text_length'].sum()
document_pages_df['page_text_length_perc'] = document_pages_df['text_length'] / total_text_length

#### Fixed rules: The first page is the title. Pages holding less than 1% of the document's text are marked as empty (e.g. image-only pages).

In [25]:
# Start every page without a label; pages still unlabelled at the end are marked 'content'
document_pages_df['page_class'] = ""

In [26]:
# Pages holding less than 1% of the document's total text length are labelled 'empty'
document_pages_df.loc[document_pages_df.page_text_length_perc < 0.01, 'page_class'] = 'empty'

In [27]:
# The page with index label 0 is always the title page (this overwrites an 'empty' label on it)
document_pages_df.loc[0, 'page_class'] = 'title'

#### Get Topics per Page

Classify text on each page to induce content based on text categories.

In [28]:
document_pages_df.query('page == 2')

Unnamed: 0_level_0,text_length,block,text,page_text_length_perc,page_class
page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,3012,11,"TM iSG The loT is a "" system of systems ” in w...",0.103003,


In [29]:
import os

import textrazor

# The API key is read from the environment so that it is never stored in the notebook.
# A key that was ever committed in a notebook has been exposed and should be revoked.
textrazor_api_key = os.environ.get("TEXTRAZOR_API_KEY")
if not textrazor_api_key:
    raise RuntimeError("Set the TEXTRAZOR_API_KEY environment variable before running this cell")
textrazor.api_key = textrazor_api_key

client = textrazor.TextRazor(extractors=["topics"])
page_topics = []

# One API request per page; the first 15 topic rows of each response are kept.
for page in document_pages_df.index:
    text = document_pages_df.loc[page, 'text']
    response = client.analyze(text)
    page_topics.append(json_normalize(response.json['response'], 'topics')[:15])

In [30]:
# For every page, add up the scores of its topics labelled 'Private law' or 'Common law'.
# page_topics is indexed with the page label, so it must hold one entry per page in page order.
legal_scores = [
    {
        'page': page,
        'score': page_topics[page].query("label == 'Private law' | label == 'Common law'")['score'].sum(),
    }
    for page in document_pages_df.index
]

# Keep only the pages whose summed score is positive
legal_pages = pd.DataFrame(legal_scores).query('score > 0')['page']


In [31]:
# Label the pages whose topics carried a positive 'Private law' / 'Common law' score
document_pages_df.loc[legal_pages, 'page_class'] = 'legal'

In [32]:
# Find the pages that contain a paragraph consisting solely of a
# table-of-contents heading (English or German, compared in lower case).
toc_page = (
    document_text[['text', 'page']]
    .assign(text=lambda frame: frame['text'].str.lower())
    .query("text == 'table of contents' | text == 'table of content' | text == 'inhalt' | text == 'inhaltsverzeichnis' | text == 'content'")['page']
)

In [33]:
# Label the pages on which a table-of-contents heading was found
document_pages_df.loc[toc_page, 'page_class'] = 'toc'

In [34]:
# Every page that has no label yet is a content page
unlabelled = document_pages_df.page_class == ""
document_pages_df.loc[unlabelled, 'page_class'] = "content"

# Bring the page label and text share down to paragraph level
document_pages_df = document_pages_df.reset_index()[['page_class', 'page', 'page_text_length_perc']]
document_text = document_text.merge(document_pages_df, on='page')

# Keep the paragraphs that lie in the content area of title and content pages
document_content = document_text.query("block_is_content == True & (page_class == 'content' | page_class == 'title')")

In [37]:
document_text[['page', 'page_class']].drop_duplicates()

Unnamed: 0,page,page_class
0,0,title
8,1,content
22,2,content
36,3,content
72,4,content
98,5,content
128,6,content
148,7,content
164,8,content
174,9,content


#### Title

In [38]:
# Find title: among the title-page paragraphs longer than 15 characters,
# take the index label of the one with the largest font_size.
title_candidates = (
    document_text
    .sort_values('font_size', ascending=False)
    .query("page_class == 'title' & text_length > 15")
)
headline_index = title_candidates.index.values[0]

In [39]:
# headline_index is an index label (taken from .index), so it has to be looked up
# with .loc; positional .iloc only agrees while the index is an unbroken 0..n-1 range.
title = document_text.loc[headline_index, 'text'].title()

In [40]:
title

'How To Maximize The Value Of Iiot And Iot Strategy'

#### Main copies

In [41]:
# Find copies 

In [42]:
# Paragraphs ordered from longest to shortest. text_length is the number of
# characters, so "words" in the names below refers to characters.
content_by_length = document_content.sort_values('text_length', ascending=False)
total_words = content_by_length['text_length'].sum()

# Select the longest paragraphs for as long as their running total stays below 80% of all text
word_threshold = round(total_words * 0.80)

main_copies_index = content_by_length['text_length'].cumsum() < word_threshold

In [43]:
# Keep only the paragraphs flagged in main_copies_index, using the same longest-first ordering the mask was built on
main_copies_df = document_content.sort_values('text_length', ascending = False)[main_copies_index]

In [44]:
# Restore reading order: by page, then block, then paragraph
main_copies_df = main_copies_df.sort_values(['page', 'block', 'paragraph'], ascending = True)

In [45]:
# Join the selected paragraphs with exactly one space between them. Without a separator,
# a paragraph that does not end in whitespace runs straight into the next one;
# stripping first avoids doubled spaces for paragraphs that do.
main_copies_text = main_copies_df['text'].str.strip().str.cat(sep=' ')

### Output

In [46]:
main_copies_text

'Over the past half-decade, companies in manufacturing-related industries have begun to tap the potential of the digital factory value chain-and, in so doing, have begun to reinvent and transform their operations and the larger value chain. By championing digital connectivity, these enterprises have created digital transformation initiatives that allow them to measure and optimize their processes via quantitative means rather than just qualitative means. The digital factory value chain has shown to generate tremendous gains in efficiency and output along with improved health and safety on the manufacturing floor. Many manufacturers and enterprises in other industries that rely on operational connectivity have invested heavily in digital factory initiatives over the past five years. This trend is clearly visible in the introduction of Industrie 4. 0 in Europe. In fact, digital factory initiatives across industries are projected to contribute significantly to the global GDP over the next

In [None]:
! rm ..files/*