# Whitepaper Text Extraction

In [1]:
import pandas as pd
import json
import re
from pandas.io.json import json_normalize
from os import listdir

## Initialisierung

### Abfrage der Vision API

In [2]:
!gcloud ml vision detect-text-pdf "gs://mastertradr-pdf-extraction/2/Blockchain_WhitePaper_Grundlagen-Anwendungen-Potentiale.pdf" gs://mastertradr-pdf-extraction/2/

{
  "name": "projects/mastertradr-48fc5/operations/8db64411744dce11"
}


### Kopieren der JSON-Outputs in lokales Verzeichnis

In [None]:
# Check if documents exist
!gsutil ls -r gs://mastertradr-pdf-extraction/2/**

In [None]:
!gsutil cp -r gs://mastertradr-pdf-extraction/2/* 2

### Lade JSON Objekt

In [3]:
files = listdir('2')
json_files = list(filter(lambda x: x.endswith(".json"), files))
json_files.sort()
pdf_files = list(filter(lambda x: x.endswith(".pdf"), files))[0]

In [4]:
pdf_content_json = json.load(open('2/'+json_files[0]))

## Processing JSON

### Abmessungen der Seiten im Dokument

In [7]:
# Wieviele Seiten?
page_count = len(pdf_content_json['responses'])

In [8]:
# Wie groß ist jede Seite?
cols = ['height', 'width', 'page']
data = [] 

for page in range(page_count):
    path = pdf_content_json['responses'][page]['fullTextAnnotation']['pages']
    vals = json_normalize(path)[['height', 'width']].values.tolist()[0]
    vals.append(page)
    data.append(dict(zip(cols, vals)))
    
page_dimensions_df = pd.DataFrame(data)
#page_dimensions_df

## Umfang: Wie viele Elemente sind auf jeder Seite?

### Wieviele Textblöcke (Blocks) pro Seite

In [9]:
# Wie viele Blöcke pro Seite?
data = []
cols = ['block_count', 'page']

for page in range(page_count):
    path = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks']
    vals = [len(path)]
    vals.append(page)
    data.append(dict(zip(cols, vals)))
    
block_counts_df = pd.DataFrame(data)
#block_counts_df

### Wieviele Paragraphen pro Block pro Seite?

In [10]:
# Wie viele Paragraphen pro Block Seite?
data = []
cols = ['paragraph_count','block_count', 'page']

for page in range(page_count):
    for block in range(block_counts_df.query('page == ' + str(page))['block_count'].values[0]):
        path = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]['paragraphs']
        vals = [len(path)]
        vals.append(block)
        vals.append(page)
        data.append(dict(zip(cols, vals)))
    
paragraph_counts_df = pd.DataFrame(data)
#paragraph_counts_df

### Wieviele Wörter pro Paragraph pro Block pro Seite?

In [11]:
# Wie viele Wörter pro Paragraphen pro Block pro Seite?
data = []
cols = ['word_count', 'paragraph','block', 'page']

for page in range(page_count):
    for block in range(block_counts_df.loc[page,'block_count']):
        for paragraph in range(paragraph_counts_df.query( ('page == ' + str(page) + (' & block_count == ' + str(block))))['paragraph_count'].values[0]):
            path = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]['paragraphs'][paragraph]['words']
            vals = [len(path)]
            vals.append(paragraph)
            vals.append(block)
            vals.append(page)
            data.append(dict(zip(cols, vals)))
    
word_counts_df = pd.DataFrame(data)
#word_counts_df

## Eigenschaften der Textkörper

### Eigenschaften aller Textblöcke (Koordinaten, Konfidenz)

In [12]:
# Wie sind die Abmessungen jedes Blocks pro Seite?
# Beispiel-Output
page = 0
block = 1
path = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]['boundingBox']['normalizedVertices'][:4]
#path

In [13]:
# Mit Konfidenz
path_normalized = json_normalize(pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block], ['boundingBox', 'normalizedVertices'], meta = 'confidence')
#path_normalized

In [14]:
# Beispiel für ein einzelnes flaches Objekt:

path = json_normalize(pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block], ['boundingBox', 'normalizedVertices'], meta = 'confidence')

confidence = path['confidence'].drop_duplicates().item()
data = path.reset_index(drop = True).drop('confidence', axis = 1)

data['position'] = data.index+1
data = data.melt(id_vars=['position'])
data['position'] = data['variable'] + data['position'].map(str) + '-block'

data['block'] = block

data = data.pivot(index = 'block', columns = 'position', values = 'value')
data['confidence-block'] = confidence

#data

In [15]:
# Loop durch alle Blöcke im Dokument. Output = Seite, Block-Nummer, Block-Koordinaten und Confidence
blocks_on_pages_df = []
cols = ['x1-block', 'x2-block', 'x3-block', 'x4-block','y1-block', 'y2-block','y3-block', 'y4-block', 'confidence', 'block', 'page']

for page in range(page_count):
    for block in range(block_counts_df.loc[page,'block_count']):
        path = json_normalize(pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block], ['boundingBox', 'normalizedVertices'], meta = 'confidence')
        confidence = path['confidence'].drop_duplicates().item()
        data = path.reset_index(drop = True).drop('confidence', axis = 1)
        data['position'] = data.index
        data = data.melt(id_vars=['position'])
        data['position'] = data['variable'] + data['position'].map(str) + '-block'
        data['block'] = block
        data = data.pivot(index = 'block', columns = 'position', values = 'value')
        data['confidence-block'] = confidence

        vals = data.values.tolist()[0]
        vals.append(block)
        vals.append(page)
        blocks_on_pages_df.append(dict(zip(cols, vals)))

blocks_on_pages_df = pd.DataFrame(blocks_on_pages_df)
#blocks_on_pages_df

### Extrahieren der Wörter aus den Paragraphen

In [16]:
def text_from_paragraph(page, block, paragraph, word_count):
    "Concatenates all text symbols to word elements to a text within a given paragraph"
    
    paragraph_text = []
    
    for word in range(word_count):
        path = pdf_content_json['responses'][page]['fullTextAnnotation']['pages'][0]['blocks'][block]['paragraphs'][paragraph]['words'][word]
        word_str = json_normalize(path, 'symbols')['text'].str.cat()
        paragraph_text.append(word_str)
    
    paragraph_text = ' '.join(paragraph_text)
    paragraph_text = re.sub(" \\. ", ". ", paragraph_text) # Fix me! Use Regex instead
    paragraph_text = re.sub(" \\.", ".", paragraph_text)
    paragraph_text = re.sub(" , ", ", ", paragraph_text)
    paragraph_text = re.sub(" ! ", "! ", paragraph_text)
    paragraph_text = re.sub(" \\? ", "? ", paragraph_text)
    paragraph_text = re.sub(" ' ", "`", paragraph_text)
    paragraph_text = re.sub(" - ", "-", paragraph_text)
    paragraph_text = re.sub(" \\( ", " (", paragraph_text)
    paragraph_text = re.sub(" \\) ", ") ", paragraph_text)
    paragraph_text = re.sub("\\.", ". ", paragraph_text)
    paragraph_text = re.sub("\\.  ", ". ", paragraph_text)

    
    return(paragraph_text)


In [17]:
# Text aus jedem Paragraphen extrahieren
data = []
cols = ['text', 'paragraph','block', 'page']

for page in range(page_count):
    for block in range(block_counts_df.loc[page,'block_count']):
        for paragraph in range(paragraph_counts_df.query( ('page == ' + str(page) + (' & block_count == ' + str(block))))['paragraph_count'].values[0]):
                word_count = word_counts_df.query( ('page == ' + str(page) + (' & block == ' + str(block) + (' & paragraph == ' + str(paragraph)))))['word_count'].values[0]
                vals = [text_from_paragraph(page, block, paragraph, word_count)]
                vals.append(paragraph)
                vals.append(block)
                vals.append(page)
                data.append(dict(zip(cols, vals)))
    
text_df = pd.DataFrame(data)
#text_df

### Join everything together

In [18]:
document_text = pd.merge(text_df, blocks_on_pages_df, on = ['page', 'block'])
document_text = pd.merge(document_text, page_dimensions_df, on = 'page')

## Feature Engineering

In [19]:
def is_content_area(x1, x2, x3, x4, y1, y2, y3, y4):
    if ( (x1 > 0.05) & (x4 > 0.05) & (y1 > 0.05) & (y4 < 0.9) ):
        return(True)
    else:
        return(False)
    

In [20]:
document_text['block_height'] = round( (document_text['y4-block']*document_text['height']) - (document_text['y1-block']*document_text['height']), 2 )
document_text['block_width'] = round( (document_text['x2-block']*document_text['width']) - (document_text['x1-block']*document_text['width']), 2 )
document_text['block_area'] = (document_text['block_width'] * document_text['block_height']) / (document_text['width'] * document_text['height'])*100

document_text['text_length'] = document_text.apply( lambda row: len(row['text']), axis = 1)

document_text['font_size'] = document_text['block_area'] / document_text['text_length']

document_text['block_is_content'] = document_text.apply( lambda row: is_content_area(*row[['x1-block', 'x2-block', 'x3-block', 'x4-block', 'y1-block', 'y2-block', 'y3-block', 'y4-block']]), axis = 1)

In [21]:
document_text.head(10)

Unnamed: 0,block,page,paragraph,text,confidence,x1-block,x2-block,x3-block,x4-block,y1-block,...,y3-block,y4-block,height,width,block_height,block_width,block_area,text_length,font_size,block_is_content
0,0,0,0,7 Fraunhofer,0.92,0.070588,0.467227,0.467227,0.070588,0.036861,...,0.098692,0.098692,841,595,52.0,236.0,2.452463,12,0.204372,False
1,1,0,0,FIT,0.95,0.420168,0.458824,0.458824,0.420168,0.096314,...,0.122473,0.122473,841,595,22.0,23.0,0.10112,3,0.033707,True
2,2,0,0,FRAUNHOFER INSTITUTE FOR APPLIED INFORMATION T...,0.99,0.057143,0.647059,0.647059,0.057143,0.196195,...,0.209275,0.209275,841,595,11.0,351.0,0.77159,59,0.013078,True
3,3,0,0,"BLOCKCHAIN : GRUNDLAGEN, ANWENDUNGEN UND POTEN...",0.99,0.053782,0.731092,0.731092,0.053782,0.282996,...,0.335315,0.335315,841,595,44.0,403.0,3.543601,51,0.069482,True
4,4,0,0,White Paper,0.99,0.052101,0.14958,0.14958,0.052101,0.357907,...,0.368609,0.36742,841,595,8.0,58.0,0.092727,11,0.00843,True
5,5,0,0,FINTECH,0.99,0.847059,0.944538,0.944538,0.847059,0.608799,...,0.624257,0.624257,841,595,13.0,58.0,0.150681,7,0.021526,True
6,6,0,0,FINTECH,0.99,0.215126,0.310924,0.310924,0.215126,0.659929,...,0.677765,0.677765,841,595,15.0,57.0,0.170865,7,0.024409,True
7,7,0,0,Credit,0.99,0.744538,0.781513,0.781513,0.744538,0.665874,...,0.674197,0.674197,841,595,7.0,22.0,0.030776,6,0.005129,True
8,0,1,0,"BLOCKCHAIN : GRUNDLAGEN, ANWENDUNGEN UND POTEN...",0.99,0.115966,0.791597,0.791597,0.115966,0.079667,...,0.13912,0.13912,841,595,50.0,402.0,4.016827,51,0.078761,True
9,1,1,0,WHITE PAPER,0.99,0.112605,0.221849,0.221849,0.112605,0.153389,...,0.161712,0.161712,841,595,7.0,65.0,0.090928,11,0.008266,True


In [None]:
# Categorize Pages: Title, Author, Copyright, Disclaimer, Content
# Use Semantic API
# Analyse Text length or paragraph length

### Klassifiziere Seiten

In [None]:
# Categorize paragraphs: Headline, Copy, Misc.

Kategorisierung der Seiten: Titelseite, Content-Seite, Disclaimer, Bild

Aggregation aller Textparagraphen auf einer Seite zu einem Text. Anschließend Analyse dieser Texte und extrakt.

In [22]:
document_pages_df = document_text.groupby(['page']).agg({
    'block': 'max',
    'text_length': 'sum',
    'text': lambda x: x.str.cat(sep=' ')})

Textlänge in %

In [23]:
document_pages_df['page_text_length_perc']= document_pages_df['text_length']/document_pages_df['text_length'].sum()

Klassifiziere Pages anhand fester Regeln: Erste Seite ist Titel, Seiten mit unter 1% Textanteil sind nur Bild

In [24]:
document_pages_df['page_class'] = ""

In [25]:
document_pages_df.loc[document_pages_df.page_text_length_perc < 0.01, 'page_class'] = 'empty'

In [26]:
document_pages_df.loc[0, 'page_class'] = 'title'

Get Topics per Page

Klassifiziere Legal-Seiten (Disclaimer, etc.)

In [27]:
#pip install textrazor
import textrazor
textrazor.api_key = "726511cf75ca2a4ea269cbf8a44c2b3773e5b805e05a9c82e35a98fc"

client = textrazor.TextRazor(extractors=["topics"])
page_topics = []

for page in document_pages_df.index:
    text = document_pages_df.loc[page, 'text']
    response = client.analyze(text)
    page_topics.append(json_normalize(response.json['response'], 'topics')[:15])
    #response = client.analyze(document_pages_df.loc[page, 'text'])

In [28]:
# Loop durch alle topics pro seite und aufsummieren der scores für Law-Labels

legal_pages = []
cols = ['page', 'score']

for page in document_pages_df.index:
    vals = []
    vals.append(page)
    vals.append(page_topics[page].query("label == 'Private law' | label == 'Common law'")['score'].sum())
    
    legal_pages.append(dict(zip(cols, vals)))

legal_pages = pd.DataFrame(legal_pages).query('score > 0')['page']


In [29]:
document_pages_df.loc[legal_pages, 'page_class'] = 'legal'

In [30]:
# Identify Page with ToC
toc_page = document_text[['text', 'page']].copy()
toc_page['text'] = toc_page['text'].str.lower()
toc_page = toc_page.query("text == 'table of contents' | text == 'table of content' | text == 'inhalt' | text == 'inhaltsverzeichnis' | text == 'content'")['page']

In [31]:
document_pages_df.loc[toc_page, 'page_class'] = 'toc'

In [32]:
# Alles andere als Content

In [33]:
document_pages_df.loc[document_pages_df.page_class == "", 'page_class'] = "content"

In [34]:
document_pages_df = document_pages_df.reset_index()

In [35]:
document_pages_df = document_pages_df[['page_class', 'page', 'page_text_length_perc']]

In [36]:
document_text = document_text.merge(document_pages_df, on = 'page')

In [37]:
document_content = document_text.query("block_is_content == True & (page_class == 'content' | page_class == 'title')")

### Extract Metadata

#### Erstellungsdatum

In [None]:
#pip install pypdf2

In [38]:
from PyPDF2 import PdfFileReader

In [39]:
def get_pdf_info(path):
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
  
    author = info.author
    creator = info.creator
    producer = info.producer
    subject = info.subject
    title = info.title

In [40]:
fp = open('2/'+pdf_files, 'rb')

pdfFile = PdfFileReader(fp)
if pdfFile.isEncrypted:
    try:
        pdfFile.decrypt('')
        print('File Decrypted (PyPDF2)')
    except:
        command = ("cp "+ filename +
            " temp.pdf; qpdf --password='' --decrypt temp.pdf " + filename
            + "; rm temp.pdf")
        os.system(command)
        print('File Decrypted (qpdf)')
        fp = open(filename)
        pdfFile = PdfFileReader(fp)
        info = pdfFile.getDocumentInfo()
else:
        info = pdfFile.getDocumentInfo()
    
    






In [41]:
# Datum
from datetime import datetime
date = datetime.strptime(info['/CreationDate'][2:16], '%Y%m%d%H%M%S').strftime('%Y-%m-%d')
date

'2018-01-09'

In [42]:
# Autor
try:
    author = info['/Author']
except:
    author = ""

author

''

#### Title

In [43]:
# Find title:
headline_index = document_text.sort_values('font_size', ascending = False).query("page_class == 'title' & text_length > 15").head(1).index.values[0]

In [44]:
title = document_text.iloc[headline_index]['text'].title()

In [45]:
title

'Blockchain : Grundlagen, Anwendungen Und Potenziale'

#### Main copies

In [72]:
# Find copies 

In [46]:
total_words = document_content.sort_values('text_length', ascending = False)['text_length'].sum()

# take paragraphs until 80% of Word Count is reached

word_threshold = round(total_words * 0.80)

main_copies_index = word_threshold > document_content.sort_values('text_length', ascending = False)['text_length'].cumsum()

In [47]:
main_copies_df = document_content.sort_values('text_length', ascending = False)[main_copies_index]

In [48]:
main_copies_df = main_copies_df.sort_values(['page', 'block', 'paragraph'], ascending = True)

In [49]:
main_copies_text = main_copies_df['text'].str.cat()

#### Long description

In [None]:
# Create text excerpt

In [50]:
# First n words of main copies
import textwrap
n = 3000
long_description = textwrap.shorten(main_copies_text, n)

#### Short description

In [None]:
# Bei SSL-Problemen:
#pip uninstall pyOpenSSL -y

In [51]:
import requests
meaningcloud_license_key = 'c0ece6dcd78b7f84e02870ab2c92ebb4'

url = "https://api.meaningcloud.com/summarization-1.0"

#text = re.sub('[^A-Za-züÜäÄöÖß0-9\s\.,]+', '', main_copies_text)
#text = re.sub('[^A-Za-z0-9\s\.,]+', '', main_copies_text)
text = main_copies_text

summary_length = str(3)

payload = str("key="+license_key+"&txt="+text+"&sentences="+summary_length).encode('utf-8')
headers = {'content-type': 'application/x-www-form-urlencoded'}

try:
    # We are going to make a request to the Topics Extraction API
    response = requests.request("POST", url, data=payload, headers=headers)
    short_description = response.json()['summary']
    
except:
    print(response.json()['status']['msg'])


#### Topics

In [None]:
#import textrazor
#textrazor.api_key = "726511cf75ca2a4ea269cbf8a44c2b3773e5b805e05a9c82e35a98fc"

#client = textrazor.TextRazor(extractors=["topics"])
#response = client.analyze(main_copies_text)
#topics = json_normalize(response.json['response'], 'topics').query('score > 0.9')[:10]['label']

In [None]:
# Nimm alle page topics von pages die als content oder titel markiert sind

In [52]:
content_pages_index = document_text.query("page_class == 'title' | page_class == 'content'")['page'].unique()

#content_topics = [dict(zip(document_pages_df[['page', 'page_text_length_perc']], page_topics)) for i in content_pages_index]


In [53]:
content_pages_index

array([ 0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])

In [54]:
content_topics = [page_topics[i] for i in content_pages_index]

In [55]:
content_topics = pd.concat(content_topics, keys = content_pages_index ).reset_index()

In [56]:
content_topics['page'] = content_topics['level_0']

In [57]:
content_topics = content_topics[['page', 'label', 'score']]

In [58]:
content_topics = content_topics.merge(document_pages_df[['page', 'page_text_length_perc']], on = 'page')

In [59]:
content_topics['topic_weight'] = content_topics['score'] * content_topics['page_text_length_perc']

In [60]:
content_topics = (content_topics
                  .groupby('label')
                  .agg({'topic_weight': 'sum'})
                  .sort_values('topic_weight', ascending = False)
                  .reset_index()[:15])

In [61]:
topics = content_topics[:10]

### Summary

In [62]:
title

'Blockchain : Grundlagen, Anwendungen Und Potenziale'

In [63]:
short_description

'Längst ist die Blockchain mehr als nur die Technologie hinter der Kryptowährung Bitcoin. Grundsätzlich ist die Blockchain ein elektronisches Register für digitale Datensätze, Ereignisse oder Transaktionen, die durch die Teilnehmer eines verteilten Rechnernetzes verwaltet werden. [...] in den Transaktionen inkludier-ten Transaktionsgebühren für jeden gefundenen Block, der in die Blockchain aufgenommen wird, eine gewisse Anzahl anZudem können Systeme darin unterschieden werden, auf welche Weise ein Konsens über den Systemstatus erreicht wird.'

In [64]:
long_description

'Längst ist die Blockchain mehr als nur die Technologie hinter der Kryptowährung Bitcoin. Vielmehr wird die Technologie mittlerweile als die eigentliche Innovation erachtet, die Experten zufolge das Potenzial hat, etliche Bereiche der Gesellschaft, die weit über das Gebiet digitaler Währungen hinausgehen, zu verändern. Nicht zuletzt aufgrund der vielfältigen Einsatzmöglich keiten rückt sie zunehmend in den Fokus der Öffentlichkeit. Grundsätzlich ist die Blockchain ein elektronisches Register für digitale Datensätze, Ereignisse oder Transaktionen, die durch die Teilnehmer eines verteilten Rechnernetzes verwaltet werden. Im Rahmen der vorliegenden Studie werden Status Quo der Forschung aufgearbeitet, eine the oretische Einordung der Technologie vorgenommen, Blockchain-Anwendungen untersucht so wie die aktuellen Entwicklungen in der Praxis analysiert. Dazu hat das Fraunhofer FIT und seine Projektgruppe Wirtschaftsinformatik unter anderem eine Marktanalyse von Blockchain-Startups durchgefü

In [65]:
topics

Unnamed: 0,label,topic_weight
0,Computing,0.809215
1,Technology,0.709845
2,Information technology management,0.621517
3,Blockchain,0.600401
4,Bitcoin,0.589768
5,Financial cryptography,0.584562
6,Blockchains,0.544899
7,Areas of computer science,0.502687
8,Information technology,0.497081
9,Cryptocurrencies,0.425044


In [66]:
author

''

In [67]:
date

'2018-01-09'

In [None]:
! rm data/*.json
! gsutil rm gs://mastertradr-pdf-extraction/json/*