# 01_Extract Text (from PDF files)

### Import Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
from nltk.tokenize import sent_tokenize

In [3]:
import fitz

### Define File Paths

In [4]:
# Directory that is scanned for PDF reports
path_pdf = 'Reports'
# CSV file that receives the extracted sentences
fname_out = 'sentences.csv'

### Define Function: Count words (alphabetic words only)

In [5]:
def get_cnt(text):
    """Return how many whitespace-separated tokens of *text* are purely alphabetic.

    Tokens containing digits or punctuation (e.g. ``'2020'``, ``'end.'``)
    are not counted.
    """
    return sum(1 for token in text.split() if token.isalpha())

### Define Function: Get text from blocks

In [6]:
def get_text(block_lst, min_word_cnt=10):
    """Join the usable text blocks of one page into a single string.

    Parameters
    ----------
    block_lst : iterable of sequences
        Blocks as produced by ``page.get_text('blocks')``. Only two fields
        are read here: index 4 (the block's text) and index 6 (the block
        type, where 0 means text).
    min_word_cnt : int, optional
        Minimum number of purely alphabetic words (as counted by
        ``get_cnt``) a block must contain to be kept. Defaults to 10.

    Returns
    -------
    str
        The kept blocks joined with a newline; an empty string when no
        block qualifies.
    """
    TEXT_BLOCK = 0  # block_type: 0 = text

    text_lst = []
    for block in block_lst:
        if block[6] != TEXT_BLOCK:
            continue

        # PyMuPDF(fitz) workaround: 'fi ' --> 'fi'
        # NOTE(review): this also glues a genuine word ending in 'fi' to the
        # next word; kept as-is because it is the file's existing behaviour.
        text = block[4].replace('fi ', 'fi')

        # Drop short blocks (fewer than min_word_cnt alphabetic words)
        if get_cnt(text) < min_word_cnt:
            continue

        # Re-join words that were hyphenated across a line break
        text_lst.append(text.replace('-\n', ''))

    return '\n'.join(text_lst)

### Define Function: Get sentences from PDF files

In [7]:
def get_sentence(fname, skip_page = (0,)):
    """Extract the sentences of a PDF file, page by page.

    Parameters
    ----------
    fname : str
        Path of the PDF file to open with ``fitz.open``.
    skip_page : container of int, optional
        1-based page numbers to leave out. The default ``(0,)`` matches no
        page, so every page is processed.

    Returns
    -------
    list of str
        One entry per sentence found by ``sent_tokenize``, with every run
        of whitespace (including newlines and tabs) collapsed to a single
        space.
    """
    sent_lst = []

    # The context manager closes the document even when text extraction or
    # tokenising raises, so the file handle is never leaked.
    with fitz.open(fname) as doc:
        for page_no, page in enumerate(doc, start=1):

            # Skip page
            if page_no in skip_page:
                continue

            text = get_text(page.get_text('blocks'))

            # Delete '\n', '\t' and strip
            sent_lst.extend(
                ' '.join(sentence.split()) for sentence in sent_tokenize(text)
            )

    return sent_lst

### Define Function: Generate document (DataFrame)

In [8]:
def gen_document(doc_id, fname, sent_lst):
    """Build the per-document DataFrame, one row per sentence.

    The frame has the columns ``doc_id``, ``fname`` and ``sentence``;
    scalar ``doc_id`` / ``fname`` values are repeated on every row.
    """
    columns = dict(doc_id=doc_id, fname=fname, sentence=sent_lst)
    return pd.DataFrame(columns)

### Define Function: Read PDF file list

In [9]:
def read_filelist(path):
    """Read every PDF file in *path* and return all sentences as one DataFrame.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        The per-document frames from ``gen_document`` stacked in directory
        order. The row index is not reset, so it restarts at 0 for every
        document. An empty DataFrame is returned when the directory
        contains no PDF file.

    Notes
    -----
    ``doc_id`` is the position of the file in the case-insensitively sorted
    directory listing. Non-PDF entries are skipped but still consume a
    position, so ids may have gaps.
    """
    frames = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # doc_id assignment reproducible across runs and platforms.
    entries = sorted(os.listdir(path), key=str.casefold)

    for idx, fname in enumerate(entries):
        p_fname = os.path.join(path, fname)
        print('path + fname >>>', p_fname)

        # Accept '.pdf' in any letter case; ignore everything else
        if os.path.splitext(fname)[1].lower() != '.pdf':
            continue
        print('fname >>>',fname)

        doc_id = idx

        print(f'doc_id = [{doc_id}], fname = [{fname}]')
        print('')

        sent_lst = get_sentence(p_fname)
        frames.append(gen_document(doc_id, fname, sent_lst))

    # pd.concat() raises on an empty list, so handle "no PDFs" explicitly
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop
    return pd.concat(frames)

### Create DataFrame from PDF Files

In [10]:
%%time
# Build the sentence table from the PDF files found in path_pdf
df = read_filelist(path_pdf)
print('==== End of jobs ====')

path + fname >>> reports\Aldi_2020.pdf
fname >>> Aldi_2020.pdf
doc_id = [0], fname = [Aldi_2020.pdf]

path + fname >>> reports\Aldi_2021.pdf
fname >>> Aldi_2021.pdf
doc_id = [1], fname = [Aldi_2021.pdf]

path + fname >>> reports\Allianz_2020.pdf
fname >>> Allianz_2020.pdf
doc_id = [2], fname = [Allianz_2020.pdf]

path + fname >>> reports\Allianz_2021.pdf
fname >>> Allianz_2021.pdf
doc_id = [3], fname = [Allianz_2021.pdf]

path + fname >>> reports\amazon_2020.pdf
fname >>> amazon_2020.pdf
doc_id = [4], fname = [amazon_2020.pdf]

path + fname >>> reports\amazon_2021.pdf
fname >>> amazon_2021.pdf
doc_id = [5], fname = [amazon_2021.pdf]

path + fname >>> reports\Asklepios_2020.pdf
fname >>> Asklepios_2020.pdf
doc_id = [6], fname = [Asklepios_2020.pdf]

path + fname >>> reports\Asklepios_2021.pdf
fname >>> Asklepios_2021.pdf
doc_id = [7], fname = [Asklepios_2021.pdf]

path + fname >>> reports\Axel Springer_2020.pdf
fname >>> Axel Springer_2020.pdf
doc_id = [8], fname = [Axel Springer_2020.p

path + fname >>> reports\merck_2020.pdf
fname >>> merck_2020.pdf
doc_id = [73], fname = [merck_2020.pdf]

path + fname >>> reports\merck_2021.pdf
fname >>> merck_2021.pdf
doc_id = [74], fname = [merck_2021.pdf]

path + fname >>> reports\Munich Re_2021.pdf
fname >>> Munich Re_2021.pdf
doc_id = [75], fname = [Munich Re_2021.pdf]

path + fname >>> reports\MunichRe_2020.pdf
fname >>> MunichRe_2020.pdf
doc_id = [76], fname = [MunichRe_2020.pdf]

path + fname >>> reports\Philip Morris_2020.pdf
fname >>> Philip Morris_2020.pdf
doc_id = [77], fname = [Philip Morris_2020.pdf]

path + fname >>> reports\Philip Morris_2021.pdf
fname >>> Philip Morris_2021.pdf
doc_id = [78], fname = [Philip Morris_2021.pdf]

path + fname >>> reports\Procter Gamble_2020.pdf
fname >>> Procter Gamble_2020.pdf
doc_id = [79], fname = [Procter Gamble_2020.pdf]

path + fname >>> reports\Procter Gamble_2021.pdf
fname >>> Procter Gamble_2021.pdf
doc_id = [80], fname = [Procter Gamble_2021.pdf]

path + fname >>> reports\pwc_

In [11]:
# Display the combined sentence table
df

Unnamed: 0,doc_id,fname,sentence
0,0,Aldi_2020.pdf,Reduce total weight of own-brand virgin plasti...
1,0,Aldi_2020.pdf,30% recycled content in own-brand plastic pack...
2,0,Aldi_2020.pdf,2025 New goal.
3,0,Aldi_2020.pdf,100% sustainably certified and/or recycled mat...
4,0,Aldi_2020.pdf,The requirement refers to all own-brand produc...
...,...,...,...
3418,120,ZFfriedrichshafen_2021.pdf,Dieser Bericht liegt in deutscher und englisch...
3419,120,ZFfriedrichshafen_2021.pdf,Beide Fassungen stehen auch im Internet unter ...
3420,120,ZFfriedrichshafen_2021.pdf,Zur besseren Lesbarkeit wird in diesem Geschäf...
3421,120,ZFfriedrichshafen_2021.pdf,Es werden damit gleichermaßen alle Geschlechte...


In [12]:
# Save the sentence table to fname_out; the row index is not written
df.to_csv(fname_out, index=False)

In [14]:
# Company name: the part of the file name before the first underscore
df['company'] = (
    df['fname']
    .str.split('_')
    .apply(lambda parts: parts[0])
)

In [15]:
# Extract the year of the report: the token that follows the FIRST
# underscore of the file name, ending at the next '_', '(', '-' or '.'.
# Anchoring on the first underscore (the same separator used for the
# company name) keeps the result correct when the company part itself
# contains '-' or '.', e.g. 'Coca-Cola_2020.pdf' -> '2020'.
# File names without an underscore yield NaN.
df['year'] = df['fname'].str.extract(r'^[^_]*_([^_(\-.]*)', expand=False)

In [16]:
# Preview the first rows, now including the company and year columns
df.head()

Unnamed: 0,doc_id,fname,sentence,company,year
0,0,Aldi_2020.pdf,Reduce total weight of own-brand virgin plasti...,Aldi,2020
1,0,Aldi_2020.pdf,30% recycled content in own-brand plastic pack...,Aldi,2020
2,0,Aldi_2020.pdf,2025 New goal.,Aldi,2020
3,0,Aldi_2020.pdf,100% sustainably certified and/or recycled mat...,Aldi,2020
4,0,Aldi_2020.pdf,The requirement refers to all own-brand produc...,Aldi,2020


In [17]:
# Number of whitespace-separated words in each sentence
df['word_counts'] = df['sentence'].str.split().str.len()

# Total words per (company, year); companies become the columns, years the rows
df_word_cnt = (
    df.groupby(['company', 'year'])['word_counts']
    .sum()
    .unstack(level=0)
)

### Export Data to CSV File

In [18]:
# Save the enriched table (df with its added columns); the row index is not written
df.to_csv("sentences_company_year.csv", index=False)

---

In [13]:
# End of file