In [24]:
import os
import string
import re
import glob
import zipfile
import datetime
import random
from multiprocess import Process, Manager
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import numpy as np
import pandas as pd

In [None]:
# Extraction is switched off by default so that re-running the notebook does
# not unpack every archive again; set to True to (re-)populate lexisnexis/raw/.
EXTRACT_ARCHIVES = False


def stock_from_zip_path(path):
    """Return the stock name encoded in a zip archive's file name.

    The extension is dropped.  If the remaining stem ends in digits, those
    digits and the single character in front of them are removed
    (e.g. 'commerzbank_ag_12.zip' -> 'commerzbank_ag').  A stem without a
    trailing number is returned unchanged instead of losing its last letter.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    without_number = stem.rstrip(string.digits)
    if without_number == stem:
        return stem
    # Drop the one character that precedes the running number.
    # NOTE(review): presumably a separator such as '_' — confirm against the file names.
    return without_number[:-1]


pathes = sorted(glob.glob('lexisnexis/zip_files/*.zip'))

# Group archives by their exact stock name.  A prefix glob such as
# f'{stock}*.zip' would also match archives of any other stock whose name
# merely starts with the same characters.
archives_by_stock = {}
for path in pathes:
    archives_by_stock.setdefault(stock_from_zip_path(path), []).append(path)

# Unique stock names in a deterministic (sorted) order.
stocks = sorted(archives_by_stock)

if EXTRACT_ARCHIVES:
    for stock in stocks:
        for path in archives_by_stock[stock]:
            with zipfile.ZipFile(path, 'r') as zip_ref:
                zip_ref.extractall(f'lexisnexis/raw/{stock}')

In [9]:
def _extract_text_elements(path):
    """Return the text of every text container in the PDF at `path`, in page order.

    Newline characters are removed from each element.
    """
    # NOTE(review): '\n' is dropped without inserting a space, so wrapped lines
    # are glued together unless the extracted text already ends each line with
    # whitespace — confirm against a sample document.
    return [
        element.get_text().replace('\n', '')
        for page in extract_pages(path)
        for element in page
        if isinstance(element, LTTextContainer)
    ]


def _parse_length(text_element):
    """Return the first integer found in a 'Length:' element, or None if there is none."""
    match = re.search(r'\d+', text_element)
    return int(match.group()) if match else None


def pdf_to_dict(path):
    """Parse one article PDF into a flat dictionary.

    Parameters
    ----------
    path : str
        Path to the PDF.  The name of its parent directory (underscores
        replaced by spaces) is used as the company name.

    Returns
    -------
    dict
        Keys, in this order:
        'company'    - parent directory name with '_' replaced by ' '.
        'time_stamp' - third text element of the PDF, or None if the PDF has
                       fewer than three text elements.
        'title'      - first text element, or None for a PDF without text.
        'length'     - first integer in the first element starting with
                       'Length:' that contains one, else None.
        'text'       - all elements after the element equal to 'Body' and
                       before the element starting with 'Load-Date:',
                       excluding elements equal to the title, joined by
                       single spaces with whitespace collapsed; None if no
                       body was found.
    """
    pdf_text_elements = _extract_text_elements(path)
    n_elements = len(pdf_text_elements)

    article = {
        'company': os.path.basename(os.path.dirname(path)).replace('_', ' '),
        # Guarded positional lookups: a short or empty document yields None
        # instead of raising IndexError.
        'time_stamp': pdf_text_elements[2] if n_elements > 2 else None,
        'title': pdf_text_elements[0] if n_elements > 0 else None,
        'length': None,
        'text': None,
    }

    body_parts = []
    prev_text_element = None
    in_body = False
    for text_element in pdf_text_elements:
        # Only the first usable 'Length:' element counts, so a body paragraph
        # that happens to start with 'Length:' cannot overwrite the header value.
        if article['length'] is None and text_element.startswith('Length:'):
            article['length'] = _parse_length(text_element)

        # The body starts with the element following the 'Body' marker ...
        if prev_text_element == 'Body':
            in_body = True
        # ... and ends at the 'Load-Date:' element (exclusive).
        if text_element.startswith('Load-Date:'):
            in_body = False
        # Elements identical to the title are skipped inside the body.
        if in_body and text_element != article['title']:
            body_parts.append(text_element)

        prev_text_element = text_element

    if body_parts:
        # Join the parts and collapse every run of whitespace to one space.
        article['text'] = ' '.join(' '.join(body_parts).split())

    return article


def pdfs_to_dict(processed, pathes):
    """Parse each PDF in `pathes` and append the result to `processed`.

    `processed` is any object with an `append` method; one dictionary
    returned by `pdf_to_dict` is appended per path, in iteration order.
    Nothing is returned.  An exception raised by `pdf_to_dict` propagates and
    stops the remaining paths from being processed.
    """
    for pdf_path in pathes:
        article = pdf_to_dict(pdf_path)
        processed.append(article)

In [43]:
%%time
# Collect every extracted PDF and parse them in parallel worker processes.
pathes = sorted(glob.glob('lexisnexis/raw/*/*.pdf'))
# NOTE(review): the shuffle presumably spreads large/small documents evenly
# over the workers; it is unseeded, so the chunk assignment differs per run.
random.shuffle(pathes)
n_pdfs = len(pathes)

with Manager() as manager:
    # Shared list that every worker appends its parsed articles to.
    articles = manager.list()
    processes = []
    n_processes = 14
    # One chunk of paths per worker (chunks may be empty if there are fewer
    # PDFs than workers).
    pathes = np.array_split(pathes, n_processes)
    for i in range(n_processes):
        p = Process(target=pdfs_to_dict, args=(articles, pathes[i]))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Copy out of the manager before it shuts down.
    articles = list(articles)

# A worker that raises stops processing the rest of its chunk, and the parent
# would otherwise never notice.  Report it so an incomplete CSV is not mistaken
# for a complete one.  What was parsed is still saved.
failed_workers = [p for p in processes if p.exitcode != 0]
if failed_workers or len(articles) != n_pdfs:
    print(
        f'WARNING: {len(failed_workers)} of {n_processes} worker processes exited '
        f'abnormally; parsed {len(articles)} of {n_pdfs} PDFs.'
    )

articles = pd.DataFrame(articles)
display(articles.dtypes)
display(articles)

articles.to_csv('../data/articles_raw.csv', sep=';', index=False)

company        object
time_stamp     object
title          object
length        float64
text           object
dtype: object

Unnamed: 0,company,time_stamp,title,length,text
0,commerzbank ag,"March 4, 2019 Monday 10:00 AM GMT",Commerzbank AG Post-stab Berlin 0.625% EUR 250...,229.0,Post-stabilisation notice 4 March 2019 Not for...
1,ks ag,"February 15, 2021 Monday 1:47 PM GMT",DGAP-Adhoc: DEPFA BANK plc: FMS-WM announces s...,292.0,DGAP-Ad-hoc: DEPFA BANK plc / Key word(s): Mis...
2,morphosys ag,"January 6, 2022 Thursday 1:19 AM EST","INTERNATIONAL PATENT: MORPHOSYS AG, GILEAD SCI...",224.0,"GENEVA, Jan. 6 -- MORPHOSYS AG (Semmelweisstr...."
3,ks ag,"March 24, 2020 Tuesday 6:40 PM GMT",DGAP-Adhoc: Delignit AG: Delignit responds to ...,555.0,Delignit AG: Delignit responds to COVID-19 pan...
4,s&t ag,Plus Company Updates(PCU),Swiss Official Gazette of Commerce notice: HR ...,268.0,Bern: Swiss Official Gazette of Commerce has i...
...,...,...,...,...,...
22389,lanxess ag,"February 1, 2019 Friday",-LANXESS - High-performance prepolymer technol...,690.0,Cologne - Specialty chemicals company LANXESS ...
22390,secunet security networks ag,"August 12, 2020 Wednesday 8:00 AM GMT",DGAP-News: secunet Security Networks AG closes...,1122.0,secunet Security Networks AG closes first half...
22391,dic asset ag,"September 19, 2022 Monday 7:30 AM GMT",EQS-News: DIC Asset AG renews leases for about...,847.0,EQS-News: DIC Asset AG / Key word(s): Real Est...
22392,deutz ag,"August 11, 2022 Thursday 7:18 AM GMT",Deutz AG Half Year Results,1678.0,RNS Number : 6878V Deutz AG 11 August 2022 * O...


CPU times: user 4.08 s, sys: 451 ms, total: 4.53 s
Wall time: 18min 42s


In [44]:
def remove_tz(x):
    """Return `x` with its tzinfo removed if it is a datetime; otherwise return `x` unchanged.

    The clock fields are kept as they are (no conversion to another zone);
    only the tzinfo attribute is replaced by None.
    """
    return x.replace(tzinfo=None) if isinstance(x, datetime.datetime) else x

# Reload the raw export and reduce each time stamp to a calendar date.
# Stamps that cannot be parsed become NaT (errors='coerce'); parsed values are
# handled as UTC (utc=True) before the date part is taken.
articles = (
    pd.read_csv('../data/articles_raw.csv', sep=';')
    .assign(
        time_stamp=lambda frame: pd.to_datetime(
            frame['time_stamp'], errors='coerce', utc=True
        ).dt.date
    )
)

display(articles.dtypes, articles)

articles.to_csv('../data/articles_prep.csv', sep=';', index=False)



company        object
time_stamp     object
title          object
length        float64
text           object
dtype: object

Unnamed: 0,company,time_stamp,title,length,text
0,commerzbank ag,2019-03-04,Commerzbank AG Post-stab Berlin 0.625% EUR 250...,229.0,Post-stabilisation notice 4 March 2019 Not for...
1,ks ag,2021-02-15,DGAP-Adhoc: DEPFA BANK plc: FMS-WM announces s...,292.0,DGAP-Ad-hoc: DEPFA BANK plc / Key word(s): Mis...
2,morphosys ag,2022-01-06,"INTERNATIONAL PATENT: MORPHOSYS AG, GILEAD SCI...",224.0,"GENEVA, Jan. 6 -- MORPHOSYS AG (Semmelweisstr...."
3,ks ag,2020-03-24,DGAP-Adhoc: Delignit AG: Delignit responds to ...,555.0,Delignit AG: Delignit responds to COVID-19 pan...
4,s&t ag,NaT,Swiss Official Gazette of Commerce notice: HR ...,268.0,Bern: Swiss Official Gazette of Commerce has i...
...,...,...,...,...,...
22389,lanxess ag,2019-02-01,-LANXESS - High-performance prepolymer technol...,690.0,Cologne - Specialty chemicals company LANXESS ...
22390,secunet security networks ag,2020-08-12,DGAP-News: secunet Security Networks AG closes...,1122.0,secunet Security Networks AG closes first half...
22391,dic asset ag,2022-09-19,EQS-News: DIC Asset AG renews leases for about...,847.0,EQS-News: DIC Asset AG / Key word(s): Real Est...
22392,deutz ag,2022-08-11,Deutz AG Half Year Results,1678.0,RNS Number : 6878V Deutz AG 11 August 2022 * O...
