In [37]:
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator

from elasticsearch import helpers, Elasticsearch

import concurrent.futures

import time
import os
import hashlib

In [38]:
def exportPDF(filename,index='test!pdf'):
    """Index every text box of a PDF file into Elasticsearch.

    The file is first hashed with SHA-1, then laid out page by page with
    pdfminer. Each ``LTTextBox`` found is passed to ``export2elk`` as one
    document of type ``'pdf'`` carrying its bounding box, its 1-based page
    number, the file name, the file's SHA-1 hex digest and the box text.

    Args:
        filename: Path of the PDF file to read.
        index: Name of the Elasticsearch index that receives the documents.
    """
    BUF_SIZE = 65536  # the file is hashed in chunks of this many bytes
    print(filename)

    # The context manager releases the handle even if parsing or indexing
    # raises; the page generator is fully consumed inside the block.
    with open(filename, 'rb') as fp:
        sha1Hash = hashlib.sha1()
        for data in iter(lambda: fp.read(BUF_SIZE), b''):
            sha1Hash.update(data)
        # The digest is identical for every text box of the file, so it is
        # computed once instead of once per indexed document.
        sha1 = sha1Hash.hexdigest()

        # Hashing left the handle at EOF; rewind so parsing starts from a
        # known position.
        fp.seek(0)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for num_page, page in enumerate(PDFPage.get_pages(fp), start=1):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                # Only text boxes are indexed; other layout objects are skipped.
                if not isinstance(lobj, LTTextBox):
                    continue
                doc = {
                    'x0': lobj.bbox[0],
                    'y0': lobj.bbox[1],
                    'x1': lobj.bbox[2],
                    'y1': lobj.bbox[3],
                    'npage': num_page,
                    'filename': filename,
                    'sha1': sha1,
                    'text': lobj.get_text(),
                }
                export2elk(index, 'pdf', doc)

In [39]:
def export2elk(index,doctype,doc):
    """Index a single document into the Elasticsearch node at localhost:9200.

    Args:
        index: Name of the target index.
        doctype: Value forwarded as ``doc_type`` to the client.
        doc: Document body to index.
    """
    # This function is called once per text box, so building a fresh client
    # (and its connection pool) on every call is wasteful. The client is
    # created on first use and kept on the function object for later calls.
    # NOTE(review): relies on the elasticsearch-py client being safe to share
    # between threads, as its documentation states.
    client = getattr(export2elk, '_client', None)
    if client is None:
        client = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        export2elk._client = client
    client.index(index=index, doc_type=doctype, body=doc)

In [40]:
# Drop any existing 'new-pdf' index (400/404 responses are ignored), then time
# a sequential export of every entry found under ./pdfsamples/.
Elasticsearch([{'host': 'localhost', 'port': 9200}]).indices.delete(
    index='new-pdf',
    ignore=[400, 404],
)

start = time.time()
for filename in os.listdir('./pdfsamples/'):
    exportPDF(os.path.join('./pdfsamples/', filename), 'new-pdf')
finish = time.time()
print(finish - start)

./pdfsamples/kiid.pdf
./pdfsamples/libor.pdf
./pdfsamples/prospectus.PDF
20.26908302307129


In [None]:
# Time the same export with up to five PDFs processed concurrently.
start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_export = {executor.submit(exportPDF, './pdfsamples/'+filename,'new-pdf'): filename for filename in os.listdir('./pdfsamples/')}
    # An exception raised inside a worker is stored on its future and stays
    # invisible unless result() is called, so every future is drained here and
    # failures are reported without aborting the remaining exports.
    for future in concurrent.futures.as_completed(future_to_export):
        filename = future_to_export[future]
        try:
            future.result()
        except Exception as exc:
            print('{} generated an exception: {!r}'.format(filename, exc))
finish = time.time()
print(finish-start)

./pdfsamples/kiid.pdf
./pdfsamples/libor.pdf
./pdfsamples/prospectus.PDF
