In [8]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from collections import defaultdict
from math import *
import pdfminer
import numpy as np
import json

In [2]:
def createPDFDoc(fp):
    """Build a PDFDocument from an open binary file handle.

    Args:
        fp: File object handed straight to ``PDFParser``.

    Returns:
        The ``PDFDocument`` created with an empty password.

    Raises:
        ValueError: If the document reports that it is not extractable.
    """
    doc = PDFDocument(PDFParser(fp), password='')
    # Only hand the document back when text extraction is permitted.
    if doc.is_extractable:
        return doc
    raise ValueError("Not extractable")

In [3]:
def createDeviceInterpreter():
    """Create a page aggregator and an interpreter that share one resource manager.

    Returns:
        A ``(device, interpreter)`` tuple: the ``PDFPageAggregator`` built with
        default ``LAParams`` and the ``PDFPageInterpreter`` that feeds it.
    """
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    return aggregator, PDFPageInterpreter(resources, aggregator)

In [6]:
# Open the PDF in binary mode and wrap it in a pdfminer document.
fp = open('1908.07836.pdf', 'rb')
# NOTE(review): createPDFDoc as defined in this notebook never calls fp.close(),
# and neither does this cell. Unless pdfminer closes the handle internally
# (unverified), fp stays open — TODO confirm, and close it once no further pages
# are needed.
document = createPDFDoc(fp)
device, interpreter = createDeviceInterpreter()
pages = PDFPage.create_pages(document)
# Only the first item yielded by create_pages is processed here.
interpreter.process_page(next(pages))
# Result the aggregator device collected for the page just processed.
layout = device.get_result()

In [82]:
def parse_obj(objs):
    """Index every text line found in ``objs`` by its integer position.

    Text boxes are scanned for text lines; figures are treated as containers
    and searched recursively, with their lines folded into the same result.

    Args:
        objs: Iterable of pdfminer layout objects (e.g. ``layout._objs``).

    Returns:
        A nested ``defaultdict`` such that ``js[y0][x0]`` is a dict with the
        keys ``"bbox"``, ``"height"`` (floored), ``"width"`` and ``"text"``
        (the unstripped line text), where ``x0``/``y0`` are the line's
        coordinates truncated with ``int``. When two lines share the same
        truncated position, the one visited last wins.
    """
    js = defaultdict(lambda: defaultdict(dict))
    for obj in objs:
        if isinstance(obj, pdfminer.layout.LTFigure):
            # A figure is a container: merge what the recursive call found,
            # otherwise text nested inside figures is silently dropped.
            for y0, row in parse_obj(obj._objs).items():
                js[y0].update(row)
        elif isinstance(obj, pdfminer.layout.LTTextBox):
            for o in obj._objs:
                if isinstance(o, pdfminer.layout.LTTextLine):
                    x0, y0 = int(o.x0), int(o.y0)
                    js[y0][x0] = {
                        "bbox": o.bbox,
                        "height": floor(o.height),
                        "width": o.width,
                        "text": o.get_text()
                    }

    return js

In [83]:
js = parse_obj(layout._objs)
# Flatten the nested {y0: {x0: line}} index once so each statistic reuses it.
text_lines = [js[y0][x0] for y0 in js for x0 in js[y0]]
# Distinct floored line heights, ascending.
font_size = sorted({line["height"] for line in text_lines})
# The two largest heights are left out of the width estimate. Slicing (rather
# than indexing [-1] and [-2]) also works when fewer than two heights exist.
heading_heights = font_size[-2:]
body_widths = [line["width"] for line in text_lines
               if line["height"] not in heading_heights]
# If no line falls outside those heights, fall back to all lines; default=0
# covers a page with no text lines at all instead of raising ValueError.
max_widths = max(body_widths or [line["width"] for line in text_lines], default=0)
# Two-column heuristic: the widest remaining line spans under 65% of the page.
side_by_side = max_widths < layout.width * 0.65
font_size, max_widths, side_by_side

([5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 28], 255.9717979000003, True)

In [131]:
# Each list starts with a sentinel whose y is +inf, so the first real line of a
# column always counts as following a large vertical gap.
l_sections, r_sections = [("", (0, np.inf))], [("", (0, np.inf))]
# Walk lines from the largest y0 to the smallest, and by ascending x0 within a row.
for y0 in sorted(js, reverse=True):
    for x0 in sorted(js[y0]):
        text = js[y0][x0]["text"].strip()
        # Skip lines that are empty or a single character.
        if len(text) <= 1:
            continue
        # Lines starting left of max_widths go to the left list, others to the right.
        sections = l_sections if x0 < max_widths else r_sections
        # A drop of more than 18 units from the previous line in the same list
        # is marked with a blank-line separator.
        if sections[-1][1][1] - y0 > 18:
            text = '\n\n' + text
        sections.append((text, (x0, y0)))

In [132]:
# Print the right-hand list's text, space-separated; substitute l_sections to
# inspect the left-hand list instead.
print(*(section_text for section_text, _position in r_sections))

 

Jianbin Tang Antonio Jimeno Yepes IBM Research Australia 60 City Road, Southbank VIC 3006, Australia jbtang@au1.ibm.com antonio.jimeno@au1.ibm.com 

a slow and expensive process, which is a stepping curve when willing to use these techniques in new domains. In this work, we propose a method to automatically annotate the document layout of over 1 million PubMed Central™ PDF articles and generate a high-quality document layout dataset called PubLayNet. The dataset contains over 360k page samples and covers typical document layout elements such as text, title, list, ﬁgure, and table. Then, we evaluate deep object detection neural networks on the PubLayNet dataset and the performance of ﬁne tuning the networks on existing small manually annotated corpora. We show that the automatically annotated dataset is suitable to train a model to recognize the layout of scientiﬁc articles, and the model pre-trained on the dataset can be a more effective base in transfer learning. 

II. RELATED WORK