In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from collections import defaultdict
from math import *
import pdfminer
import numpy as np
import json

In [2]:
def createPDFDoc(fp):
    """Build a ``PDFDocument`` from an open binary file object.

    Args:
        fp: File object handed to ``PDFParser``.

    Returns:
        The parsed ``PDFDocument`` (opened with an empty password).

    Raises:
        ValueError: With message "Not extractable" when the document reports
            ``is_extractable`` as false.
    """
    doc = PDFDocument(PDFParser(fp), password='')
    # Guard clause: bail out early when text extraction is not permitted.
    if not doc.is_extractable:
        raise ValueError("Not extractable")
    return doc

In [3]:
def createDeviceInterpreter():
    """Create a layout-aggregating device and a page interpreter.

    Both objects are built on the same ``PDFResourceManager``; the device is
    configured with a default-constructed ``LAParams``.

    Returns:
        A ``(device, interpreter)`` tuple.
    """
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
    return aggregator, page_interpreter

In [4]:
def parse_obj(objs):
    """Index the text lines found in ``objs`` by their integer position.

    Walks the given layout objects, descending into ``LTFigure`` containers
    and ``LTTextBox`` objects, and records every ``LTTextLine`` encountered.

    Args:
        objs: Iterable of pdfminer layout objects (e.g. the children of a
            page layout).

    Returns:
        A nested ``defaultdict`` where ``js[y0][x0]`` is a dict with the keys
        ``"bbox"``, ``"height"`` (floored), ``"width"`` and ``"text"`` for the
        line whose ``x0``/``y0`` attributes truncate to ``(x0, y0)``. When
        several lines share the same truncated position, the last one wins.
    """
    js = defaultdict(lambda: defaultdict(dict))
    for obj in objs:
        if isinstance(obj, pdfminer.layout.LTFigure):
            # Figures are containers: recurse and merge what was found, so
            # text nested inside a figure is kept instead of being dropped.
            for y0, row in parse_obj(obj).items():
                js[y0].update(row)
        elif isinstance(obj, pdfminer.layout.LTTextBox):
            for line in obj:
                if isinstance(line, pdfminer.layout.LTTextLine):
                    js[int(line.y0)][int(line.x0)] = {
                        "bbox": line.bbox,
                        "height": floor(line.height),
                        "width": line.width,
                        "text": line.get_text()
                    }

    return js

In [5]:
def get_text(js, split_x=None, gap=18):
    """Split indexed text lines into a left and a right column, top to bottom.

    Args:
        js: Mapping ``js[y0][x0] -> {"text": ...}`` as produced by
            ``parse_obj``.
        split_x: Lines with ``x0 < split_x`` go to the left column, all
            others to the right one. When omitted, the notebook-level
            ``max_widths`` variable is used, which keeps existing
            ``get_text(js)`` calls working.
        gap: When a line sits more than ``gap`` units below the previous line
            of the same column, its text is prefixed with a blank line
            (``'\n\n'``).

    Returns:
        ``(l_sections, r_sections)``: two lists of ``(text, (x0, y0))``
        tuples. Each list starts with the sentinel ``("", (0, np.inf))``, so
        the first real line of a column always gets the blank-line prefix.
        Lines whose stripped text is at most one character long are skipped.
    """
    if split_x is None:
        # Backward-compatible fallback to the global computed by the notebook.
        split_x = max_widths

    l_sections, r_sections = [("", (0, np.inf))], [("", (0, np.inf))]
    # Visit rows from the largest y0 to the smallest, left to right in a row.
    for y0 in sorted(js, reverse=True):
        for x0 in sorted(js[y0]):
            text = js[y0][x0]["text"].strip()
            if len(text) <= 1:
                continue
            column = l_sections if x0 < split_x else r_sections
            # Compare against the y0 of the column's previous entry.
            if column[-1][1][1] - y0 > gap:
                text = '\n\n' + text
            column.append((text, (x0, y0)))
    return l_sections, r_sections

In [6]:
# Open the PDF and set up the parsing pipeline used by the cells below.
fp = open('1908.07836.pdf', 'rb')
document = createPDFDoc(fp)  # NOTE(review): nothing in this notebook closes `fp` explicitly; an earlier note assumed it is closed automatically -- confirm, and close it once all pages are processed
device, interpreter = createDeviceInterpreter()
# `pages` is advanced with next() and a for-loop in later cells, so it is
# consumed incrementally; presumably `fp` has to stay open until then.
pages = PDFPage.create_pages(document)

In [7]:
# Accumulator for the (text, (x0, y0)) entries returned by get_text;
# re-running this cell resets it to empty.
sections = []

In [8]:
# Process the first page and derive the column-split threshold from it.
# NOTE: every run of this cell consumes one more page from `pages` and appends
# to `sections`, so re-run the cells that create them first when repeating it.
interpreter.process_page(next(pages))
layout = device.get_result()
js = parse_obj(layout._objs)
page_lines = [js[y0][x0] for y0 in js for x0 in js[y0]]
font_size = sorted({line["height"] for line in page_lines})
# The two largest line heights are excluded when measuring line width.
# Slicing (instead of indexing [-1] and [-2]) tolerates a page that has only
# one distinct height.
heading_heights = font_size[-2:]
# Fall back to the full page width when no other line remains, so that
# get_text puts every line in the left column instead of max() raising.
max_widths = max(
    (line["width"] for line in page_lines if line["height"] not in heading_heights),
    default=layout.width,
)
# True when the widest remaining line spans less than 65% of the page width
# (not read by any other cell visible here).
side_by_side = max_widths < layout.width * 0.65
l_sections, r_sections = get_text(js)
sections.extend(l_sections)
sections.extend(r_sections)

In [9]:
# Run every remaining page through the same pipeline and collect its text,
# left column first, then right column.
for page in pages:
    interpreter.process_page(page)
    layout = device.get_result()
    js = parse_obj(layout._objs)
    l_sections, r_sections = get_text(js)
    sections.extend(l_sections + r_sections)

In [10]:
# Emit all collected text fragments as one space-separated string.
print(" ".join(fragment for fragment, _position in sections))

 

PubLayNet: largest dataset ever for document 

layout analysis 

Xu Zhong IBM Research Australia IBM Research Australia 60 City Road, Southbank 60 City Road, Southbank VIC 3006, Australia VIC 3006, Australia peter.zhong@au1.ibm.com 

Abstract—Recognizing the layout of unstructured digital documents is an important step when parsing the documents into structured machine-readable format for downstream applications. Deep neural networks that are developed for computer vision have been proven to be an effective method to analyze layout of document images. However, document layout datasets that are currently publicly available are several magnitudes smaller than established computing vision datasets. Models have to be trained by transfer learning from a base model that is pre-trained on a traditional computer vision dataset. In this paper, we develop the PubLayNet dataset for document layout analysis by automatically matching the XML representations and the content of over 1 million PDF 