In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from collections import defaultdict
from math import *
from collections import Counter
import pdfminer
import numpy as np
import nltk

In [2]:
def createPDFDoc(fp):
    """Build a pdfminer document from an open binary file object.

    Args:
        fp: File object of the PDF, handed straight to ``PDFParser``.

    Returns:
        The ``PDFDocument`` created with an empty password.

    Raises:
        ValueError: If the document reports that it is not extractable.
    """
    document = PDFDocument(PDFParser(fp), password='')
    if document.is_extractable:
        return document
    raise ValueError("Not extractable")

In [3]:
def createDeviceInterpreter():
    """Create the layout-aggregating device and the interpreter that feeds it.

    Both objects share a single ``PDFResourceManager``; the device is built
    with default ``LAParams``.

    Returns:
        A ``(device, interpreter)`` tuple.
    """
    resource_manager = PDFResourceManager()
    layout_params = LAParams()
    aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
    return aggregator, PDFPageInterpreter(resource_manager, aggregator)

In [4]:
def parse_obj(objs, hlen):
    """Collect the text lines found in a list of pdfminer layout objects.

    Text boxes contribute one entry per contained text line. Figures are
    treated as containers and searched recursively; the lines found inside
    them are merged into the result.

    Args:
        objs: Iterable of layout objects (for example the children of a page).
        hlen: Offset added to every line's integer ``y0`` to form the row key.

    Returns:
        A nested ``defaultdict`` mapping ``int(y0) + hlen`` -> ``int(x0)`` ->
        a dict with the keys ``"bbox"``, ``"height"`` (floored), ``"width"``
        and ``"text"`` (unstripped). When two lines land on the same
        ``(row, x0)`` pair the one seen last wins.
    """
    js = defaultdict(lambda: defaultdict(dict))
    for obj in objs:
        if isinstance(obj, pdfminer.layout.LTFigure):
            # A figure is a container: recurse and keep what was found inside.
            # (The result of the recursive call used to be thrown away.)
            for row_key, row in parse_obj(obj._objs, hlen).items():
                js[row_key].update(row)
        elif isinstance(obj, pdfminer.layout.LTTextBox):
            for o in obj._objs:
                if isinstance(o, pdfminer.layout.LTTextLine):
                    x0, y0 = int(o.x0), int(o.y0)
                    js[y0 + hlen][x0] = {
                        "bbox": o.bbox,
                        "height": floor(o.height),
                        "width": o.width,
                        "text": o.get_text()
                    }

    return js

In [5]:
def get_side_by_side_text(js, hgap, sentence_width):
    """Order text lines for a two-column layout: left column first, then right.

    Rows are visited from the largest ``y0`` key to the smallest and, within a
    row, by ascending ``x0``. A line goes to the left column when its ``x0`` is
    below ``sentence_width`` and to the right column otherwise. Lines whose
    stripped text has at most one character are skipped. A line is prefixed
    with a blank-line marker when it sits more than ``hgap`` below the previous
    entry of its own column.

    Args:
        js: Mapping ``y0 -> x0 -> {"text": ...}``.
        hgap: Vertical gap above which a new section is started.
        sentence_width: ``x0`` threshold separating the two columns.

    Returns:
        The left entries followed by the right entries, each entry being
        ``(text, (x0, y0))``. Both halves start with a placeholder entry
        ``("", (0, inf))``.
    """
    left_column = [("", (0, np.inf))]
    right_column = [("", (0, np.inf))]
    for y0 in sorted(js, reverse=True):
        row = js[y0]
        for x0 in sorted(row):
            line = row[x0]["text"].strip()
            if len(line) < 2:
                continue
            column = left_column if x0 < sentence_width else right_column
            previous_y = column[-1][1][1]
            if previous_y - y0 > hgap:
                line = '\n\n' + line
            column.append((line, (x0, y0)))
    return left_column + right_column

In [6]:
def get_straight_text(js, hgap):
    """Order text lines for a single-column layout.

    Rows are visited from the largest ``y0`` key to the smallest and, within a
    row, by ascending ``x0``. Lines whose stripped text has at most one
    character are skipped. A line is prefixed with a blank-line marker when it
    sits more than ``hgap`` below the previously kept line.

    Args:
        js: Mapping ``y0 -> x0 -> {"text": ...}``.
        hgap: Vertical gap above which a new section is started.

    Returns:
        A list of ``(text, (x0, y0))`` entries that starts with a placeholder
        entry ``("", (0, inf))``.
    """
    ordered = [("", (0, np.inf))]
    for y0 in sorted(js, reverse=True):
        row = js[y0]
        for x0 in sorted(row):
            line = row[x0]["text"].strip()
            if len(line) < 2:
                continue
            previous_y = ordered[-1][1][1]
            if previous_y - y0 > hgap:
                line = '\n\n' + line
            ordered.append((line, (x0, y0)))
    return ordered

In [7]:
def clean_sections(s: str):
    """Remove the space-separated words of ``s`` that look like links.

    A word is dropped when it contains ``http://``, ``https://`` or ``.com``.
    The remaining words are re-joined with single spaces, so the spacing
    between kept words is unchanged.
    """
    link_markers = ("http://", "https://", ".com")
    kept = [
        word
        for word in s.split(" ")
        if not any(marker in word for marker in link_markers)
    ]
    return " ".join(kept)

In [8]:
# Keep the PDF open for the whole extraction: the pages are consumed one at a
# time inside the loop, so the handle has to outlive it. Nothing here closes
# the file for us, hence the `with` block.
with open('1908.07836.pdf', 'rb') as fp:
    document = createPDFDoc(fp)
    device, interpreter = createDeviceInterpreter()
    pages = PDFPage.create_pages(document)

    # `hlen` is multiplied by the page height to offset the row keys of each
    # page; it shrinks by one per page, so earlier pages get larger keys.
    js, hlen = {}, 100000
    for page_no, page in enumerate(pages):
        interpreter.process_page(page)
        layout = device.get_result()
        _js = parse_obj(layout._objs, hlen * layout.height)
        # Stop at the first page with a line containing "References"; that
        # page is dropped as a whole.
        if any("References" in _js[y0][x0]["text"] for y0 in _js for x0 in _js[y0]):
            break
        js = {**js, **_js}
        hlen -= 1

In [9]:
# Row keys in ascending order; the most common difference between neighbouring
# keys is taken as the regular gap between two lines of text.
ys = sorted(js)
sentence_hgap = Counter(
    upper - lower for lower, upper in zip(ys, ys[1:])
).most_common(1)[0][0]

# Most frequent (floored) line height, and the mean width of the lines that
# have exactly that height.
sentence_font_size = Counter(
    line["height"] for row in js.values() for line in row.values()
).most_common(1)[0][0]
sentence_width = np.mean([
    line["width"]
    for row in js.values()
    for line in row.values()
    if line["height"] == sentence_font_size
])

# Lines narrower than 65% of the last processed page's width are read as a
# two-column layout.
side_by_side = bool(sentence_width < layout.width * 0.65)
sections = (
    get_side_by_side_text(js, sentence_hgap, sentence_width)
    if side_by_side
    else get_straight_text(js, sentence_hgap)
)

In [10]:
# Length of every sentence in the extracted lines; the mean is then used as
# the minimum length a sentence (and a section) must exceed to be kept.
sentences_len = [
    len(sentence)
    for line_text, _position in sections
    for sentence in nltk.sent_tokenize(line_text)
]
min_sentence_length = np.mean(sentences_len)

# Join all lines and cut the result at the blank-line markers.
sectionsL = " ".join(line_text for line_text, _position in sections).split("\n\n")

# Within each section keep only the long-enough sentences, with links removed.
sectionsL = [
    " ".join(
        clean_sections(sentence)
        for sentence in nltk.sent_tokenize(section)
        if len(sentence) > min_sentence_length
    )
    for section in sectionsL
]

# Drop sections that are empty, too short, or made of a single sentence.
sectionsL = [
    section
    for section in sectionsL
    if section != ""
    and len(section) > min_sentence_length
    and len(nltk.sent_tokenize(section)) > 1
]

In [11]:
# Show the cleaned sections as the cell output.
sectionsL

['Abstract—Recognizing the layout of unstructured digital documents is an important step when parsing the documents into structured machine-readable format applications. Deep neural networks that are developed for computer vision have been proven to be an effective method to analyze layout of document images. However, document layout datasets that are currently publicly available are several magnitudes smaller than established computing vision datasets. Models have to be trained by transfer learning from a base model that is pre-trained on a traditional computer vision dataset. In this paper, we develop the PubLayNet dataset for document layout analysis by automatically matching the XML representations and the content of over 1 million PDF articles that are publicly available on PubMed Central™. The size of the dataset is comparable to established computer vision datasets, containing over 360 thousand document images, where typical document layout elements are annotated. The experiment