In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Ask the inline backend for the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

In [13]:
import json

In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
    LAParams,
    LTContainer,
    LTTextLine,
    LTFigure
)

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
# Set matplotlib's default figure DPI to 300 for every figure created afterwards.
mpl.rcParams['figure.dpi'] = 300

In [4]:
from pdf2image import convert_from_path, convert_from_bytes

In [5]:
import pdfminer.high_level

In [7]:
def _bbox_to_dict(bbox):
    """Convert a pdfminer ``(x0, y0, x1, y1)`` bounding box into a labelled dict.

    The result uses the same key layout as the page-level bbox built elsewhere
    in this notebook: ``x1``/``x2`` are the left/right x coordinates and
    ``y1``/``y2`` are the bottom/top y coordinates.
    """
    x0, y0, x1, y1 = bbox
    return {"x1": x0, "x2": x1, "y1": y0, "y2": y1}


def extract_objects(layout, extracted_objects):
    """Recursively collect text lines and figures from a pdfminer layout tree.

    Args:
        layout: A layout object. Anything that is not an ``LTContainer`` is
            ignored.
        extracted_objects: List that is extended in place with one dict per
            object found:
            ``{"type": "text", "text": ..., "bbox": {...}}`` for text lines and
            ``{"type": "image", "bbox": {...}}`` for figures.

    Returns:
        None. Results are appended to ``extracted_objects``.
    """
    if not isinstance(layout, LTContainer):
        return

    for obj in layout:
        if isinstance(obj, LTTextLine):
            extracted_objects.append({
                "type": "text",
                "text": obj.get_text(),
                "bbox": _bbox_to_dict(obj.bbox),
            })
        elif isinstance(obj, LTFigure):
            extracted_objects.append({
                "type": "image",
                "bbox": _bbox_to_dict(obj.bbox),
            })
            # A figure's vector data appears to be defined recursively, so
            # record only the figure as a whole and do not descend into it.
            continue

        # Descend into every non-figure object. Text lines are commonly nested
        # inside other containers (e.g. text boxes), so recursing only from
        # text lines would never reach them. Non-containers return immediately.
        extract_objects(obj, extracted_objects)

In [10]:
# PDF to analyse, relative to the notebook's working directory.
path = "./temp/f3894452-5c04-4623-b91c-10b7e70d07a2.pdf"

with open(path, "rb") as f:
    # Parse the document and stop early if it does not allow text extraction.
    parser = PDFParser(f)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Interpreter pipeline: the aggregator device holds the layout of the
    # most recently processed page.
    laparams = LAParams(all_texts=True)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pages = list(PDFPage.create_pages(document))
    extracted_page_data = []

    for page_no, page in enumerate(pages):
        interpreter.process_page(page)
        layout = device.get_result()

        # Collect the text lines / figures found on this page.
        contents = []
        extract_objects(layout, contents)

        # Page bbox comes from the media box, read here as indices
        # 0/2 -> x1/x2 and 1/3 -> y1/y2.
        page_data = {
            "bbox": {
                key: page.mediabox[index]
                for key, index in (("x1", 0), ("x2", 2), ("y1", 1), ("y2", 3))
            },
            "contents": contents,
        }
        extracted_page_data.append(page_data)



In [None]:
# Convert the PDF at `path` to images with pdf2image
# (presumably one image per page — confirm against the pdf2image docs).
images = convert_from_path(path)

In [None]:
# Inspect the type of the first converted image.
type(images[0])

In [None]:
# Display the image at index 3 of `images` as a NumPy array.
# NOTE(review): the index is hard-coded; this fails if fewer than four images were produced.
plt.imshow(np.array(images[3]))

In [None]:
# Hand-entered corner coordinates of the rectangle to draw.
# NOTE(review): these look like they were copied from an extracted bounding
# box — confirm which values are x and which are y coordinates; pdfminer
# reports boxes in (x0, y0, x1, y1) order.
image_x1, image_x2 = 80.047, 527.239
image_y1, image_y2 = 531.945, 735.930

# Width and height of the rectangle spanned by those corners.
w, h = image_x2 - image_x1, image_y2 - image_y1

In [None]:
import matplotlib.patches as patches

In [None]:
# Draw the page outline (612 x 792) and the rectangle described by
# image_x1 / image_y1 / w / h on a single set of axes.
fig, ax = plt.subplots()

# Black outline covering the whole page area.
r = patches.Rectangle((0, 0), 612, 792, edgecolor='#111111', fill=False)
# Red outline for the rectangle of interest.
r1 = patches.Rectangle((image_x1, image_y1), w, h, edgecolor='#FF0000', fill=False)

ax.add_patch(r)
ax.add_patch(r1)

# Limit the view to the page area and keep x and y at the same scale.
ax.set_xlim(left=0, right=612)
ax.set_ylim(bottom=0, top=792)

ax.set_aspect('equal')