In [None]:
%matplotlib inline 
%config InlineBackend.figure_format = 'retina'

In [None]:
import pandas as pd

In [None]:
import json
import math

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
    LAParams,
    LTContainer,
    LTTextLine,
    LTFigure
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
import matplotlib.patches as patches

In [None]:
from pdf2image import convert_from_path, convert_from_bytes

In [None]:
import pdfminer.high_level

In [None]:
def extract_objects(layout, extracted_objects):
    """Recursively collect text lines and figures found under ``layout``.

    Children of ``layout`` are visited in order. Each ``LTTextLine`` adds a
    ``{"type": "text", "text": ..., "bbox": ...}`` record and each ``LTFigure``
    adds a ``{"type": "image", "bbox": ...}`` record to ``extracted_objects``.
    Every child is then descended into, whatever its type, so items nested
    inside other containers are picked up as well.

    Args:
        layout: Layout object to walk. Anything that is not an
            ``LTContainer`` is ignored.
        extracted_objects: List that receives the records (mutated in place).

    Returns:
        None. Results are accumulated in ``extracted_objects``.
    """
    if isinstance(layout, LTContainer):
        for child in layout:
            record = None
            if isinstance(child, LTTextLine):
                record = {"type": "text", "text": child.get_text()}
            elif isinstance(child, LTFigure):
                record = {"type": "image"}

            if record is not None:
                # bbox is indexed as (x1, y1, x2, y2).
                record["bbox"] = {
                    "x1": child.bbox[0],
                    "x2": child.bbox[2],
                    "y1": child.bbox[1],
                    "y2": child.bbox[3],
                }
                extracted_objects.append(record)

            # Descend into the child; non-containers return immediately.
            extract_objects(child, extracted_objects)

In [None]:
# Debugging aid: every raw pdfminer page object processed by extract_pdf is
# appended here so it can be inspected interactively afterwards.
_p = []
def extract_pdf(pdf_file_path):
    """Extract per-page text-line and figure boxes from a PDF file.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        A list with one dict per page, in document order. Each dict has
        ``"bbox"`` (the page ``mediabox`` as ``x1``/``x2``/``y1``/``y2``) and
        ``"contents"`` (the records produced by ``extract_objects``).

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    extracted_page_data = []

    # Open the file named by the argument. The previous code opened the
    # notebook-global ``path`` here and silently ignored ``pdf_file_path``.
    with open(pdf_file_path, "rb") as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams(all_texts=True)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()

            # Keep the raw page around for debugging (see ``_p`` above).
            _p.append(page)

            contents = []
            extract_objects(layout, contents)

            extracted_page_data.append({
                "bbox": {
                    "x1": page.mediabox[0],
                    "x2": page.mediabox[2],
                    "y1": page.mediabox[1],
                    "y2": page.mediabox[3],
                },
                "contents": contents
            })

    return extracted_page_data

In [None]:
def write_text(page):
    """Print the text of each text content of ``page``, then one empty line.

    Contents whose ``'type'`` is not ``'text'`` are skipped.
    """
    texts = (item['text'] for item in page['contents'] if item['type'] == 'text')
    for text in texts:
        print(text)
    print()

In [None]:
# page_no = 0
# plot_page(pages[page_no], pdf_images[page_no])

In [None]:
def calculate_distance_of_two_box(bbox1, bbox2):
    """Return the Euclidean gap between two axis-aligned boxes.

    Each box is a dict with ``x1``/``x2``/``y1``/``y2``; the two values on an
    axis may be given in either order. On each axis the gap is 0 when the
    boxes' intervals overlap or touch, otherwise the distance between the
    nearest edges. The result is ``sqrt(x_gap**2 + y_gap**2)``.
    """
    def axis_gap(a_start, a_end, b_start, b_end):
        # Normalise both intervals so that lo <= hi.
        a_lo, a_hi = min(a_start, a_end), max(a_start, a_end)
        b_lo, b_hi = min(b_start, b_end), max(b_start, b_end)
        if (a_lo <= b_lo <= a_hi) or (b_lo <= a_lo <= b_hi):
            return 0
        return min(abs(a_hi - b_lo), abs(a_lo - b_hi))

    x_d = axis_gap(bbox1['x1'], bbox1['x2'], bbox2['x1'], bbox2['x2'])
    y_d = axis_gap(bbox1['y1'], bbox1['y2'], bbox2['y1'], bbox2['y2'])

    return math.sqrt(x_d**2 + y_d**2)

In [None]:
def make_distance_matrix(contents):
    """Build the symmetric pairwise distance matrix for ``contents``.

    Entry ``[i][j]`` is the box distance between contents ``i`` and ``j`` when
    they share the same ``'type'``. Pairs of different types get the fixed
    large distance 999 so that images and text end up in separate clusters
    (ToDo: find captions). The diagonal stays 0.
    """
    size = len(contents)
    distance_matrix = np.zeros((size, size))

    for row, first in enumerate(contents):
        # Only the upper triangle is computed; each value is mirrored.
        for col in range(row + 1, size):
            second = contents[col]
            if first['type'] == second['type']:
                gap = calculate_distance_of_two_box(first['bbox'], second['bbox'])
            else:
                gap = 999
            distance_matrix[row][col] = gap
            distance_matrix[col][row] = gap

    return distance_matrix

In [None]:
def make_cluster(distance_matrix, threshold):
    """Assign a cluster id to every content described by ``distance_matrix``.

    Contents are scanned in index order. Each content that has no cluster yet
    starts a new cluster, and ``apply_to_cluster`` spreads that id to the
    contents linked to it by distances below ``threshold``.

    Returns:
        A NumPy array of length ``len(distance_matrix)`` holding the cluster
        id of each content. Ids start at 1; 0 means "unassigned".
    """
    size = len(distance_matrix)
    content_cluster_ids = np.zeros(size)
    next_cluster_id = 1

    for content_id in range(size):
        if content_cluster_ids[content_id] != 0:
            # Already claimed by an earlier cluster.
            continue
        # Unassigned content: start a new cluster from it.
        apply_to_cluster(content_id, next_cluster_id, content_cluster_ids, distance_matrix, threshold)
        next_cluster_id += 1

    return content_cluster_ids


def apply_to_cluster(target_content_id, target_cluster_id, content_cluster_ids, distance_matrix, threshold):
    """Grow a cluster depth-first from ``target_content_id``.

    The start content and every still-unassigned content reachable from it
    through links with ``distance < threshold`` receive ``target_cluster_id``.

    The traversal uses an explicit stack instead of recursion, so a long
    chain of neighbouring contents cannot exceed Python's recursion limit.

    Args:
        target_content_id: Index of the content to start from.
        target_cluster_id: Id to assign. 0 is the "unassigned" marker used by
            ``content_cluster_ids`` and should not be passed as an id.
        content_cluster_ids: Per-content cluster ids, 0 meaning unassigned.
            Mutated in place.
        distance_matrix: Square matrix (NumPy array or nested sequences) of
            pairwise distances.
        threshold: Two contents are neighbours when their distance is
            strictly below this value.

    Returns:
        The number of contents assigned by this call, or 0 if the start
        content already belonged to a cluster.
    """
    if content_cluster_ids[target_content_id] != 0:
        # The start content already belongs to a cluster.
        return 0

    # Assign the start content to the cluster.
    content_cluster_ids[target_content_id] = target_cluster_id
    nums = 1

    # ``visited`` guarantees termination even if the caller passes 0 as id.
    visited = {target_content_id}
    pending = [target_content_id]
    while pending:
        current = pending.pop()
        row = np.asarray(distance_matrix[current])
        # Pull every close-enough, still-unassigned content into the cluster.
        for neighbour in np.flatnonzero(row < threshold):
            neighbour = int(neighbour)
            if neighbour in visited or content_cluster_ids[neighbour] != 0:
                continue
            visited.add(neighbour)
            content_cluster_ids[neighbour] = target_cluster_id
            nums += 1
            pending.append(neighbour)

    # Number of contents that joined the cluster through this call.
    return nums

In [None]:
def make_full_text(contents):
    """Concatenate, in order, the ``'text'`` of every text-type content."""
    return "".join(
        item['text'] for item in contents if item['type'] == 'text'
    )
    
def make_entire_bbox(contents):
    """Return the bounding box that encloses the boxes of all ``contents``.

    The result is a dict with ``x1``/``y1`` as the minima and ``x2``/``y2`` as
    the maxima over every corner value. The search starts from the sentinels
    99999 (minima) and -99999 (maxima), which are returned unchanged when
    ``contents`` is empty.
    """
    xs = []
    ys = []
    for item in contents:
        box = item['bbox']
        xs += [box['x1'], box['x2']]
        ys += [box['y1'], box['y2']]

    return {
        "x1": min([99999] + xs),
        "x2": max([-99999] + xs),
        "y1": min([99999] + ys),
        "y2": max([-99999] + ys)
    }

In [None]:
def convert_page_to_cluster(page, threshold=5):
    """Group the contents of ``page`` into clusters of nearby boxes.

    Args:
        page: Page dict with a ``'contents'`` list. Each content dict gets a
            ``'cluster_id'`` key added in place.
        threshold: Contents closer than this distance are linked into the
            same cluster. Defaults to 5, the value that was hard-coded before.

    Returns:
        A dict mapping cluster id (``int``, starting at 1) to a cluster dict
        with ``'contents'`` (member contents in page order), ``'full_text'``
        (their concatenated text), ``'bbox'`` (the enclosing box) and
        ``'type'`` set to ``'cluster'``.
    """
    contents = page['contents']
    distance_matrix = make_distance_matrix(contents)
    cluster_ids = make_cluster(distance_matrix, threshold=threshold)

    clusters = {}
    for content, raw_cluster_id in zip(contents, cluster_ids):
        # make_cluster returns a float array; ids are integer labels, so store
        # plain ints (also keeps the content dicts JSON-serialisable).
        cluster_id = int(raw_cluster_id)
        content['cluster_id'] = cluster_id
        clusters.setdefault(cluster_id, {"contents": []})['contents'].append(content)

    for cluster in clusters.values():
        cluster_contents = cluster['contents']
        cluster['full_text'] = make_full_text(cluster_contents)
        cluster['bbox'] = make_entire_bbox(cluster_contents)
        cluster['type'] = 'cluster'

    return clusters

In [None]:
import random

def generate_random_color():
    """Return a random colour as a ``#`` followed by six uppercase hex digits."""
    hex_digits = '0123456789ABCDEF'
    code = ""
    for _ in range(6):
        code += random.choice(hex_digits)
    return "#" + code


# Palette of 100 random colours, generated once when this cell runs.
colors = [generate_random_color() for _ in range(100)]

In [None]:
def plot_page(page, page_image, plot_all_contents=False):
    """Draw content or cluster boxes of ``page`` on top of its rendered image.

    Args:
        page: Page dict with ``'bbox'`` and either ``'contents'`` or
            ``'clusters'`` (whichever is selected by ``plot_all_contents``).
        page_image: Rendered page image; must expose ``.size`` as
            ``(width, height)`` and be accepted by ``np.flipud``.
        plot_all_contents: When true, draw the individual contents (images in
            green, text in red); otherwise draw the clusters, each in a colour
            taken from the module-level ``colors`` palette.

    Box coordinates are scaled from page units to image pixels. Items whose
    ``'type'`` is not ``'image'``, ``'text'`` or ``'cluster'`` are not drawn.
    """
    image_w, image_h = page_image.size
    pdf_w = page['bbox']['x2'] - page['bbox']['x1']
    pdf_h = page['bbox']['y2'] - page['bbox']['y1']

    w_scale = image_w / pdf_w
    h_scale = image_h / pdf_h

    # Edge colour and line width for the types with a fixed look.
    fixed_styles = {
        'image': {'ec': '#00FF00', 'linewidth': 0.2},
        'text': {'ec': '#FF0000', 'linewidth': 0.1},
    }

    # Plot Page
    plt.figure()
    ax = plt.axes()

    target = "contents" if plot_all_contents else "clusters"

    for content_num, content in enumerate(page[target]):
        bbox = content['bbox']
        content_w = (bbox['x2'] - bbox['x1']) * w_scale
        content_h = (bbox['y2'] - bbox['y1']) * h_scale

        content_type = content['type']
        if content_type == 'cluster':
            # Wrap around the palette so that pages with more clusters than
            # palette entries no longer raise IndexError.
            style = {'ec': colors[content_num % len(colors)], 'linewidth': 0.5}
        elif content_type in fixed_styles:
            style = fixed_styles[content_type]
        else:
            continue

        xy = (bbox['x1'] * w_scale, bbox['y1'] * h_scale)
        ax.add_patch(patches.Rectangle(
            xy=xy, width=content_w, height=content_h,
            fill=False, linestyle='solid', **style
        ))

    # Fix the limits before imshow so the y axis keeps pointing up, as the box
    # coordinates do; the image is flipped vertically to match.
    ax.set_xlim(0, image_w)
    ax.set_ylim(0, image_h)

    ax.set_aspect('equal')

    plt.imshow(np.flipud(page_image))
    plt.show()

In [None]:
# Scratch check: pairwise distance matrix for the contents of page index 2.
# NOTE(review): `pages` is first assigned in a later cell (the one that calls
# extract_pdf), so this cell fails when the notebook is run top to bottom on a
# fresh kernel.
dm = make_distance_matrix(pages[2]['contents'])

In [None]:
# Keep the result: a following cell reads `cluster_ids`, which was never
# assigned because the return value used to be discarded.
cluster_ids = make_cluster(dm, 10)
cluster_ids

In [None]:
# Tag every content of page index 2 with the cluster id at the same position
# in `cluster_ids`.
page_contents = pages[2]['contents']
for idx in range(len(page_contents)):
    page_contents[idx]['cluster_id'] = cluster_ids[idx]

In [None]:
# Peek at the concatenated text of the second cluster of page index 2.
# NOTE(review): the 'clusters' key is only filled in by a later cell, so this
# expression fails on a fresh top-to-bottom run.
pages[2]['clusters'][1]['full_text']

In [None]:
# Run the pipeline on the sample document: extract the layout of every page,
# render the pages to images, then attach the list of clusters to each page.
path = "sample.pdf"
pages = extract_pdf(path)
pdf_images = convert_from_path(path)

for page in pages:
    clusters = convert_page_to_cluster(page)
    page['clusters'] = list(clusters.values())

In [None]:
# Show the clustered view (not the individual contents) of one page.
page_id = 4
plot_page(pages[page_id], pdf_images[page_id], plot_all_contents=False)

In [None]:
# NOTE(review): the unconditional `break` at the top of the body exits on the
# first iteration, so nothing below it ever runs. The loop looks deliberately
# disabled; confirm before re-enabling or deleting it.
for count, page in enumerate(pages):
    
    break
    # Unreachable while the `break` above is present. With it removed, only
    # the first page would be plotted because of the trailing `break`.
    print("Page {}".format(count+1))
    page_image = pdf_images[count]
    plot_page(page, page_image)
    break

In [None]:
# Flatten the page distance matrix into a 1-D array of pairwise distances.
# `distance_matrix` only exists as a local inside functions; the matrix held
# at notebook scope is `dm`.
distances = dm.flatten()

In [None]:
# Hand-entered corner coordinates of one box, and its width and height.
image_x1, image_x2 = 80.047, 527.239
image_y1, image_y2 = 531.945, 735.930

w, h = image_x2 - image_x1, image_y2 - image_y1

In [None]:
def plot_virtual_page(page):
    """Draw the content boxes of ``page`` on an empty canvas in page units.

    Image boxes are drawn with a solid red outline (and their bbox is
    printed); text boxes are drawn with a thin dashed black outline. Other
    content types are not drawn. The axes span ``0..page width`` and
    ``0..page height``; the rendered page image is not shown.
    """
    page_box = page['bbox']
    pdf_w = page_box['x2'] - page_box['x1']
    pdf_h = page_box['y2'] - page_box['y1']

    # Plot Page
    plt.figure()
    ax = plt.axes()

    for item in page['contents']:
        bbox = item['bbox']
        content_w = bbox['x2'] - bbox['x1']
        content_h = bbox['y2'] - bbox['y1']
        kind = item['type']

        if kind == 'image':
            print(bbox)
            outline = patches.Rectangle(
                xy=(bbox['x1'], bbox['y1']), width=content_w, height=content_h,
                ec='#FF0000', fill=False, linestyle='solid', linewidth=0.2
            )
        elif kind == 'text':
            outline = patches.Rectangle(
                xy=(bbox['x1'], bbox['y1']), width=content_w, height=content_h,
                ec='#000000', fill=False, linestyle='dashed', linewidth=0.1
            )
        else:
            continue
        ax.add_patch(outline)

    ax.set_xlim(0, pdf_w)
    ax.set_ylim(0, pdf_h)

    ax.set_aspect('equal')

    plt.show()

In [None]:
# Draw the content boxes of the first extracted page without its image.
plot_virtual_page(pages[0])