In [48]:
import cv2
import numpy as np
import os
import pytesseract

from ditod import add_vit_config

import torch

from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor

In [38]:
# Path of the image to analyse; handed to predict() and cv2.imread() further down.
image = "8.jpg"

# Config file merged into the detectron2 config inside predict().
config = "publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml"

# Flat [KEY, VALUE] override list applied with cfg.merge_from_list() in predict().
opts = [
    'MODEL.WEIGHTS',
    'https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_mrcnn.pth',
]

In [39]:
def predict(image):
    """Run layout detection on one image file and draw the detections.

    The detectron2 config is built from the module-level ``config`` file and
    ``opts`` override list, and inference is forced onto the CPU.

    Args:
        image: Path of the image file, read with ``cv2.imread``.

    Returns:
        A tuple ``(img, result_image, instances)``:
        ``img`` is the array returned by ``cv2.imread``, ``result_image`` is
        the visualisation with the detections drawn on it, and ``instances``
        is the predictor's ``"instances"`` output moved to the CPU.

    Raises:
        OSError: If ``cv2.imread`` cannot load ``image``.
    """
    # Read the image first: cv2.imread signals failure by returning None, and
    # failing here is clearer (and cheaper) than failing inside the predictor
    # after the model has been built.
    img = cv2.imread(image)
    if img is None:
        raise OSError(f"Could not read image file: {image!r}")

    # Step 1: instantiate config
    cfg = get_cfg()
    add_vit_config(cfg)
    cfg.merge_from_file(config)

    # Step 2: add model weights URL to config
    cfg.merge_from_list(opts)

    # Step 3: set device
    device = "cpu"
    cfg.MODEL.DEVICE = device

    # Step 4: define model
    predictor = DefaultPredictor(cfg)

    # Register the class names the visualizer uses to label detections.
    dataset_name = cfg.DATASETS.TEST[0]
    md = MetadataCatalog.get(dataset_name)
    if dataset_name == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text","title","list","table","figure"])

    # Step 5: run inference; move the result to the CPU once and reuse it.
    instances = predictor(img)["instances"].to("cpu")

    # The channel axis is reversed for the visualizer and reversed back
    # afterwards so the result has the same channel order as ``img``.
    v = Visualizer(img[:, :, ::-1],
                md,
                scale=1.0,
                instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(instances)
    result_image = result.get_image()[:, :, ::-1]

    return img, result_image, instances

In [40]:
# Run detection once; keeps the loaded image, the annotated visualisation and
# the predicted instances for the cells below.
img, result_img, output = predict(image)

In [41]:
# Save the annotated visualisation; as the cell's last expression, the value
# returned by cv2.imwrite is shown as the cell output.
cv2.imwrite("out.jpg", result_img)

True

In [42]:
# Dump the raw predictions (boxes, scores, classes, masks) for inspection.
print(output)

Instances(num_instances=5, image_height=3508, image_width=2481, fields=[pred_boxes: Boxes(tensor([[ 376.8383,  771.7432, 2138.3201, 1209.8268],
        [ 293.9623,  625.3800, 2155.1917,  742.9721],
        [ 513.3419,  484.6616, 1727.9160,  560.6929],
        [ 281.0919,  703.2103, 1778.7739,  748.9332],
        [ 332.7248,  640.6341, 2148.9641,  686.6320]])), scores: tensor([0.9956, 0.9801, 0.9784, 0.3717, 0.2164]), pred_classes: tensor([2, 0, 1, 0, 0]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., Fal

In [59]:
# Convert to CPU once instead of once per attribute access.
output_cpu = output.to("cpu")

# Each field is optional; fall back to None when the predictor did not emit it.
boxes = output_cpu.pred_boxes if output_cpu.has("pred_boxes") else None
scores = output_cpu.scores if output_cpu.has("scores") else None
# ``classes`` is a plain Python list, i.e. a snapshot taken at this point:
# later in-place changes to ``output`` are not reflected in it.
classes = output_cpu.pred_classes.tolist() if output_cpu.has("pred_classes") else None
# Class-id -> name lookup; same order as the thing_classes list set in predict().
class_list = ["text","title","list","table","figure"]

print(boxes)
print(scores)
print(classes)

Boxes(tensor([[ 376.8383,  771.7432, 2138.3201, 1209.8268],
        [ 293.9623,  625.3800, 2155.1917,  742.9721],
        [ 513.3419,  484.6616, 1727.9160,  560.6929]]))
tensor([0.9956, 0.9801, 0.9784])
[2, 0, 1]


In [43]:
def sort_index(instance):
    """Return the detection indices ordered by the y1 coordinate of each box.

    Args:
        instance: Object exposing ``pred_boxes.tensor``, a 2-D tensor whose
            column 1 is used as the sort key (the y1 value of each box).

    Returns:
        list[int]: Row indices of ``pred_boxes.tensor`` in ascending order of
        column 1. Ties keep their original relative order.
    """
    # One tolist() call replaces a per-element .numpy() conversion, which
    # raises for tensors that require grad or live on a non-CPU device.
    top_edges = instance.pred_boxes.tensor[:, 1].tolist()
    return sorted(range(len(top_edges)), key=top_edges.__getitem__)


In [44]:
def sort_tensor(tensor, sort_mask):
    """Return ``tensor`` indexed by ``sort_mask`` (``tensor[sort_mask]``).

    Args:
        tensor: Indexable container holding one entry per detection.
        sort_mask: Index (here: a list of positions) giving the new order.
    """
    reordered = tensor[sort_mask]
    return reordered

In [45]:
def sort(instance):
    """Reorder every prediction field of ``instance`` in place.

    The order comes from ``sort_index``; the same permutation is applied to
    ``pred_boxes.tensor``, ``scores``, ``pred_classes`` and ``pred_masks``.

    Returns:
        The same ``instance`` object, after mutation.
    """
    order = sort_index(instance)

    # The boxes are a wrapper object: reorder the tensor it holds.
    detected_boxes = instance.pred_boxes
    detected_boxes.tensor = sort_tensor(detected_boxes.tensor, order)

    # The remaining fields are reassigned directly on the instance.
    for field in ("scores", "pred_classes", "pred_masks"):
        setattr(instance, field, sort_tensor(getattr(instance, field), order))

    return instance

In [46]:
def get_remove_mask(instance, conf):
    """List the positions of the detections scoring at least ``conf``.

    Despite the name, the returned indices are the ones to keep, not the ones
    to remove.

    Args:
        instance: Object exposing an iterable ``scores`` attribute.
        conf: Inclusive lower bound on the score.

    Returns:
        list[int]: Indices, in original order, whose score is ``>= conf``.
    """
    return [position for position, value in enumerate(instance.scores) if value >= conf]

def filter_tensor(tensor, mask):
    """Return the entries of ``tensor`` selected by ``mask`` (``tensor[mask]``).

    Args:
        tensor: Indexable container holding one entry per detection.
        mask: Index (here: a list of positions to keep).
    """
    selected = tensor[mask]
    return selected

def remove_box_lower_than(instance, conf):
    """Drop, in place, every detection whose score is below ``conf``.

    The indices to keep come from ``get_remove_mask``; the same selection is
    applied to ``pred_boxes.tensor``, ``scores``, ``pred_classes`` and
    ``pred_masks``.

    Returns:
        The same ``instance`` object, after mutation.
    """
    keep = get_remove_mask(instance, conf)

    # Filter the boxes first, exactly as the fields were ordered originally.
    # NOTE(review): the instance container may validate field lengths on
    # assignment, so this ordering is presumably significant - confirm before
    # reordering.
    detected_boxes = instance.pred_boxes
    detected_boxes.tensor = filter_tensor(detected_boxes.tensor, keep)

    # Then filter the per-detection scores, class ids and masks.
    for field in ("scores", "pred_classes", "pred_masks"):
        setattr(instance, field, filter_tensor(getattr(instance, field), keep))

    return instance


In [66]:
def add_padding(input_img, padding, color):
    """Surround an image with a uniform border of ``padding`` pixels.

    Args:
        input_img: Array of shape ``(height, width, channels)``.
        padding: Border width in pixels, added on all four sides.
        color: Fill value for the border, broadcast over the channel axis.

    Returns:
        A new ``uint8`` array of shape
        ``(height + 2 * padding, width + 2 * padding, channels)`` with
        ``input_img`` copied into its centre.
    """
    src_h, src_w, n_channels = input_img.shape

    # Canvas of the enlarged size, pre-filled with the border colour.
    canvas_h = src_h + 2 * padding
    canvas_w = src_w + 2 * padding
    canvas = np.full((canvas_h, canvas_w, n_channels), color, dtype=np.uint8)

    # Top-left corner at which the source image is pasted.
    top = (canvas_h - src_h) // 2
    left = (canvas_w - src_w) // 2

    canvas[top:top + src_h, left:left + src_w] = input_img
    return canvas

def ocr(img_cv, idx):
    """Run OCR on an image crop and print its class name with the text.

    Reads the module-level ``classes`` (class id per detection) and
    ``class_list`` (class id -> name) to label the crop.

    Args:
        img_cv: Image array; converted with ``cv2.COLOR_BGR2RGB`` before OCR.
        idx: Position of the detection in ``classes``.
    """
    rgb_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
    label = class_list[classes[idx]]
    recognised_text = pytesseract.image_to_string(rgb_image)
    print({"class": label, "text": recognised_text})

def crop_and_save_image(idx, input_img, bbox, filename, padding=25, color=(255,255,255)):
    """Crop one detection out of the image, pad it, OCR it and save it.

    Args:
        idx: Position of the detection; forwarded to ``ocr``.
        input_img: Source image array of shape ``(height, width, channels)``.
        bbox: Indexable box ``(x1, y1, x2, y2)`` in pixel coordinates.
        filename: Destination path for the padded crop.
        padding: Border width in pixels added around the crop.
        color: Fill colour of the border.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written.
    """
    height, width = input_img.shape[:2]

    # Clamp to the image bounds: a negative coordinate would otherwise be
    # treated as a from-the-end index and select the wrong region.
    x1 = min(max(int(bbox[0]), 0), width)
    x2 = min(max(int(bbox[2]), 0), width)
    y1 = min(max(int(bbox[1]), 0), height)
    y2 = min(max(int(bbox[3]), 0), height)

    cropped_image = input_img[y1:y2, x1:x2]
    padded_image = add_padding(cropped_image, padding, color)
    ocr(padded_image, idx)

    # cv2.imwrite returns False instead of raising when nothing was written.
    if not cv2.imwrite(filename, padded_image):
        raise OSError(f"Could not write image file: {filename!r}")

out_folder = "out"
# exist_ok tolerates a pre-existing folder without hiding other failures
# (permissions, a file in the way) the way a bare ``except: pass`` does.
os.makedirs(out_folder, exist_ok=True)

img = cv2.imread(image)

# Drops low-confidence detections from ``output`` in place.
remove_box_lower_than(output, 0.4)

# Re-read boxes and class ids from the filtered ``output`` so the loop (and
# ocr(), which looks up the global ``classes``) does not depend on values
# captured by an earlier cell before filtering.
boxes = output.pred_boxes
classes = output.pred_classes.tolist()

for idx, box in enumerate(boxes):
    crop_and_save_image(idx, img, box, os.path.join(out_folder, str(idx) + ".jpg"), 25)

{'class': 'list', 'text': '10 out of 21 participants (48%) who took the low BI 655064 dose had unwanted\neffects.\n\n4 out of 20 participants (20%) who took the medium BI 655064 dose had unwanted\neffects.\n\n23 out of 40 participants (58%) who took the high BI 655064 dose had unwanted\neffects.\n\n21 out of 40 participants (53%) who took placebo had unwanted effects.\n'}
{'class': 'text', 'text': 'Yes, participants in all groups had unwanted effects. Unwanted effects are health problems\nthat the doctors think were caused by BI 655064 or placebo. In this study:\n'}
{'class': 'title', 'text': 'Did participants have any unwanted effects?\n'}
