In [78]:
import cv2
import numpy as np
import os
import pytesseract

from ditod import add_vit_config

import torch

from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor

In [79]:
config = "publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml"
opts = ['MODEL.WEIGHTS', 'https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_mrcnn.pth']
image = "5.jpg"

In [80]:
def predict(image):
    # Step 1: instantiate config
    cfg = get_cfg()
    add_vit_config(cfg)
    cfg.merge_from_file(config)

    # Step 2: add model weights URL to config
    cfg.merge_from_list(opts)

    # Step 3: set device
    device = "cpu"
    cfg.MODEL.DEVICE = device

    # Step 4: define model
    predictor = DefaultPredictor(cfg)
    # Step 5: run inference
    img = cv2.imread(image)

    md = MetadataCatalog.get(cfg.DATASETS.TEST[0])
    if cfg.DATASETS.TEST[0]=='icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text","title","list","table","figure"])
        
    output = predictor(img)["instances"]
        
    v = Visualizer(img[:, :, ::-1],
                md,
                scale=1.0,
                instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(output.to("cpu"))
    result_image = result.get_image()[:, :, ::-1]
    
    return img, result_image, output.to("cpu")

In [81]:
img, result_img, output = predict(image)

  "See the documentation of nn.Upsample for details.".format(mode)


In [82]:
cv2.imwrite("out.jpg", result_img)

True

In [83]:
print(output)

Instances(num_instances=8, image_height=3509, image_width=2481, fields=[pred_boxes: Boxes(tensor([[ 286.8920, 2091.7739, 2193.4136, 2320.2800],
        [ 286.3167, 1674.0605, 2136.2017, 1845.9060],
        [ 286.9091, 1329.9917, 2180.7046, 1441.1460],
        [ 287.8982, 1911.1667, 2205.5776, 2023.9351],
        [ 281.9621, 1505.9709, 2143.1118, 1609.3289],
        [ 284.6117, 1010.8362, 1459.6604, 1084.1025],
        [ 288.6501, 1149.3213, 1936.5421, 1263.5762],
        [ 282.0299, 1153.4529, 1937.6528, 1201.0176]])), scores: tensor([0.9989, 0.9982, 0.9971, 0.9968, 0.9949, 0.9893, 0.9619, 0.0613]), pred_classes: tensor([0, 0, 0, 0, 0, 1, 0, 0]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, 

In [84]:
def sort_index(instance):
    bbox = instance.pred_boxes.tensor
    sorting = sorted(range(bbox.size()[0]), key=lambda k: bbox[k][1].numpy())
    return sorting


In [85]:
def sort_tensor(tensor, sort_mask):
    return tensor[sort_mask]

In [86]:
def sort(instance):
    sort_mask = sort_index(instance)
    # sort pred_boxes
    instance.pred_boxes.tensor = sort_tensor(instance.pred_boxes.tensor, sort_mask)
    # sort score
    instance.scores = sort_tensor(instance.scores, sort_mask)
    # sort pred_classes
    instance.pred_classes = sort_tensor(instance.pred_classes, sort_mask)
    # sort pred_masks
    instance.pred_masks = sort_tensor(instance.pred_masks, sort_mask)
    return instance

In [87]:
output = sort(output)

boxes = output.to("cpu").pred_boxes if output.to("cpu").has("pred_boxes") else None
scores = output.to("cpu").scores if output.to("cpu").has("scores") else None
classes = output.to("cpu").pred_classes.tolist() if output.to("cpu").has("pred_classes") else None
class_list = ["text","title","list","table","figure"]

print(boxes)
print(scores)
print(classes)

Boxes(tensor([[ 284.6117, 1010.8362, 1459.6604, 1084.1025],
        [ 288.6501, 1149.3213, 1936.5421, 1263.5762],
        [ 282.0299, 1153.4529, 1937.6528, 1201.0176],
        [ 286.9091, 1329.9917, 2180.7046, 1441.1460],
        [ 281.9621, 1505.9709, 2143.1118, 1609.3289],
        [ 286.3167, 1674.0605, 2136.2017, 1845.9060],
        [ 287.8982, 1911.1667, 2205.5776, 2023.9351],
        [ 286.8920, 2091.7739, 2193.4136, 2320.2800]]))
tensor([0.9893, 0.9619, 0.0613, 0.9971, 0.9949, 0.9982, 0.9968, 0.9989])
[1, 0, 0, 0, 0, 0, 0, 0]


In [88]:
def get_remove_mask(instance, conf):
    scores = instance.scores
    out_mask = []
    for idx,score in enumerate(scores):
        if score >= conf:
            out_mask.append(idx)
    return out_mask

def filter_tensor(tensor, mask):
    return tensor[mask]

def remove_box_lower_than(instance, conf):
    mask = get_remove_mask(instance, conf)
    # sort pred_boxes
    instance.pred_boxes.tensor = filter_tensor(instance.pred_boxes.tensor, mask)
    # sort score
    instance.scores = filter_tensor(instance.scores, mask)
    # sort pred_classes
    instance.pred_classes = filter_tensor(instance.pred_classes, mask)
    # sort pred_masks
    instance.pred_masks = filter_tensor(instance.pred_masks, mask)

    return instance


In [89]:
def add_padding(input_img, padding, color):
    old_image_height, old_image_width, channels = input_img.shape

    # create new image of desired size and color (blue) for padding
    new_image_width = old_image_width + 2 * padding
    new_image_height = old_image_height + 2 * padding
    result = np.full((new_image_height,new_image_width, channels), color, dtype=np.uint8)

    # compute center offset
    x_center = (new_image_width - old_image_width) // 2
    y_center = (new_image_height - old_image_height) // 2

    # copy img image into center of result image
    result[y_center:y_center+old_image_height,
           x_center:x_center+old_image_width] = input_img
    return result

def ocr(img_cv, idx):
    img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
    print({
        "class": class_list[classes[idx]],
        "text": pytesseract.image_to_string(img_rgb)
    })

def crop_and_save_image(idx, input_img, bbox, filename, padding=25, color=(255,255,255)):
    height, width, channels = input_img.shape
    x1 = int(bbox[0])
    x2 = int(bbox[2])
    y1 = int(bbox[1])
    y2 = int(bbox[3])
    cropped_image = input_img[y1:y2, x1:x2]
    padded_image = add_padding(cropped_image, padding, color)
    ocr(padded_image, idx)
    cv2.imwrite(filename, padded_image)

out_folder = "out"
try:
    os.mkdir(out_folder)
except:
    pass

img = cv2.imread(image)

remove_box_lower_than(output, 0.4)
for idx, box in enumerate(boxes):
    crop_and_save_image(idx, img, box, os.path.join(out_folder, str(idx) + ".jpg"), 25)

{'class': 'title', 'text': 'Clinical Study Synopsis for Public Disclosure\n'}
{'class': 'text', 'text': 'This clinical study synopsis is provided in line with Boehringer Ingelheim’s Policy on\nTransparency and Publication of Clinical Study Data.\n'}
{'class': 'text', 'text': 'This clinical study synopsis is provided in line with Boehringer Ingelheim’s Policy on\n'}
{'class': 'text', 'text': 'The synopsis - which is part of the clinical study report - had been prepared in accordance with\nbest practice and applicable legal and regulatory requirements at the time of study completion.\n'}
{'class': 'text', 'text': 'The synopsis may include approved and non-approved uses, doses, formulations, treatment regimens\nand/or age groups; it has not necessarily been submitted to regulatory authorities.\n'}
{'class': 'text', 'text': 'A synopsis is not intended to provide a comprehensive analysis of all data currently available\nregarding a particular drug. More current information regarding a drug 