In [32]:
import cv2

from ditod import add_vit_config

import torch

from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor

In [33]:
# Path of the YAML config file that predict() passes to cfg.merge_from_file().
config = "publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml"
# Flat [key, value] override list that predict() passes to cfg.merge_from_list();
# here it sets MODEL.WEIGHTS to the checkpoint URL.
opts = ['MODEL.WEIGHTS', 'https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_mrcnn.pth']
# NOTE(review): `image` is not referenced by any cell visible in this notebook;
# the predict() call further down passes "5.jpg" instead — confirm which is intended.
image = "img.png"

In [44]:
# Placeholder for the dataset metadata; overwritten by the result of predict().
global_md = None
def predict(image, device="cpu"):
    """Run the configured layout-detection model on one image file.

    Args:
        image: Path of the image file to read with ``cv2.imread``.
        device: Value assigned to ``cfg.MODEL.DEVICE`` (e.g. ``"cpu"`` or
            ``"cuda"``). Defaults to ``"cpu"``, the previously hard-coded value.

    Returns:
        A tuple ``(img, result_image, output, md)``:
        ``img`` is the array returned by ``cv2.imread``; ``result_image`` is
        the visualization with predictions drawn on it (channel axis reversed
        back to match ``img``); ``output`` is the predicted ``instances``
        moved to CPU; ``md`` is the metadata object used for drawing.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``image`` (it
            returns ``None`` for missing or undecodable files).

    Note:
        The model is rebuilt (and weights resolved) on every call, which is
        expensive; build once and reuse if calling this in a loop.
    """
    # Read the image first so a bad path fails fast, before the costly model
    # construction, and with a clear message instead of a crash on None.
    img = cv2.imread(image)
    if img is None:
        raise FileNotFoundError(f"cv2.imread could not read an image from {image!r}")

    # Step 1: instantiate config
    cfg = get_cfg()
    add_vit_config(cfg)
    cfg.merge_from_file(config)

    # Step 2: add model weights URL to config
    cfg.merge_from_list(opts)

    # Step 3: set device
    cfg.MODEL.DEVICE = device

    # Step 4: define model
    predictor = DefaultPredictor(cfg)

    # Class names used by the visualizer; they depend on the test dataset name.
    dataset_name = cfg.DATASETS.TEST[0]
    md = MetadataCatalog.get(dataset_name)
    if dataset_name == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text","title","list","table","figure"])

    # Step 5: run inference; move the result to CPU once and reuse it for
    # both drawing and the return value.
    output = predictor(img)["instances"].to("cpu")

    # The channel axis is reversed going in and coming out (OpenCV loads BGR,
    # the Visualizer works in RGB).
    v = Visualizer(img[:, :, ::-1],
                md,
                scale=1.0,
                instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(output)
    result_image = result.get_image()[:, :, ::-1]

    return img, result_image, output, md

In [45]:
# Run detection on "5.jpg" and keep the returned metadata in global_md.
# NOTE(review): the `image` variable defined in the config cell ("img.png")
# is not used here — confirm which input file is intended.
img, result_img, output, global_md = predict("5.jpg")

In [46]:
# Write the annotated image to disk; the return value of cv2.imwrite is
# displayed as this cell's output.
cv2.imwrite("out.jpg", result_img)

True

In [47]:
# Dump the raw predictions (boxes, scores, classes, masks) for inspection.
print(output)

Instances(num_instances=8, image_height=3509, image_width=2481, fields=[pred_boxes: Boxes(tensor([[ 286.8920, 2091.7739, 2193.4136, 2320.2800],
        [ 286.3167, 1674.0605, 2136.2017, 1845.9060],
        [ 286.9091, 1329.9917, 2180.7046, 1441.1460],
        [ 287.8982, 1911.1667, 2205.5776, 2023.9351],
        [ 281.9621, 1505.9709, 2143.1118, 1609.3289],
        [ 284.6117, 1010.8362, 1459.6604, 1084.1025],
        [ 288.6501, 1149.3213, 1936.5421, 1263.5762],
        [ 282.0299, 1153.4529, 1937.6528, 1201.0176]])), scores: tensor([0.9989, 0.9982, 0.9971, 0.9968, 0.9949, 0.9893, 0.9619, 0.0613]), pred_classes: tensor([0, 0, 0, 0, 0, 1, 0, 0]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, 

In [48]:
def sort_index(instance):
    """Return detection indices ordered by ascending box column 1 (the y0 edge).

    Args:
        instance: Object exposing ``pred_boxes.tensor``, an (N, 4) tensor of
            boxes.

    Returns:
        A list of ``N`` ints: positions into the original detections, sorted
        by the second box coordinate. Ties keep their original relative order
        (``sorted`` is stable). Empty input yields ``[]``.
    """
    # One .tolist() call copies the whole column to Python floats. Unlike a
    # per-element .numpy(), it also works for CUDA or grad-tracking tensors.
    top_edges = instance.pred_boxes.tensor[:, 1].tolist()
    return sorted(range(len(top_edges)), key=top_edges.__getitem__)


In [49]:
def sort_tensor(tensor, sort_mask):
    """Index ``tensor`` with ``sort_mask`` and return the result.

    When ``sort_mask`` is a list of integer positions (as produced by
    ``sort_index``), the entries along the first dimension are returned in
    that order.
    """
    reordered = tensor[sort_mask]
    return reordered

In [50]:
def sort(instance):
    """Reorder the detections in ``instance`` by ascending box y0, in place.

    The order comes from ``sort_index``; ``pred_boxes.tensor`` and each of
    ``scores``, ``pred_classes`` and ``pred_masks`` that is present are
    re-indexed with it.

    Args:
        instance: Predictions object with a ``pred_boxes`` field and any of
            the per-detection fields listed above.

    Returns:
        The same ``instance`` object, mutated. Callers holding a reference to
        it observe the new order.
    """
    sort_mask = sort_index(instance)
    # Boxes are reordered by replacing the tensor on the existing boxes object.
    instance.pred_boxes.tensor = sort_tensor(instance.pred_boxes.tensor, sort_mask)
    # Optional fields: models without a mask head have no pred_masks, so only
    # touch the fields this instance actually carries.
    for field in ("scores", "pred_classes", "pred_masks"):
        if hasattr(instance, field):
            setattr(instance, field, sort_tensor(getattr(instance, field), sort_mask))
    return instance

In [51]:
# sort() reorders `output` in place and returns it, so cells that run after
# this one see the detections in sorted order.
print(sort(output))

Instances(num_instances=8, image_height=3509, image_width=2481, fields=[pred_boxes: Boxes(tensor([[ 284.6117, 1010.8362, 1459.6604, 1084.1025],
        [ 288.6501, 1149.3213, 1936.5421, 1263.5762],
        [ 282.0299, 1153.4529, 1937.6528, 1201.0176],
        [ 286.9091, 1329.9917, 2180.7046, 1441.1460],
        [ 281.9621, 1505.9709, 2143.1118, 1609.3289],
        [ 286.3167, 1674.0605, 2136.2017, 1845.9060],
        [ 287.8982, 1911.1667, 2205.5776, 2023.9351],
        [ 286.8920, 2091.7739, 2193.4136, 2320.2800]])), scores: tensor([0.9893, 0.9619, 0.0613, 0.9971, 0.9949, 0.9982, 0.9968, 0.9989]), pred_classes: tensor([1, 0, 0, 0, 0, 0, 0, 0]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, 

In [52]:
# Show the first detection of `output` (already reordered by the sort cell).
# NOTE(review): predict() already returns `output` moved to CPU, so this
# .to("cpu") is presumably redundant — confirm before removing.
output[0].to("cpu")

Instances(num_instances=1, image_height=3509, image_width=2481, fields=[pred_boxes: Boxes(tensor([[ 284.6117, 1010.8362, 1459.6604, 1084.1025]])), scores: tensor([0.9893]), pred_classes: tensor([1]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]]])])

In [74]:
from detectron2.structures import Boxes, RotatedBoxes
import numpy as np

def _convert_boxes(boxes):
    """
    Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.

    Args:
        boxes: A ``Boxes`` / ``RotatedBoxes`` object, or anything accepted by
            ``np.asarray``.

    Returns:
        A numpy array with the box coordinates.
    """
    if isinstance(boxes, (Boxes, RotatedBoxes)):
        # .cpu() is a no-op for CPU tensors and makes .numpy() safe when the
        # boxes live on another device.
        return boxes.tensor.detach().cpu().numpy()
    return np.asarray(boxes)

In [78]:
# Slice the first detection once and reuse it, rather than re-slicing and
# re-copying `output` for every field check and access.
first_detection = output[0].to("cpu")
# Each value falls back to None when the corresponding field is absent.
boxes = first_detection.pred_boxes if first_detection.has("pred_boxes") else None
scores = first_detection.scores if first_detection.has("scores") else None
classes = first_detection.pred_classes.tolist() if first_detection.has("pred_classes") else None

In [79]:
# Unpack the first row of the converted box array into its four coordinates.
# NOTE(review): if `boxes` is None (no pred_boxes field), _convert_boxes falls
# through to np.asarray(None) and the [0] index will raise — guard if needed.
x0, y0, x1, y1 = _convert_boxes(boxes)[0]