In [1]:
import cv2

from ditod import add_vit_config

import torch

from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Detectron2 YAML config file that predict() merges into the base config.
config = "publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml"

# Flat [KEY, VALUE] override list passed to cfg.merge_from_list();
# here it points MODEL.WEIGHTS at the fine-tuned checkpoint URL.
opts = [
    "MODEL.WEIGHTS",
    "https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_mrcnn.pth",
]

# Sample image path. NOTE(review): the cells visible here pass a path to
# predict() explicitly, so this value looks unused — confirm before removing.
image = "img.png"

In [150]:
def predict(image, device="cpu"):
    """Run layout detection on a single image file and render the result.

    The model configuration comes from the module-level ``config`` (YAML
    path) and ``opts`` (override list); a fresh ``DefaultPredictor`` is built
    on every call.

    Args:
        image: Path of the image file to load with ``cv2.imread``.
        device: Value assigned to ``cfg.MODEL.DEVICE``. Defaults to ``"cpu"``,
            which was previously hard-coded.

    Returns:
        A 3-tuple ``(img, result_image, output)``:
        ``img`` is the array returned by ``cv2.imread``;
        ``result_image`` is the visualizer rendering with its channel axis
        reversed again so it matches ``img``'s channel order;
        ``output`` is the predictor's ``"instances"`` result moved to CPU.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``image``.
    """
    # Read the input first: cv2.imread signals failure by returning None
    # instead of raising, and checking here fails fast before the (slow)
    # model construction below.
    img = cv2.imread(image)
    if img is None:
        raise FileNotFoundError(
            f"cv2.imread could not read an image from {image!r}"
        )

    # Step 1: instantiate config
    cfg = get_cfg()
    add_vit_config(cfg)
    cfg.merge_from_file(config)

    # Step 2: add model weights URL to config
    cfg.merge_from_list(opts)

    # Step 3: set device
    cfg.MODEL.DEVICE = device

    # Step 4: define model
    predictor = DefaultPredictor(cfg)

    # Register the class names the visualizer uses to label predictions.
    dataset_name = cfg.DATASETS.TEST[0]
    md = MetadataCatalog.get(dataset_name)
    if dataset_name == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text","title","list","table","figure"])

    # Step 5: run inference; move the result to CPU once and reuse it for
    # both drawing and the return value.
    output = predictor(img)["instances"].to("cpu")

    # Reverse the channel axis for the visualizer (cv2 loads BGR, the
    # visualizer draws on RGB), then reverse it back on the rendered image.
    v = Visualizer(img[:, :, ::-1],
                md,
                scale=1.0,
                instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(output)
    result_image = result.get_image()[:, :, ::-1]

    return img, result_image, output

In [151]:
# Run detection on the sample page; predict() returns the loaded image,
# the annotated rendering, and the predicted instances on CPU.
img, result_img, output = predict("test.jpeg")

  "See the documentation of nn.Upsample for details.".format(mode)


In [14]:
# Write the annotated rendering to disk; the call's return value is the
# cell output shown below.
cv2.imwrite("out.jpg", result_img)

True

In [152]:
# Inspect the raw predictions (boxes, scores, classes, masks) in model order.
print(output)

Instances(num_instances=12, image_height=792, image_width=601, fields=[pred_boxes: Boxes(tensor([[308.5944, 387.7310, 548.3193, 549.9556],
        [ 50.5924, 636.3325, 290.5307, 742.6828],
        [ 50.7837, 488.1688, 290.5877, 639.7428],
        [308.5706, 545.3228, 548.6179, 743.0535],
        [ 50.4487, 339.1625, 290.8231, 444.3589],
        [308.0231, 316.4781, 549.6226, 353.0624],
        [ 50.5810, 316.2806, 291.1252, 341.4015],
        [ 50.5080,  71.7657, 549.5928,  94.8253],
        [ 51.0327, 442.5561, 290.5040, 490.1516],
        [308.2527, 367.4326, 379.5883, 380.3791],
        [ 51.0582, 101.4618, 549.3663, 278.4562],
        [ 51.7424, 279.6494, 133.8992, 289.6211]])), scores: tensor([1.0000, 0.9999, 0.9999, 0.9999, 0.9998, 0.9997, 0.9997, 0.9995, 0.9994,
        0.9993, 0.9990, 0.9979]), pred_classes: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, Fals

In [161]:
def sort_index(instance):
    """Return box indices ordered by ascending value of box column 1.

    Args:
        instance: Object exposing ``pred_boxes.tensor``, a 2-D tensor with
            one row per box. Column 1 is the sort key — presumably the top
            edge (y1) of an XYXY box; confirm against the box format in use.

    Returns:
        A list of row indices (a permutation of ``range(N)``) that orders
        the boxes by that column. Ties keep their original relative order
        because ``sorted`` is stable. An empty tensor yields ``[]``.
    """
    bbox = instance.pred_boxes.tensor
    # Extract the key column once as plain Python floats. Unlike calling
    # .numpy() on each element, this also works for tensors that require
    # grad or live on a non-CPU device, and avoids per-key tensor indexing.
    keys = bbox[:, 1].detach().cpu().tolist()
    return sorted(range(len(keys)), key=keys.__getitem__)


In [181]:
def sort_tensor(tensor, sort_mask):
    """Reorder ``tensor`` along its first axis by indexing with ``sort_mask``.

    Args:
        tensor: Indexable container; indexed as ``tensor[sort_mask]``.
        sort_mask: Index object applied to ``tensor`` (the notebook passes
            the index list produced by ``sort_index``).

    Returns:
        The result of ``tensor[sort_mask]``.
    """
    reordered = tensor[sort_mask]
    return reordered

In [189]:
def sort(instance):
    """Reorder an instance's prediction fields in place using ``sort_index``.

    The ordering is computed once by ``sort_index(instance)`` and applied via
    ``sort_tensor`` to ``pred_boxes.tensor``, ``scores``, ``pred_classes``
    and ``pred_masks``, in that order.

    Args:
        instance: Object exposing the four fields listed above.

    Returns:
        The same ``instance`` object, after its fields have been reassigned.
    """
    order = sort_index(instance)

    # Boxes keep their wrapper object; only the wrapped tensor is replaced.
    boxes = instance.pred_boxes
    boxes.tensor = sort_tensor(boxes.tensor, order)

    # The remaining fields are reassigned directly on the instance.
    for field in ("scores", "pred_classes", "pred_masks"):
        setattr(instance, field, sort_tensor(getattr(instance, field), order))

    return instance

In [190]:
# Show the predictions after reordering. Note that sort() reassigns the
# fields of `output` itself and returns that same object, so `output`
# stays reordered for any later cell.
print(sort(output))

Instances(num_instances=12, image_height=792, image_width=601, fields=[pred_boxes: Boxes(tensor([[ 50.5080,  71.7657, 549.5928,  94.8253],
        [ 51.0582, 101.4618, 549.3663, 278.4562],
        [ 51.7424, 279.6494, 133.8992, 289.6211],
        [ 50.5810, 316.2806, 291.1252, 341.4015],
        [308.0231, 316.4781, 549.6226, 353.0624],
        [ 50.4487, 339.1625, 290.8231, 444.3589],
        [308.2527, 367.4326, 379.5883, 380.3791],
        [308.5944, 387.7310, 548.3193, 549.9556],
        [ 51.0327, 442.5561, 290.5040, 490.1516],
        [ 50.7837, 488.1688, 290.5877, 639.7428],
        [308.5706, 545.3228, 548.6179, 743.0535],
        [ 50.5924, 636.3325, 290.5307, 742.6828]])), scores: tensor([0.9995, 0.9990, 0.9979, 0.9997, 0.9997, 0.9998, 0.9993, 1.0000, 0.9994,
        0.9999, 0.9999, 0.9999]), pred_classes: tensor([0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, Fals