In [1]:
import cv2
import numpy as np
import os
import pytesseract

from ditod import add_vit_config

import torch

from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Detectron2 config file merged into the base config by predict()
# (a DiT-base Mask R-CNN setup, judging by the path).
config = 'publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml'

# Flat [key, value, ...] override list handed to cfg.merge_from_list();
# here it points MODEL.WEIGHTS at the checkpoint URL.
opts = [
    'MODEL.WEIGHTS',
    'https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_mrcnn.pth',
]

# Path of the input image passed to predict().
image = '6.jpg'

In [3]:
def predict(image, scale=10, device="cpu"):
    """Run layout detection on an image file and draw the predictions.

    The image is read with OpenCV, enlarged by ``scale`` in both dimensions,
    passed through a ``DefaultPredictor`` built from the module-level
    ``config`` / ``opts`` settings, and the predicted instances are rendered
    on top of the enlarged image.

    Note that a new predictor is constructed on every call.

    Args:
        image: Path of the image file to analyse.
        scale: Factor applied to the width and height before inference.
            Defaults to 10, the value that used to be hard-coded.
        device: Value assigned to ``cfg.MODEL.DEVICE``. Defaults to "cpu".

    Returns:
        A tuple ``(img, result_image, instances)``: the enlarged image, the
        visualisation image (channel order flipped back after drawing), and
        the predicted instances moved to the CPU.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image``.
    """
    # Read the input first so a bad path fails fast, before the model is
    # built. cv2.imread signals failure by returning None, not by raising.
    page = cv2.imread(image)
    if page is None:
        raise FileNotFoundError(f"Could not read image file: {image!r}")

    # Step 1: instantiate config
    cfg = get_cfg()
    add_vit_config(cfg)
    cfg.merge_from_file(config)

    # Step 2: add model weights URL to config
    cfg.merge_from_list(opts)

    # Step 3: set device
    cfg.MODEL.DEVICE = device

    # Step 4: define model
    predictor = DefaultPredictor(cfg)

    # Step 5: scale up the page; cv2.resize needs integer (width, height)
    height, width, _ = page.shape
    img = cv2.resize(page, (int(round(width * scale)), int(round(height * scale))))

    # Step 6: register the class names used when drawing labels
    md = MetadataCatalog.get(cfg.DATASETS.TEST[0])
    if cfg.DATASETS.TEST[0] == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text", "title", "list", "table", "figure"])

    # Step 7: run inference and move the result to the CPU once
    instances = predictor(img)["instances"].to("cpu")

    # The visualizer is given the image with its channel axis reversed, and
    # the drawn result is reversed back before being returned.
    v = Visualizer(img[:, :, ::-1],
                   md,
                   scale=1.0,
                   instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(instances)
    result_image = result.get_image()[:, :, ::-1]

    return img, result_image, instances

In [4]:
# Run layout detection on the configured input image: `img` is the enlarged
# page, `result_img` the annotated visualisation, `output` the predictions.
img, result_img, output = predict(image)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [5]:
# Save the annotated visualisation; the cell output shows imwrite's return value.
cv2.imwrite("out.jpg", result_img)

True

In [6]:
def sort_index(instance):
    """Return box indices ordered by ascending second box coordinate.

    Args:
        instance: Object exposing ``pred_boxes.tensor``, a 2-D tensor with
            one row per box.

    Returns:
        A list of row indices into ``pred_boxes.tensor``, ordered so that the
        value in column 1 of each row is non-decreasing.
    """
    boxes = instance.pred_boxes.tensor
    count = boxes.size()[0]

    def second_coord(row):
        # Column 1 of the row, converted to NumPy for comparison.
        return boxes[row][1].numpy()

    order = list(range(count))
    order.sort(key=second_coord)
    return order

In [7]:
def sort_tensor(tensor, sort_mask):
    """Reorder ``tensor`` along its first axis using the indices in ``sort_mask``.

    Args:
        tensor: Indexable tensor to reorder.
        sort_mask: Index list, e.g. the result of ``sort_index``.

    Returns:
        The result of indexing ``tensor`` with ``sort_mask``.
    """
    reordered = tensor[sort_mask]
    return reordered

In [11]:
def sort(instance):
    """Reorder every per-detection field of ``instance`` by ``sort_index`` order.

    The instance is modified in place and also returned. The fields
    ``pred_boxes``, ``scores``, ``pred_classes`` and ``pred_masks`` must all
    be present.

    Args:
        instance: Prediction object to reorder.

    Returns:
        The same ``instance`` object, with its fields reordered.
    """
    order = sort_index(instance)

    # The box container object is kept; only its underlying tensor is replaced.
    box_container = instance.pred_boxes
    box_container.tensor = sort_tensor(box_container.tensor, order)

    # The remaining fields are reassigned, in this order, with the same indices.
    for field in ("scores", "pred_classes", "pred_masks"):
        setattr(instance, field, sort_tensor(getattr(instance, field), order))

    return instance

def get_remove_mask(instance, conf):
    """Return the indices of detections whose score is at least ``conf``.

    Despite the name, the returned indices are the ones to *keep*.

    Args:
        instance: Object exposing an iterable ``scores`` attribute.
        conf: Minimum score (inclusive) for a detection to be kept.

    Returns:
        A list of integer positions into ``instance.scores``.
    """
    return [
        position
        for position, score in enumerate(instance.scores)
        if score >= conf
    ]

def filter_tensor(tensor, mask):
    """Select the entries of ``tensor`` at the positions listed in ``mask``.

    Args:
        tensor: Indexable tensor to filter.
        mask: Index list, e.g. the result of ``get_remove_mask``.

    Returns:
        The result of indexing ``tensor`` with ``mask``.
    """
    kept = tensor[mask]
    return kept

def remove_box_lower_than(instance, conf):
    """Drop every detection whose score is below ``conf``.

    The instance is modified in place and also returned. The fields
    ``pred_boxes``, ``scores``, ``pred_classes`` and ``pred_masks`` must all
    be present.

    Args:
        instance: Prediction object to filter.
        conf: Minimum score (inclusive) a detection needs to be kept.

    Returns:
        The same ``instance`` object, holding only the kept detections.
    """
    keep = get_remove_mask(instance, conf)

    # Filter the box tensor inside the existing box container first.
    box_container = instance.pred_boxes
    box_container.tensor = filter_tensor(box_container.tensor, keep)

    # Then filter scores, class ids and masks with the same indices.
    for field in ("scores", "pred_classes", "pred_masks"):
        setattr(instance, field, filter_tensor(getattr(instance, field), keep))

    return instance


def get_output_details(output):
    """Extract boxes, scores and class ids from a prediction object.

    Args:
        output: Prediction object offering ``to(device)``, ``has(name)`` and
            the ``pred_boxes`` / ``scores`` / ``pred_classes`` fields.

    Returns:
        A tuple ``(boxes, scores, classes, class_list)``. ``boxes`` and
        ``scores`` are the stored fields, ``classes`` is ``pred_classes``
        converted with ``tolist()``, and each of the three is ``None`` when
        the field is absent. ``class_list`` is a fixed list of five label
        names.
    """
    # Convert to CPU once and reuse it, instead of once per lookup.
    cpu_output = output.to("cpu")

    boxes = cpu_output.pred_boxes if cpu_output.has("pred_boxes") else None
    scores = cpu_output.scores if cpu_output.has("scores") else None
    classes = cpu_output.pred_classes.tolist() if cpu_output.has("pred_classes") else None
    class_list = ["text","title","list","table","figure"]

    return boxes, scores, classes, class_list

def add_padding(input_img, padding, color):
    """Surround a 3-D image array with a solid border of ``padding`` pixels.

    Args:
        input_img: Array of shape ``(height, width, channels)``.
        padding: Border thickness added on each of the four sides.
        color: Fill value for the border, broadcast over the channel axis.

    Returns:
        A new ``uint8`` array of shape
        ``(height + 2 * padding, width + 2 * padding, channels)`` with
        ``input_img`` copied into the middle.
    """
    rows, cols, channels = input_img.shape

    # Start from a canvas filled entirely with the border colour.
    canvas = np.full(
        (rows + 2 * padding, cols + 2 * padding, channels),
        color,
        dtype=np.uint8,
    )

    # The border is equally thick on every side, so the source image starts
    # `padding` pixels in from the top-left corner.
    canvas[padding:padding + rows, padding:padding + cols] = input_img
    return canvas

def ocr(img_cv, idx):
    """Run Tesseract on an image crop and label it with its detected class.

    Relies on the module-level ``classes`` and ``class_list`` variables,
    which must be defined before this function is called.

    Args:
        img_cv: Image array, converted with ``cv2.COLOR_BGR2RGB`` before OCR.
        idx: Position of this crop's detection in ``classes``.

    Returns:
        A dict with the keys ``"class"`` (label name) and ``"text"``
        (string returned by ``pytesseract.image_to_string``).
    """
    rgb_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
    label = class_list[classes[idx]]
    recognised = pytesseract.image_to_string(rgb_image)
    return {"class": label, "text": recognised}

def crop_and_save_image(idx, input_img, bbox, filename, padding=25, color=(255,255,255)):
    """Crop one box out of an image, pad it, save it and OCR it.

    Args:
        idx: Index of the detection, forwarded to ``ocr``.
        input_img: Image array indexed as ``[row, column, ...]``.
        bbox: Four values read as ``(x1, y1, x2, y2)``; each is truncated
            with ``int``.
        filename: Destination path for the padded crop.
        padding: Border thickness passed to ``add_padding``.
        color: Border colour passed to ``add_padding``.

    Returns:
        The dict returned by ``ocr`` for the padded crop.
    """
    # Clamp at 0: NumPy reads a negative slice bound as "count from the end",
    # which would silently yield the wrong region. Values beyond the image
    # size need no clamping because slicing already stops at the array edge.
    x1 = max(int(bbox[0]), 0)
    y1 = max(int(bbox[1]), 0)
    x2 = max(int(bbox[2]), 0)
    y2 = max(int(bbox[3]), 0)

    cropped_image = input_img[y1:y2, x1:x2]
    padded_image = add_padding(cropped_image, padding, color)

    # NOTE: the return value of imwrite is not checked here.
    cv2.imwrite(filename, padded_image)
    return ocr(padded_image, idx)

def format_output(ocr_output):
    """Group OCR results into sections, each opened by a title item.

    Items are consumed in order. An item whose ``"class"`` is ``"title"``
    opens a new section. Any other item has its text appended to the body of
    the most recently opened section. Items that appear before the first
    title are collected in a section with an empty title.

    Args:
        ocr_output: Iterable of dicts with the keys ``"class"`` and
            ``"text"``.

    Returns:
        A dict mapping ``"header0"``, ``"header1"``, ... to
        ``{"title": str, "body": list of str}``, in order of appearance.
    """
    final = {}
    section_count = 0

    for item in ocr_output:
        if item['class'] == 'title':
            final[f"header{section_count}"] = {"title": item["text"], "body": []}
            section_count += 1
            continue

        if section_count == 0:
            # Text before any title: open the untitled section exactly once
            # and count it. Otherwise the next leading item would recreate
            # it, and the first real title would overwrite it, losing text.
            final["header0"] = {"title": "", "body": []}
            section_count = 1

        final[f"header{section_count - 1}"]["body"].append(item["text"])

    return final

In [12]:
# Order the detections by their box's y1 coordinate, then drop those scoring
# below 0.85. Both helpers modify `output` in place and return it.
output = sort(output)
output = remove_box_lower_than(output, 0.85)
# `classes` and `class_list` are also read as globals by ocr().
boxes, scores, classes, class_list = get_output_details(output)

# Output after sorting
print(boxes)
print(scores)
print(classes)

Boxes(tensor([[ 4715.8921,  4466.5879, 20241.9121,  6092.1831],
        [ 2989.6243,  7110.5308, 12918.1738,  7673.9058],
        [ 2956.3796,  9246.2803, 20137.2852, 10423.3623],
        [ 5198.6533, 12241.8535, 12731.7988, 13013.1387],
        [ 2957.2825, 13713.3086, 21753.9297, 17940.3535],
        [ 5172.8374, 19333.7344, 12917.9180, 20075.8242],
        [ 2946.6902, 20805.5840, 14178.8145, 21357.6465],
        [ 2823.2402, 21662.3164, 21554.4297, 23457.5723],
        [ 2916.7366, 23784.1602, 20674.7402, 24284.0488]]))
tensor([0.9772, 0.9589, 0.9855, 0.9694, 0.9960, 0.9176, 0.9522, 0.9518, 0.9887])
[1, 0, 0, 1, 0, 1, 0, 0, 0]


In [13]:
out_folder = "out"
# exist_ok=True keeps the cell re-runnable when the folder already exists,
# while real failures (e.g. missing permissions) now raise instead of being
# swallowed by a bare except.
os.makedirs(out_folder, exist_ok=True)

# Crop, save and OCR every remaining box, collecting the per-box results
# in detection order.
ocr_output_list = []

for idx, box in enumerate(boxes):
    crop_path = os.path.join(out_folder, f"{idx}.jpg")
    ocr_result = crop_and_save_image(idx, img, box, crop_path, padding=25)
    ocr_output_list.append(ocr_result)

# Group the OCR results into sections and show the result.
final_result = format_output(ocr_output_list)
print(final_result)

{'header0': {'title': 'A study to test whether different doses of BI 655064 help\npeople with active lupus nepnritis\n\n', 'body': ['This is a summary of results from 1 clinical study.\n', 'We thank all study participants. You helped us to answer important questions about\nB| 655064 and the treatment of lupus nephritis.\n']}, 'header1': {'title': 'What was this study about?\n', 'body': ['The purpose of this study was to find out whether a medicine called BI 655064 helps people\nwith lupus nephritis. Lupus nephritis is kidney inflammation caused by the autoimmune\ndisease lupus. The inflammation can be severe, leading to loss of kidney function. New\ntreatments are needed for this condition. Bl 655064 is a medicine that is being developed to\ntreat people with autoimmune disorders. When we develop a new medicine, we need to\nmake sure it works. We wanted to see if different doses of BI 655064 help improve kidney\nfunction in people with lupus nephritis.\n']}, 'header2': {'title': 'Who t