Convert the PubTabNet annotation format to a COCO-like format for the DETR model.

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

import json
from utils import get_table_grid

def _build_neighbor_map(table_grid):
    """Map each value in a rectangular *table_grid* to the values adjacent to it.

    For every grid position the four neighbors (up, right, down, left) are
    collected, with coordinates clamped at the grid edges. A value that
    occupies several positions accumulates the neighbors of all of them.
    The value itself may appear in its own set; callers remove it.
    """
    neighbor_map = {}
    if not table_grid:
        return neighbor_map

    last_row = len(table_grid) - 1
    last_col = len(table_grid[0]) - 1
    for row_i, row in enumerate(table_grid):
        for col_i, item in enumerate(row):
            neighbor_map.setdefault(item, set()).update((
                table_grid[max(row_i - 1, 0)][col_i],
                table_grid[row_i][min(col_i + 1, last_col)],
                table_grid[min(row_i + 1, last_row)][col_i],
                table_grid[row_i][max(col_i - 1, 0)],
            ))
    return neighbor_map


def filter_and_process_jsonl(input_path, output_path, split):
    """Convert one split of a PubTabNet-style JSONL file to COCO-like records.

    Each non-blank line of *input_path* is parsed as JSON. Records whose
    ``"split"`` field equals *split* are converted and written to
    *output_path*, one JSON object per line, with the keys ``filename``,
    ``image_id``, ``html`` (the structure tokens) and ``annotations``.

    Every cell that has a non-empty bounding box yields one annotation:
    ``bbox`` as ``[xmin, ymin, width, height]``, ``area`` as width * height,
    and ``category_id`` as the number of distinct other grid values adjacent
    to the cell in the grid returned by ``get_table_grid``.

    Records whose grid rows do not all have the same length are skipped.
    The largest ``category_id`` seen is printed at the end.

    Args:
        input_path: Path of the source JSONL file (read as UTF-8).
        output_path: Path of the JSONL file to create (written as UTF-8).
        split: Value the record's ``"split"`` field must equal to be kept.
    """
    with open(input_path, 'r', encoding="utf8") as infile, \
            open(output_path, 'w', encoding="utf8") as outfile:
        max_classes = 0
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads raise on them.
            if not line.strip():
                continue

            data = json.loads(line)

            # Keep only the records that belong to the requested split.
            if data["split"] != split:
                continue

            html = data['html']

            # Cells without a 'bbox' key get a falsy placeholder so that list
            # positions still line up with the cell indices.
            bboxes = [cell.get('bbox', []) for cell in html['cells']]

            # Each bbox is classified by its number of connected neighbors.
            # NOTE(review): get_table_grid is assumed to return two lists of
            # rows whose entries are cell indices -- confirm against utils.
            tokens = html['structure']['tokens']
            thead_grid, tbody_grid = get_table_grid(''.join(tokens))
            table_grid = thead_grid + tbody_grid

            # Skip tables whose rows are ragged; the neighbor lookup below
            # indexes every row by column and needs a rectangular grid.
            if any(len(row) != len(table_grid[0]) for row in table_grid):
                continue

            # Built once per table rather than rescanning the whole grid for
            # every cell.
            neighbor_map = _build_neighbor_map(table_grid)

            annotations = []
            for cell_i, bbox in enumerate(bboxes):
                if not bbox:
                    continue

                # Convert corner coordinates to COCO's [x, y, width, height].
                xmin, ymin, xmax, ymax = bbox
                width = xmax - xmin
                height = ymax - ymin

                # Distinct adjacent values, not counting the cell itself.
                category_id = len(neighbor_map.get(cell_i, set()) - {cell_i})
                max_classes = max(max_classes, category_id)

                annotations.append({
                    'bbox': [xmin, ymin, width, height],
                    'category_id': category_id,
                    'area': width * height
                })

            output = {
                'filename': data['filename'],
                'image_id': data['imgid'],
                'html': tokens,
                'annotations': annotations
            }

            # One JSON object per line.
            outfile.write(json.dumps(output) + '\n')

        # max is 64 for pubtabnet
        print(max_classes)

# Machine-specific locations: the PubTabNet annotation file to read and the
# JSONL file that will receive the converted "val" records.
input_jsonl_path = r"C:\Users\remote desktop\Downloads\pubtabnet\PubTabNet_2.0.0.jsonl"
output_jsonl_path = r"C:\Users\remote desktop\Downloads\DETR-GFTE\datasets\detr\stage3\val.jsonl"

# Run the conversion for the validation split only.
filter_and_process_jsonl(
    input_path=input_jsonl_path,
    output_path=output_jsonl_path,
    split="val",
)

42
