Convert the PubTabNet annotation format to a COCO-like format for the model.

In [2]:
import json

# Placeholder box used for cells that carry no 'bbox' entry (empty cells): it is
# made as small and out of the way as possible so that it is effectively voided
# as a candidate ground-truth box when IoU scores are calculated.
_EMPTY_CELL_BBOX = (0, 0, 0.01, 0.01)


def _cell_to_annotation(cell, category_id=0):
    """Build one COCO-like annotation dict from a single PubTabNet cell.

    The cell's ``[xmin, ymin, xmax, ymax]`` box is converted to
    ``[xmin, ymin, width, height]``; cells without a ``'bbox'`` key get the
    ``_EMPTY_CELL_BBOX`` placeholder.
    """
    xmin, ymin, xmax, ymax = cell.get('bbox', _EMPTY_CELL_BBOX)
    width = xmax - xmin
    height = ymax - ymin
    return {
        'bbox': [xmin, ymin, width, height],
        'category_id': category_id,
        'area': width * height,
    }


def filter_and_process_jsonl(input_path, output_path, split="train"):
    """Convert one split of a PubTabNet JSONL file to COCO-like JSONL records.

    Each non-blank input line is parsed as a JSON object. Records whose
    ``"split"`` field equals ``split`` are written to ``output_path`` as one
    JSON object per line with the keys ``filename``, ``image_id``, ``html``
    (the structure tokens) and ``annotations`` (one entry per cell, all with
    ``category_id`` 0 because no box classification is needed).

    Args:
        input_path: Path of the source JSONL file (read as UTF-8).
        output_path: Path of the JSONL file to create or overwrite (UTF-8).
        split: Value of the record's ``"split"`` field to keep. Defaults to
            ``"train"``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"split"``, or a kept record lacks one
            of the fields read above.
    """
    with open(input_path, 'r', encoding="utf8") as infile, \
            open(output_path, 'w', encoding="utf8") as outfile:
        for line in infile:
            # Blank lines (e.g. a trailing newline at the end of the file) are
            # not records; json.loads would raise on them, so skip them.
            if not line.strip():
                continue

            data = json.loads(line)
            if data["split"] != split:
                continue

            html = data['html']
            output = {
                'filename': data['filename'],
                'image_id': data['imgid'],
                'html': html['structure']['tokens'],
                'annotations': [
                    _cell_to_annotation(cell) for cell in html['cells']
                ],
            }

            # One JSON object per line (JSONL).
            outfile.write(json.dumps(output) + '\n')

# Source: PubTabNet example annotations in their original JSONL format.
# NOTE(review): this file is named "-val", while the converter keeps only
# records whose "split" is "train" by default — confirm this is intended.
input_jsonl_path = r"C:\Users\tangy\Downloads\DETR-GFTE\datasets\ptn_examples_val\PubTabNet_Examples-val.jsonl"
# Destination: converted COCO-like JSONL, written next to the source file.
output_jsonl_path = r"C:\Users\tangy\Downloads\DETR-GFTE\datasets\ptn_examples_val\ptn_examples_val.jsonl"

# Run the conversion; the output file is created or overwritten.
filter_and_process_jsonl(
    input_path=input_jsonl_path,
    output_path=output_jsonl_path,
)