Convert the PubTabNet annotation format to a COCO-like format for the DETR model.

In [2]:
import json
from utils import get_table_grid

def _neighbors_by_cell(grid):
    """Map every value in *grid* to the set of distinct values adjacent to it.

    Adjacency is 4-connected (up, right, down, left).  Indices are clamped to
    the grid bounds, so a slot on the border simply sees itself in the
    out-of-range direction; each value is removed from its own neighbor set
    at the end.

    Assumes *grid* is a rectangular list of rows whose entries are hashable
    (presumably cell indices) -- TODO confirm against ``get_table_grid``.
    """
    neighbors = {}
    last_row = len(grid) - 1
    for row_i, row in enumerate(grid):
        last_col = len(row) - 1
        for col_i, item in enumerate(row):
            neighbors.setdefault(item, set()).update((
                grid[max(row_i - 1, 0)][col_i],
                row[min(col_i + 1, last_col)],
                grid[min(row_i + 1, last_row)][col_i],
                row[max(col_i - 1, 0)],
            ))

    # A cell spanning several slots (or touching a border) lists itself as a
    # neighbor; it must not count towards its own class.
    for item, adjacent in neighbors.items():
        adjacent.discard(item)
    return neighbors


def ptn2coco_multiclass(input_path, output_path, split="train"):
    """Convert a PubTabNet JSONL file into COCO-like JSONL annotations.

    Each input line is a JSON object.  Records whose ``"split"`` equals
    *split* are converted; all others are ignored.  For every cell that has a
    ``"bbox"``, an annotation is written with the box in COCO
    ``[xmin, ymin, width, height]`` form, its area, and a ``category_id``
    equal to the number of distinct cells it is connected to in the table
    grid.  Cells without a bbox (empty cells) produce no annotation, but they
    still occupy an index so the remaining cells stay aligned with the grid.

    The largest ``category_id`` encountered is printed once after the whole
    file has been processed.

    Args:
        input_path: Path of the PubTabNet ``.jsonl`` file to read.
        output_path: Path of the ``.jsonl`` file to write (overwritten).
        split: Value of the ``"split"`` field selecting which records to
            convert.  Defaults to ``"train"``.
    """
    max_classes = 0
    with open(input_path, 'r', encoding="utf8") as infile, \
            open(output_path, 'w', encoding="utf8") as outfile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            if data["split"] != split:
                continue

            tokens = data['html']['structure']['tokens']

            # To improve the representative power of the hidden states, each
            # bbox is classified by its number of connected neighbors.
            # NOTE(review): get_table_grid is assumed to return a 2-D grid
            # whose entries are indices into data['html']['cells'] -- confirm.
            neighbors = _neighbors_by_cell(get_table_grid(''.join(tokens)))

            annotations = []
            for cell_i, cell in enumerate(data['html']['cells']):
                bbox = cell.get('bbox')
                if not bbox:
                    # Empty cell: there is no box to annotate.
                    continue

                # Convert corner format to COCO [x, y, width, height].
                xmin, ymin, xmax, ymax = bbox
                width = xmax - xmin
                height = ymax - ymin

                # A cell missing from the grid has no known neighbors.
                category_id = len(neighbors.get(cell_i, ()))
                max_classes = max(max_classes, category_id)

                annotations.append({
                    'bbox': [xmin, ymin, width, height],
                    'category_id': category_id,
                    'area': width * height,
                })

            output = {
                'filename': data['filename'],
                'image_id': data['imgid'],
                'html': tokens,
                'annotations': annotations,
            }

            # One JSON object per line (JSONL).
            outfile.write(json.dumps(output) + '\n')

    print(max_classes)

# Source PubTabNet examples file and the destination for the converted,
# COCO-like annotations (both JSONL: one JSON object per line).
# NOTE(review): by default the converter only keeps records whose "split" is
# "train" -- confirm that this examples file carries that split value.
input_jsonl_path = r"C:\Users\tangy\Downloads\DETR-GFTE\datasets\ptn_examples_val\PubTabNet_Examples-val.jsonl"
output_jsonl_path = r"C:\Users\tangy\Downloads\DETR-GFTE\datasets\ptn_examples_val\ptn_examples_val.jsonl"

# The converter defined in this file is ptn2coco_multiclass; the previously
# called name (filter_and_process_jsonl) is not defined here.
ptn2coco_multiclass(input_jsonl_path, output_jsonl_path)