In [10]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

import json
from utils import get_table_grid

def filter_and_process_jsonl(input_path, output_path, mode='train'):
    """Filter a JSONL annotation file by split and rewrite it in a compact form.

    Each input line is expected to be a JSON object with the keys ``split``,
    ``filename`` and ``html`` (the latter holding ``cells`` and
    ``structure.tokens``). Records whose ``split`` equals *mode* are converted
    to ``{'filename', 'image_id', 'html', 'annotations'}`` and written to
    *output_path*, one JSON object per line. A record is only written when all
    rows of its table grid (as returned by ``get_table_grid``) have the same
    width; other records are counted as deformities and dropped.

    Summary statistics (largest annotation count, table width and table
    height seen among the selected records, plus the deformity count) are
    printed at the end.

    Args:
        input_path: Path of the source JSONL file (read as UTF-8).
        output_path: Path of the JSONL file to create/overwrite (UTF-8).
        mode: Value of the ``split`` field to keep, e.g. ``'train'`` or ``'val'``.

    Returns:
        None. Results are written to *output_path* and statistics to stdout.
    """
    max_annotations_length = 0
    max_table_width = 0
    max_table_height = 0
    impurities = 0
    malformed_lines = 0

    with open(input_path, 'r', encoding="utf8") as infile, \
            open(output_path, 'w', encoding="utf8") as outfile:
        for line in infile:
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue

            # Skip lines that are not valid JSON, but keep track of how many
            # were dropped instead of hiding them behind a bare ``except``.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                malformed_lines += 1
                continue

            # Only keep records that belong to the requested split.
            if data["split"] != mode:
                continue

            annotations = []
            for cell in data['html']['cells']:
                if cell['bbox'][4] == 2:
                    xmin, ymin, xmax, ymax = cell['bbox'][:4]
                else:
                    # for empty cells, we define the bbox to be as small and
                    # out of the way as possible such as to be automatically
                    # voided as candidate gt bboxes when calculating iou scores
                    xmin, ymin, xmax, ymax = 0, 0, 0.01, 0.01

                width = xmax - xmin
                height = ymax - ymin
                annotations.append({
                    # stored as [x, y, width, height] rather than corner points
                    'bbox': [xmin, ymin, width, height],
                    # bbox classification is not needed, so there is only 1 class
                    'category_id': 0,
                    'area': width * height,
                })

            structure_tokens = data['html']['structure']['tokens']

            # image_id is actually not necessary for training and stn doesnt have it
            output = {
                'filename': data['filename'],
                'image_id': 0,
                'html': structure_tokens,
                'annotations': annotations,
            }

            max_annotations_length = max(max_annotations_length, len(annotations))

            thead_grid, tbody_grid = get_table_grid(''.join(structure_tokens))
            table_grid = thead_grid + tbody_grid

            # A table without any rows cannot be indexed below; treat it as a
            # deformity instead of aborting the whole run with an IndexError.
            # (len() is used rather than truthiness because the grid type is
            # defined by get_table_grid, outside this function.)
            if len(table_grid) == 0:
                impurities += 1
                continue

            base_width = len(table_grid[0])
            max_table_width = max(max_table_width, base_width)
            max_table_height = max(max_table_height, len(table_grid))

            # The table is "pure" when every row is as wide as the first one.
            if all(len(row) == base_width for row in table_grid):
                # Write the JSON object to the new JSONL file
                outfile.write(json.dumps(output) + '\n')
            else:
                impurities += 1

    print(f"max bbox padding: {max_annotations_length}")
    print(f"max table width padding: {max_table_width}")
    print(f"max table height padding: {max_table_height}")
    print(f"deformity count: {impurities}")
    if malformed_lines:
        print(f"skipped malformed lines: {malformed_lines}")

# Machine-specific absolute locations: the source annotation file and the
# JSONL file that will receive the filtered records.
input_jsonl_path = r"C:\Users\tangy\Downloads\fintabnet\fintabnet\synthetic_data.jsonl"
output_jsonl_path = r"C:\Users\tangy\Downloads\DETR-GFTE\datasets\stn_fintabnet_val.jsonl"

# Extract the records whose "split" field is 'val'.
filter_and_process_jsonl(
    input_path=input_jsonl_path,
    output_path=output_jsonl_path,
    mode='val',
)

max bbox padding: 236
max table width padding: 15
max table height padding: 25
deformity count: 0
