In [6]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

import json
from utils import get_table_grid

def process_jsonl(file_path, output_file_path):
    """Copy the records of a JSONL table dataset whose grid is rectangular.

    Every non-blank line of ``file_path`` is parsed as a JSON object. Its
    ``"html"`` value (a string or an iterable of string fragments) is joined
    and handed to ``get_table_grid``; the two grids it returns are
    concatenated (head rows first). A record is written unchanged to
    ``output_file_path`` only when the combined grid has at least one row and
    every row has the same length as the first row. All other records are
    counted as deformities and left out of the output.

    The running maxima are taken over every parsed record, including the
    ones that are rejected. The table width of a record is the length of its
    first row.

    Args:
        file_path: Path of the input ``.jsonl`` file (read as UTF-8).
        output_file_path: Path of the filtered ``.jsonl`` file to create.
            An existing file is overwritten.

    Returns:
        dict: The statistics that are also printed, under the keys
        ``"max_annotations_length"``, ``"max_table_width"``,
        ``"max_table_height"`` and ``"impurities"``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    max_annotations_length = 0
    max_table_width = 0
    max_table_height = 0
    impurities = 0

    # JSON text is UTF-8; say so explicitly instead of relying on the
    # platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file, \
            open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            # A missing or null key is treated as "no content" rather than
            # raising a TypeError on len() / join().
            annotations = data.get("annotations") or []
            html = ''.join(data.get("html") or [])

            max_annotations_length = max(max_annotations_length, len(annotations))

            # NOTE(review): assumes get_table_grid returns two lists of rows
            # that can be concatenated with "+" - confirm against utils.
            thead_grid, tbody_grid = get_table_grid(html)
            table_grid = thead_grid + tbody_grid
            max_table_height = max(max_table_height, len(table_grid))

            if not table_grid:
                # A table without any row has no first row to compare
                # against; count it as a deformity instead of crashing.
                impurities += 1
                continue

            base_width = len(table_grid[0])
            max_table_width = max(max_table_width, base_width)

            if all(len(row) == base_width for row in table_grid):
                # Write the JSON object to the new JSONL file
                output_file.write(json.dumps(data) + '\n')
            else:
                impurities += 1

    print(f"max bbox padding: {max_annotations_length}")
    print(f"max table width padding: {max_table_width}")
    print(f"max table height padding: {max_table_height}")
    print(f"deformity count: {impurities}")

    return {
        "max_annotations_length": max_annotations_length,
        "max_table_width": max_table_width,
        "max_table_height": max_table_height,
        "impurities": impurities,
    }

# Source dataset, and the destination for the filtered copy of it.
file_path = r'C:\Users\tangy\Downloads\DETR-GFTE\datasets\ptn_train.jsonl'
output_file_path = r'C:\Users\tangy\Downloads\DETR-GFTE\datasets\pure_ptn_train.jsonl'

# Run the filter; the summary statistics are printed by the function itself.
html_lists = process_jsonl(file_path=file_path, output_file_path=output_file_path)


max bbox padding: 2061
max table width padding: 80
max table height padding: 89
deformity count: 2017


In [3]:
import json
import os
from PIL import Image

def add_image_size_to_jsonl(input_jsonl_path, images_dir, output_jsonl_path):
    """Copy a JSONL file, adding each record's image size as ``orig_size``.

    Every non-blank line of ``input_jsonl_path`` is parsed as a JSON object.
    When the object has a truthy ``'filename'``, the image at
    ``images_dir/filename`` is opened and
    ``{'width': ..., 'height': ...}`` is stored under ``'orig_size'``.
    Records without a filename are copied through unchanged.

    Processing is best-effort: if anything goes wrong for a line (invalid
    JSON, missing or unreadable image, ...) a message naming the 1-based
    input line number is printed and that record is NOT written to the
    output, so the output may contain fewer records than the input.

    Args:
        input_jsonl_path: Path of the input ``.jsonl`` file (read as UTF-8).
        images_dir: Directory that the ``'filename'`` values are relative to.
        output_jsonl_path: Path of the ``.jsonl`` file to create. An existing
            file is overwritten.
    """
    # JSON text is UTF-8; say so explicitly instead of relying on the
    # platform default encoding, which can mis-decode non-ASCII text.
    with open(input_jsonl_path, 'r', encoding='utf-8') as infile, \
            open(output_jsonl_path, 'w', encoding='utf-8') as outfile:
        for line_number, line in enumerate(infile, start=1):
            # Blank lines carry no record; skip them without reporting an error.
            if not line.strip():
                continue

            try:
                # Parse the JSON line
                json_obj = json.loads(line)

                # Get the filename from the JSON object
                filename = json_obj.get('filename')

                if filename:
                    # Construct the full path to the image file
                    image_path = os.path.join(images_dir, filename)

                    # Open the image only long enough to read its dimensions
                    with Image.open(image_path) as img:
                        width, height = img.size
                    # Add the new field with the image dimensions
                    json_obj['orig_size'] = {'width': width, 'height': height}

                # Write the updated JSON object to the new file
                outfile.write(json.dumps(json_obj) + '\n')

            except Exception as e:
                # Deliberately broad: one bad record (missing file, corrupt
                # image, invalid JSON) must not abort the whole conversion.
                # The line number makes the dropped record traceable.
                print(f"Error processing line {line_number}: {e}")

# Example usage
# Input .jsonl file whose records are to be annotated.
input_jsonl_path = r'C:\Users\tangy\Downloads\DETR-GFTE\datasets\gnet_val.jsonl'
# Directory containing the images named by each record's 'filename'.
images_dir = r'C:\Users\tangy\Downloads\DETR-GFTE\datasets\unified_val'
# Destination .jsonl file for the annotated records.
output_jsonl_path = r'C:\Users\tangy\Downloads\DETR-GFTE\datasets\gnet_val1.jsonl'

add_image_size_to_jsonl(
    input_jsonl_path=input_jsonl_path,
    images_dir=images_dir,
    output_jsonl_path=output_jsonl_path,
)
