In [1]:
import os
import json

# Define directories
DATA_DIR = 'Your data dir'
OUTPUT_DIR = 'Your output dir'


def _drop_text_items(items):
    """Return a new list without the dict entries whose 'type' is 'text'.

    Entries that are not dicts (numbers, strings, nested lists, ...) are kept
    unchanged: they cannot carry a 'type' field, and calling .get() on them
    would raise AttributeError.
    """
    return [
        item for item in items
        if not (isinstance(item, dict) and item.get('type') == 'text')
    ]


# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Get all json files in the data directory.
# os.listdir() returns entries in arbitrary order; sort so reruns process
# (and log) the files in the same order.
json_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.json'))
print(f"Found {len(json_files)} JSON files to process")

# Process each file
for file_name in json_files:
    input_path = os.path.join(DATA_DIR, file_name)
    output_path = os.path.join(OUTPUT_DIR, file_name)

    # Read the json file (explicit encoding so the result does not depend on
    # the platform's default locale)
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Filter out items with type 'text'
    if isinstance(data, list):
        filtered_data = _drop_text_items(data)
        original_count = len(data)
        filtered_count = len(filtered_data)
    elif isinstance(data, dict):
        # If data is a dictionary, filter every list-valued field and leave
        # the other fields as they are. Counts are summed over all list fields.
        original_count = 0
        filtered_count = 0
        filtered_data = data.copy()  # Shallow copy: list fields are replaced, not mutated

        for key, value in data.items():
            if isinstance(value, list):
                original_count += len(value)
                filtered_data[key] = _drop_text_items(value)
                filtered_count += len(filtered_data[key])
    else:
        filtered_data = data  # Keep as is if not a list or dict
        original_count = "N/A"
        filtered_count = "N/A"

    # Save the filtered data to the output directory
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=2)

    print(f"Processed {file_name}: Original items: {original_count}, Filtered items: {filtered_count}")

print(f"All files processed. Filtered data saved to {OUTPUT_DIR}")

Found 2903 JSON files to process
Processed step_15_20250115@172719.json: Original items: 125, Filtered items: 25
Processed step_24_20250218@024316.json: Original items: 188, Filtered items: 152
Processed step_6_20241210@210031.json: Original items: 133, Filtered items: 119
Processed step_11_20241212@181334.json: Original items: 236, Filtered items: 195
Processed step_5_20250115@002632.json: Original items: 79, Filtered items: 58
Processed step_7_20241212@031032.json: Original items: 145, Filtered items: 122
Processed step_12_20241112@154041.json: Original items: 173, Filtered items: 143
Processed step_6_20241210@230030.json: Original items: 169, Filtered items: 124
Processed step_1_20250228@130408.json: Original items: 141, Filtered items: 103
Processed step_5_20250114@023013.json: Original items: 184, Filtered items: 150
Processed step_7_20250324@110435.json: Original items: 50, Filtered items: 42
Processed step_1_20241224@204433.json: Original items: 224, Filtered items: 122
Processe

In [10]:
# Extract every object that has a bbox from the filtered JSON files and write them to a single JSONL file
import os
import json
import uuid

# Define directories
# NOTE: OUTPUT_DIR is not defined in this cell; it is set by the filtering
# cell earlier in the notebook, which must have been run in this kernel first.
INPUT_DIR = OUTPUT_DIR  # Use the filtered data as input
# Path of the single JSONL file this cell writes (placeholder value; set it before running).
JSONL_OUTPUT_PATH = 'Your jsonl file'

# Convert a corner-style bbox into [x, y, width, height]
def bbox_to_xywh(bbox):
    """Return ``[x, y, w, h]`` computed from the first four entries of ``bbox``.

    Assumes ``bbox`` is laid out as ``[x1, y1, x2, y2]``; the width and height
    are the differences ``x2 - x1`` and ``y2 - y1``.
    """
    left, top = bbox[0], bbox[1]
    width = bbox[2] - left
    height = bbox[3] - top
    return [left, top, width, height]

# Create output directory if it doesn't exist.
# os.path.dirname() returns '' for a bare file name and os.makedirs('') raises
# FileNotFoundError, so only create the directory when there is one to create.
jsonl_parent_dir = os.path.dirname(JSONL_OUTPUT_PATH)
if jsonl_parent_dir:
    os.makedirs(jsonl_parent_dir, exist_ok=True)

# Initialize counter
total_objects = 0

# Process all files and write to a single JSONL file
with open(JSONL_OUTPUT_PATH, 'w', encoding='utf-8') as out_file:

    # Function to process objects
    def process_objects(obj_list, image_name):
        """Write one JSONL record per bbox-carrying object in ``obj_list``.

        Each record holds a freshly generated UUID, ``image_name``, the bbox
        converted with ``bbox_to_xywh`` and the object's 'content' (empty
        string when absent) as 'label'. Records are appended to ``out_file``
        and the module-level ``total_objects`` counter is incremented.

        Returns the number of records written for this list.
        """
        global total_objects
        file_objects = 0

        for obj in obj_list:
            # Only dicts can carry a 'bbox' key. On a string or list the `in`
            # test would be a substring/membership check, and on a scalar it
            # would raise, so skip anything that is not a dict.
            if not isinstance(obj, dict) or 'bbox' not in obj:
                continue

            # Extract required information with generated UUID
            extracted_obj = {
                'uuid': str(uuid.uuid4()),  # Generate a new UUID
                'original_image': image_name,
                'bbox': bbox_to_xywh(obj['bbox']),
                'label': obj.get('content', '')  # Use content as label
            }

            # Write to jsonl file
            out_file.write(json.dumps(extracted_obj) + '\n')
            total_objects += 1
            file_objects += 1

        return file_objects

    # os.listdir() order is arbitrary; sort so the JSONL line order is stable
    # across reruns.
    for file_name in sorted(os.listdir(INPUT_DIR)):
        if not file_name.endswith('.json'):
            continue

        input_path = os.path.join(INPUT_DIR, file_name)
        # Swap only the final '.json' extension for '.png'; splitting on the
        # first '.' would truncate stems that themselves contain dots.
        image_name = os.path.splitext(file_name)[0] + '.png'

        # Read the json file
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle different data structures: a top-level list is processed
        # directly; for a top-level dict every list-valued field is processed.
        objects_in_file = 0
        if isinstance(data, list):
            objects_in_file = process_objects(data, image_name)
        elif isinstance(data, dict):
            for key, value in data.items():
                if isinstance(value, list):
                    objects_in_file += process_objects(value, image_name)

        print(f"Processed {file_name}: Extracted {objects_in_file} objects")

print(f"All files processed. Total {total_objects} objects saved to {JSONL_OUTPUT_PATH}")

Processed step_14_20241210@160512.json: Extracted 70 objects
Processed step_50_20250124@025918.json: Extracted 195 objects
Processed step_11_20250209@044728.json: Extracted 110 objects
Processed step_15_20241210@195519.json: Extracted 133 objects
Processed step_6_20241210@180030.json: Extracted 150 objects
Processed step_8_20250215@001218.json: Extracted 18 objects
Processed step_3_20250213@205522.json: Extracted 39 objects
Processed step_13_20241201@083901.json: Extracted 39 objects
Processed step_1_20241212@204718.json: Extracted 90 objects
Processed step_2_20250216@224500.json: Extracted 149 objects
Processed step_6_20250209@040834.json: Extracted 96 objects
Processed step_6_20241210@203923.json: Extracted 137 objects
Processed step_11_20241211@082718.json: Extracted 31 objects
Processed step_11_20241112@014355.json: Extracted 146 objects
Processed step_2_20241210@194721.json: Extracted 138 objects
Processed step_5_20241209@154629.json: Extracted 58 objects
Processed step_7_20241112