In [6]:
import os
import json

def generate_file_info_with_annotations(data_dir, json_file, version=1):
    # Load the data from the JSON file
    with open(json_file, 'r') as f:
        data_dict = json.load(f)

    # Define paths for images and labels
    image_dir = os.path.join(data_dir, 'images')
    label_dir = os.path.join(data_dir, 'labels')

    # Create text files to store the information
    file_count_txt = os.path.join(data_dir, f"{data_dir}_file_count_v{version}.txt")
    file_list_txt = os.path.join(data_dir, f"{data_dir}_file_list_v{version}.txt")
    unmatched_files_txt = os.path.join(data_dir, f"{data_dir}_unmatched_files_v{version}.txt")

    with open(file_count_txt, 'w') as count_file, open(file_list_txt, 'w') as list_file, open(unmatched_files_txt, 'w') as unmatched_file:
        # Set of valid prefixes (keys from the JSON dictionary)
        valid_prefixes = set(data_dict.keys())

        # Collect all image and label files
        all_image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg')]
        all_label_files = [f.replace('.jpg', '.txt') for f in all_image_files]

        matched_files = set()

        # Loop through the dictionary and process each prefix
        for case_type, case_info in data_dict.items():
            # Get all image files for the current prefix
            image_files = [f for f in all_image_files if f.startswith(case_type)]
            label_files = [f.replace('.jpg', '.txt') for f in image_files]

            # Ensure each image has a corresponding label
            valid_files = [(img, lbl) for img, lbl in zip(image_files, label_files)
                           if os.path.exists(os.path.join(label_dir, lbl))]

            # Sort files by name (ascending order)
            valid_files.sort()

            # Mark these files as matched
            matched_files.update(img for img, _ in valid_files)

            # Get ratio and description from the current entry
            ratio = case_info.get("ratio", "N/A")
            description = case_info.get("description", "No description available")

            # Count the files and write to the count file
            count_file.write(f"'{case_type}'/'{description}': {len(valid_files)} files\n")

            # Write the actual filenames and annotations to the list file
            list_file.write(f"# {description}\n")
            for img, lbl in valid_files:
                list_file.write(f"{img}, {lbl}\n")

        # Identify unmatched files
        unmatched_images = set(all_image_files) - matched_files
        unmatched_labels = [img.replace('.jpg', '.txt') for img in unmatched_images]

        # Write unmatched files to a separate file
        unmatched_file.write("Unmatched files (those that don't match the keys):\n")
        for img, lbl in zip(unmatched_images, unmatched_labels):
            unmatched_file.write(f"{img}, {lbl}\n")

    print(f"Processing complete. Check '{file_count_txt}', '{file_list_txt}', and '{unmatched_files_txt}' for details.")

In [12]:
data_dir = '/Users/jjookim/Projects/AIForce/datasets/all_data/train_v1'
json_file = '/Users/jjookim/Projects/AIForce/datasets/jsons/all_data.json'  # Your single JSON file

generate_file_info_with_annotations(data_dir, json_file, version=1)

Processing complete. Check '/Users/jjookim/Projects/AIForce/datasets/all_data/train_v1_file_count_v1.txt', '/Users/jjookim/Projects/AIForce/datasets/all_data/train_v1_file_list_v1.txt', and '/Users/jjookim/Projects/AIForce/datasets/all_data/train_v1_unmatched_files_v1.txt' for details.
