In [24]:
import json
import os
from collections import defaultdict

def analyze_coco_dataset(ann_file):
    """Print a per-category image-count summary of a COCO annotation file.

    For every category declared in the file, prints its id, its name and the
    number of distinct images carrying at least one annotation of that
    category, followed by the total number of categories and of images.

    Args:
        ann_file: Path to a JSON file with "categories", "annotations" and
            "images" top-level keys.

    Returns:
        None. The summary is written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the required keys is missing.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding
    # (a legacy code page on Windows), which can mis-decode non-ASCII names.
    with open(ann_file, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # Category id -> display name
    categories = {cat["id"]: cat["name"] for cat in coco["categories"]}

    # Category id -> set of image ids; the set keeps an image with several
    # annotations of the same category from being counted more than once.
    class_to_images = defaultdict(set)
    for ann in coco["annotations"]:
        class_to_images[ann["category_id"]].add(ann["image_id"])

    print("Categories in dataset:")
    for class_id, class_name in categories.items():
        # .get() avoids inserting empty entries for never-annotated categories
        num_images = len(class_to_images.get(class_id, ()))
        print(f"ID: {class_id}, Name: {class_name}, Images: {num_images}")

    print("Total categories:", len(categories))
    print("Total images:", len(coco["images"]))


## Inspect and Format Bus Dataset

In [26]:
# Summarize the raw Buses annotations for every split.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    ann_file = os.path.join("Buses", split, "_annotations.coco.json")
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 0, Name: t-ZsBF, Images: 0
ID: 1, Name: bus, Images: 3960
Total categories: 2
Total images: 3960

Valid:
Categories in dataset:
ID: 0, Name: t-ZsBF, Images: 0
ID: 1, Name: bus, Images: 467
Total categories: 2
Total images: 467

Test:
Categories in dataset:
ID: 0, Name: t-ZsBF, Images: 0
ID: 1, Name: bus, Images: 220
Total categories: 2
Total images: 220


In [27]:
import json

def clean_coco_dataset(input_file, output_file, old_to_new_cat_id=None, new_categories=None):
    """Write a copy of a COCO annotation file with remapped category ids.

    By default only the "bus" category (source id 1) is kept and renumbered
    to id 0; annotations of every other category are dropped.

    Args:
        input_file: Path of the COCO JSON file to read.
        output_file: Path the cleaned JSON is written to.
        old_to_new_cat_id: Optional dict mapping source category ids to new
            ids. Annotations whose category id is not a key are dropped.
            Defaults to ``{1: 0}`` when the source declares a category with
            id 1, otherwise to an empty mapping.
        new_categories: Optional list of category dicts written as the
            "categories" section. Defaults to ``[{"id": 0, "name": "bus"}]``
            (or an empty list when the mapping is empty).

    Returns:
        None. The result is written to ``output_file``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If the input lacks "images", "annotations" or "categories".
    """
    # JSON text is UTF-8; do not rely on the platform default encoding
    with open(input_file, "r", encoding="utf-8") as f:
        coco = json.load(f)

    if old_to_new_cat_id is None:
        # Keep bus (id 1) only if the source actually declares it. Using a set
        # means a duplicated declaration cannot yield duplicate output entries.
        declared_ids = {cat["id"] for cat in coco["categories"]}
        old_to_new_cat_id = {1: 0} if 1 in declared_ids else {}
    if new_categories is None:
        new_categories = [{"id": 0, "name": "bus"}] if old_to_new_cat_id else []

    # Build remapped copies instead of mutating the loaded annotations
    new_annotations = [
        {**ann, "category_id": old_to_new_cat_id[ann["category_id"]]}
        for ann in coco["annotations"]
        if ann["category_id"] in old_to_new_cat_id
    ]

    # Carry over the optional metadata sections so that the images' license
    # references still resolve in the cleaned file.
    cleaned_coco = {key: coco[key] for key in ("info", "licenses") if key in coco}
    cleaned_coco["images"] = coco["images"]
    cleaned_coco["annotations"] = new_annotations
    cleaned_coco["categories"] = new_categories

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_coco, f, indent=2)

In [28]:
# Write a cleaned annotation file next to each Buses split's raw file
# (using the clean_coco_dataset defined in the cell above), then summarize it.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    input_file = os.path.join("Buses", split, "_annotations.coco.json")
    output_file = os.path.join("Buses", split, "_annotations_cleaned.coco.json")
    clean_coco_dataset(input_file, output_file)
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    ann_file = output_file
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 0, Name: bus, Images: 3960
Total categories: 1
Total images: 3960

Valid:
Categories in dataset:
ID: 0, Name: bus, Images: 467
Total categories: 1
Total images: 467

Test:
Categories in dataset:
ID: 0, Name: bus, Images: 220
Total categories: 1
Total images: 220


## Inspecting Cars Dataset

In [29]:
# Summarize the raw Cars annotations for every split.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    ann_file = os.path.join("Cars", split, "_annotations.coco.json")
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 0, Name: Cars, Images: 0
ID: 1, Name: car, Images: 447
ID: 2, Name: cars, Images: 3483
Total categories: 3
Total images: 3930

Valid:
Categories in dataset:
ID: 0, Name: Cars, Images: 0
ID: 1, Name: car, Images: 8
ID: 2, Name: cars, Images: 488
Total categories: 3
Total images: 496

Test:
Categories in dataset:
ID: 0, Name: Cars, Images: 0
ID: 1, Name: car, Images: 28
ID: 2, Name: cars, Images: 352
Total categories: 3
Total images: 380


In [30]:
import json

def clean_coco_dataset(input_file, output_file, old_to_new_cat_id=None, new_categories=None):
    """Write a copy of a COCO annotation file with remapped category ids.

    By default source categories 1 and 2 are merged into a single "car"
    category with id 1; annotations of every other category are dropped.

    Args:
        input_file: Path of the COCO JSON file to read.
        output_file: Path the cleaned JSON is written to.
        old_to_new_cat_id: Optional dict mapping source category ids to new
            ids. Annotations whose category id is not a key are dropped.
            Defaults to ``{1: 1, 2: 1}``.
        new_categories: Optional list of category dicts written as the
            "categories" section. Defaults to ``[{"id": 1, "name": "car"}]``.

    Returns:
        None. The result is written to ``output_file``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If the input lacks "images" or "annotations".
    """
    # None sentinels keep the defaults from being shared mutable objects
    if old_to_new_cat_id is None:
        old_to_new_cat_id = {1: 1, 2: 1}
    if new_categories is None:
        new_categories = [{"id": 1, "name": "car"}]

    # JSON text is UTF-8; do not rely on the platform default encoding
    with open(input_file, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # Build remapped copies instead of mutating the loaded annotations
    new_annotations = [
        {**ann, "category_id": old_to_new_cat_id[ann["category_id"]]}
        for ann in coco["annotations"]
        if ann["category_id"] in old_to_new_cat_id
    ]

    # Carry over the optional metadata sections so that the images' license
    # references still resolve in the cleaned file.
    cleaned_coco = {key: coco[key] for key in ("info", "licenses") if key in coco}
    cleaned_coco["images"] = coco["images"]
    cleaned_coco["annotations"] = new_annotations
    cleaned_coco["categories"] = new_categories

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_coco, f, indent=2)

In [31]:
# Write a cleaned annotation file next to each Cars split's raw file
# (using the clean_coco_dataset defined in the cell above), then summarize it.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    input_file = os.path.join("Cars", split, "_annotations.coco.json")
    output_file = os.path.join("Cars", split, "_annotations_cleaned.coco.json")
    clean_coco_dataset(input_file, output_file)
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    ann_file = output_file
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 1, Name: car, Images: 3930
Total categories: 1
Total images: 3930

Valid:
Categories in dataset:
ID: 1, Name: car, Images: 496
Total categories: 1
Total images: 496

Test:
Categories in dataset:
ID: 1, Name: car, Images: 380
Total categories: 1
Total images: 380


## Inspecting Trucks Dataset

In [32]:
# Summarize the raw Trucks annotations for every split.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    ann_file = os.path.join("Trucks", split, "_annotations.coco.json")
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 0, Name: truck, Images: 0
ID: 1, Name: Truck, Images: 711
ID: 2, Name: truck, Images: 3192
Total categories: 3
Total images: 3903

Valid:
Categories in dataset:
ID: 0, Name: truck, Images: 0
ID: 1, Name: Truck, Images: 75
ID: 2, Name: truck, Images: 388
Total categories: 3
Total images: 463

Test:
Categories in dataset:
ID: 0, Name: truck, Images: 0
ID: 1, Name: Truck, Images: 20
ID: 2, Name: truck, Images: 87
Total categories: 3
Total images: 107


In [36]:
import json

def clean_coco_dataset(input_file, output_file, old_to_new_cat_id=None, new_categories=None):
    """Write a copy of a COCO annotation file with remapped category ids.

    By default source categories 1 and 2 are merged into a single "truck"
    category with id 2; annotations of every other category are dropped.

    Args:
        input_file: Path of the COCO JSON file to read.
        output_file: Path the cleaned JSON is written to.
        old_to_new_cat_id: Optional dict mapping source category ids to new
            ids. Annotations whose category id is not a key are dropped.
            Defaults to ``{1: 2, 2: 2}``.
        new_categories: Optional list of category dicts written as the
            "categories" section. Defaults to ``[{"id": 2, "name": "truck"}]``.

    Returns:
        None. The result is written to ``output_file``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If the input lacks "images" or "annotations".
    """
    # None sentinels keep the defaults from being shared mutable objects
    if old_to_new_cat_id is None:
        old_to_new_cat_id = {1: 2, 2: 2}
    if new_categories is None:
        new_categories = [{"id": 2, "name": "truck"}]

    # JSON text is UTF-8; do not rely on the platform default encoding
    with open(input_file, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # Build remapped copies instead of mutating the loaded annotations
    new_annotations = [
        {**ann, "category_id": old_to_new_cat_id[ann["category_id"]]}
        for ann in coco["annotations"]
        if ann["category_id"] in old_to_new_cat_id
    ]

    # Carry over the optional metadata sections so that the images' license
    # references still resolve in the cleaned file.
    cleaned_coco = {key: coco[key] for key in ("info", "licenses") if key in coco}
    cleaned_coco["images"] = coco["images"]
    cleaned_coco["annotations"] = new_annotations
    cleaned_coco["categories"] = new_categories

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_coco, f, indent=2)

In [37]:
# Write a cleaned annotation file next to each Trucks split's raw file
# (using the clean_coco_dataset defined in the cell above), then summarize it.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    input_file = os.path.join("Trucks", split, "_annotations.coco.json")
    output_file = os.path.join("Trucks", split, "_annotations_cleaned.coco.json")
    clean_coco_dataset(input_file, output_file)
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    ann_file = output_file
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 2, Name: truck, Images: 3903
Total categories: 1
Total images: 3903

Valid:
Categories in dataset:
ID: 2, Name: truck, Images: 463
Total categories: 1
Total images: 463

Test:
Categories in dataset:
ID: 2, Name: truck, Images: 107
Total categories: 1
Total images: 107


## Inspecting Vans Dataset

In [38]:
# Summarize the raw Vans annotations for every split.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    ann_file = os.path.join("Vans", split, "_annotations.coco.json")
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 0, Name: V, Images: 0
ID: 1, Name: 0, Images: 3939
Total categories: 2
Total images: 3939

Valid:
Categories in dataset:
ID: 0, Name: V, Images: 0
ID: 1, Name: 0, Images: 194
Total categories: 2
Total images: 194

Test:
Categories in dataset:
ID: 0, Name: V, Images: 0
ID: 1, Name: 0, Images: 64
Total categories: 2
Total images: 64


In [39]:
import json

def clean_coco_dataset(input_file, output_file, old_to_new_cat_id=None, new_categories=None):
    """Write a copy of a COCO annotation file with remapped category ids.

    By default source category 1 is renumbered to a single "van" category
    with id 3; annotations of every other category are dropped.

    Args:
        input_file: Path of the COCO JSON file to read.
        output_file: Path the cleaned JSON is written to.
        old_to_new_cat_id: Optional dict mapping source category ids to new
            ids. Annotations whose category id is not a key are dropped.
            Defaults to ``{1: 3}``.
        new_categories: Optional list of category dicts written as the
            "categories" section. Defaults to ``[{"id": 3, "name": "van"}]``.

    Returns:
        None. The result is written to ``output_file``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If the input lacks "images" or "annotations".
    """
    # None sentinels keep the defaults from being shared mutable objects
    if old_to_new_cat_id is None:
        old_to_new_cat_id = {1: 3}
    if new_categories is None:
        new_categories = [{"id": 3, "name": "van"}]

    # JSON text is UTF-8; do not rely on the platform default encoding
    with open(input_file, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # Build remapped copies instead of mutating the loaded annotations
    new_annotations = [
        {**ann, "category_id": old_to_new_cat_id[ann["category_id"]]}
        for ann in coco["annotations"]
        if ann["category_id"] in old_to_new_cat_id
    ]

    # Carry over the optional metadata sections so that the images' license
    # references still resolve in the cleaned file.
    cleaned_coco = {key: coco[key] for key in ("info", "licenses") if key in coco}
    cleaned_coco["images"] = coco["images"]
    cleaned_coco["annotations"] = new_annotations
    cleaned_coco["categories"] = new_categories

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_coco, f, indent=2)

In [40]:
# Write a cleaned annotation file next to each Vans split's raw file
# (using the clean_coco_dataset defined in the cell above), then summarize it.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    input_file = os.path.join("Vans", split, "_annotations.coco.json")
    output_file = os.path.join("Vans", split, "_annotations_cleaned.coco.json")
    clean_coco_dataset(input_file, output_file)
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    ann_file = output_file
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 3, Name: van, Images: 3939
Total categories: 1
Total images: 3939

Valid:
Categories in dataset:
ID: 3, Name: van, Images: 194
Total categories: 1
Total images: 194

Test:
Categories in dataset:
ID: 3, Name: van, Images: 64
Total categories: 1
Total images: 64


## Inspecting Motorcycles Dataset

In [41]:
# Summarize the raw Motorcycles annotations for every split.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    ann_file = os.path.join("Motorcycles", split, "_annotations.coco.json")
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 0, Name: Motorcycle, Images: 0
ID: 1, Name: Motorcycle, Images: 3876
Total categories: 2
Total images: 3924

Valid:
Categories in dataset:
ID: 0, Name: Motorcycle, Images: 0
ID: 1, Name: Motorcycle, Images: 489
Total categories: 2
Total images: 497

Test:
Categories in dataset:
ID: 0, Name: Motorcycle, Images: 0
ID: 1, Name: Motorcycle, Images: 193
Total categories: 2
Total images: 195


In [43]:
import json

def clean_coco_dataset(input_file, output_file, old_to_new_cat_id=None, new_categories=None):
    """Write a copy of a COCO annotation file with remapped category ids.

    By default source category 1 is renumbered to a single "motorcycle"
    category with id 4; annotations of every other category are dropped.

    Args:
        input_file: Path of the COCO JSON file to read.
        output_file: Path the cleaned JSON is written to.
        old_to_new_cat_id: Optional dict mapping source category ids to new
            ids. Annotations whose category id is not a key are dropped.
            Defaults to ``{1: 4}``.
        new_categories: Optional list of category dicts written as the
            "categories" section. Defaults to
            ``[{"id": 4, "name": "motorcycle"}]``.

    Returns:
        None. The result is written to ``output_file``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If the input lacks "images" or "annotations".
    """
    # None sentinels keep the defaults from being shared mutable objects
    if old_to_new_cat_id is None:
        old_to_new_cat_id = {1: 4}
    if new_categories is None:
        new_categories = [{"id": 4, "name": "motorcycle"}]

    # JSON text is UTF-8; do not rely on the platform default encoding
    with open(input_file, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # Build remapped copies instead of mutating the loaded annotations
    new_annotations = [
        {**ann, "category_id": old_to_new_cat_id[ann["category_id"]]}
        for ann in coco["annotations"]
        if ann["category_id"] in old_to_new_cat_id
    ]

    # Carry over the optional metadata sections so that the images' license
    # references still resolve in the cleaned file.
    cleaned_coco = {key: coco[key] for key in ("info", "licenses") if key in coco}
    cleaned_coco["images"] = coco["images"]
    cleaned_coco["annotations"] = new_annotations
    cleaned_coco["categories"] = new_categories

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_coco, f, indent=2)

In [44]:
# Write a cleaned annotation file next to each Motorcycles split's raw file
# (using the clean_coco_dataset defined in the cell above), then summarize it.
# Paths are assembled with os.path.join so the cell is not tied to the
# Windows path separator.
for split_idx, split in enumerate(("train", "valid", "test")):
    input_file = os.path.join("Motorcycles", split, "_annotations.coco.json")
    output_file = os.path.join("Motorcycles", split, "_annotations_cleaned.coco.json")
    clean_coco_dataset(input_file, output_file)
    # The first header has no leading blank line; later ones are separated by one
    print(("" if split_idx == 0 else "\n") + split.capitalize() + ":")
    ann_file = output_file
    analyze_coco_dataset(ann_file)

Train:
Categories in dataset:
ID: 4, Name: motorcycle, Images: 3876
Total categories: 1
Total images: 3924

Valid:
Categories in dataset:
ID: 4, Name: motorcycle, Images: 489
Total categories: 1
Total images: 497

Test:
Categories in dataset:
ID: 4, Name: motorcycle, Images: 193
Total categories: 1
Total images: 195


| Class ID     | Class Name  |
| ------------ | ----------- |
| 0            | bus         |
| 1            | car         |
| 2            | truck       |
| 3            | van         |
| 4            | motorcycle  |

