In [7]:
import json
import os
from pycocotools.coco import COCO
from shutil import copyfile
import random

In [2]:
# --- Inputs: the downloaded COCO val2014 split ---
# Folder holding the raw images.
images_dir = 'data/val2014'
# Instance annotations describing those images.
annotations_file = 'data/annotations/instances_val2014.json'

# --- Outputs: where the person-only subset is written ---
# Folder that receives copies of the selected images.
output_dir = 'data/person_val2014/'
# JSON file that receives the filtered annotations.
output_annotations_file = 'data/annotations/person_instances_val2014.json'

In [3]:
# Parse the annotation JSON and build the pycocotools lookup index;
# the resulting `coco` handle is what the later cells query.
coco = COCO(annotation_file=annotations_file)

loading annotations into memory...
Done (t=2.69s)
creating index...
index created!


In [4]:
# Fetch the full category records (every category id known to the index).
categories = coco.loadCats(coco.getCatIds())

# Two views over the same records: the plain list of names, and a
# name -> id lookup table.
category_names = [entry['name'] for entry in categories]
category_id_map = dict((entry['name'], entry['id']) for entry in categories)

# Resolve the numeric id of the "person" category
# (raises KeyError if the annotations have no such category).
person_category_id = category_id_map['person']

# Ids of every image annotated with at least one person.
person_image_ids = coco.getImgIds(catIds=[person_category_id])

In [8]:
# Fraction of the person-containing images to keep (0.33 -> roughly one third).
downsample_ratio = 0.33
# Fixed seed so that re-running this cell selects the same subset.
sample_seed = 42
num_images_to_keep = int(len(person_image_ids) * downsample_ratio)

# Randomly select a subset of the image IDs.
# - A dedicated Random instance keeps the global `random` state untouched.
# - Sorting first makes the draw independent of the order in which the COCO
#   index happens to return the ids.
selected_image_ids = random.Random(sample_seed).sample(sorted(person_image_ids), num_images_to_keep)

# Get the person annotations that belong to the selected images.
# Guard the empty case: getAnnIds treats an empty imgIds list as "no image
# filter", which would export every person annotation in the dataset.
if selected_image_ids:
    person_annotations = coco.loadAnns(
        coco.getAnnIds(imgIds=selected_image_ids, catIds=[person_category_id])
    )
else:
    person_annotations = []

# Load the image records once; they are needed both for copying the files
# and for the 'images' section of the exported JSON.
selected_images = coco.loadImgs(selected_image_ids)

# Create output directory for person images (no error if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Copy selected person images to the output directory
for img_info in selected_images:
    img_filename = img_info['file_name']
    src_path = os.path.join(images_dir, img_filename)
    dst_path = os.path.join(output_dir, img_filename)
    copyfile(src_path, dst_path)

# Save person annotations to a new JSON file; only the "person" category
# record is kept so the file describes a single-class dataset.
person_data = {
    'images': selected_images,
    'annotations': person_annotations,
    'categories': [cat for cat in categories if cat['id'] == person_category_id]
}

with open(output_annotations_file, 'w') as f:
    json.dump(person_data, f)

print(f'Extracted and downsampled to {num_images_to_keep} person images and annotations.')


Extracted and downsampled to 7139 person images and annotations.


In [None]:
# Merge two COCO-format datasets (images + annotation JSON) into one.
# dataset1 keeps its ids and file names; dataset2's category / image /
# annotation ids are shifted past dataset1's so that nothing collides.

# Paths to your two COCO datasets
dataset1_annotations_file = 'dataset1/annotations/instances_train2017.json'
dataset1_images_dir = 'dataset1/train2017/'
dataset2_annotations_file = 'dataset2/annotations/instances_train2017.json'
dataset2_images_dir = 'dataset2/train2017/'

# Output paths
output_dir = 'combined_dataset/train2017/'
output_annotations_file = 'combined_dataset/annotations/instances_train2017.json'

# Create output directories if they don't exist. The annotations directory
# must be created too, otherwise writing the merged JSON at the end fails.
os.makedirs(output_dir, exist_ok=True)
annotations_out_dir = os.path.dirname(output_annotations_file)
if annotations_out_dir:
    os.makedirs(annotations_out_dir, exist_ok=True)

# Load the annotations for both datasets
with open(dataset1_annotations_file, 'r') as f:
    dataset1 = json.load(f)
with open(dataset2_annotations_file, 'r') as f:
    dataset2 = json.load(f)

# Offsets that move every dataset2 id past the largest dataset1 id.
# `default=-1` yields an offset of 0 when dataset1 has no entries of that
# kind, instead of max() raising on an empty sequence.
dataset2_category_id_offset = max((cat['id'] for cat in dataset1['categories']), default=-1) + 1
dataset2_image_id_offset = max((img['id'] for img in dataset1['images']), default=-1) + 1
dataset2_annotation_id_offset = max((ann['id'] for ann in dataset1['annotations']), default=-1) + 1

# Ensure category IDs are unique across both datasets
for category in dataset2['categories']:
    category['id'] += dataset2_category_id_offset

# Ensure image IDs are unique across both datasets
for image in dataset2['images']:
    image['id'] += dataset2_image_id_offset

# Ensure annotation IDs are unique and keep every reference consistent:
# each annotation points at an image AND at a category, so both foreign
# keys must be shifted by the same offsets applied above.
for annotation in dataset2['annotations']:
    annotation['id'] += dataset2_annotation_id_offset
    annotation['image_id'] += dataset2_image_id_offset
    annotation['category_id'] += dataset2_category_id_offset

# Plan the file copies as (source path, file name inside output_dir).
# dataset1 files keep their names; a dataset2 file whose name is already
# taken is renamed so it is neither skipped nor confused with the dataset1
# image of the same name.
copy_jobs = [
    (os.path.join(dataset1_images_dir, image['file_name']), image['file_name'])
    for image in dataset1['images']
]
used_names = {image['file_name'] for image in dataset1['images']}
for image in dataset2['images']:
    original_name = image['file_name']
    merged_name = original_name
    if merged_name in used_names:
        folder, base = os.path.split(original_name)
        attempt = 0
        while merged_name in used_names:
            prefix = 'dataset2_' if attempt == 0 else f'dataset2_{attempt}_'
            merged_name = os.path.join(folder, prefix + base)
            attempt += 1
    used_names.add(merged_name)
    # The merged JSON must reference the name the file has in output_dir.
    image['file_name'] = merged_name
    copy_jobs.append((os.path.join(dataset2_images_dir, original_name), merged_name))

# Merge categories, images and annotations
merged_categories = dataset1['categories'] + dataset2['categories']
merged_images = dataset1['images'] + dataset2['images']
merged_annotations = dataset1['annotations'] + dataset2['annotations']

# Copy images to the output directory; files already present are left
# alone so the cell can be re-run without re-copying everything.
for src_path, merged_name in copy_jobs:
    dst_path = os.path.join(output_dir, merged_name)
    if not os.path.exists(dst_path):
        copyfile(src_path, dst_path)

# Save the merged annotations to a new JSON file
merged_data = {
    'images': merged_images,
    'annotations': merged_annotations,
    'categories': merged_categories
}

with open(output_annotations_file, 'w') as f:
    json.dump(merged_data, f)

print(f'Combined dataset created with {len(merged_images)} images and {len(merged_annotations)} annotations.')
