In [22]:
import json
filepath = "./instances_val2017.json"
# JSON text is UTF-8; naming the encoding keeps this read from depending on
# the platform's locale default.
with open(filepath, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [14]:
# Sanity check: number of image records in the loaded annotation file.
len(data['images'])

5000

In [15]:
import copy
def filter_data(data, min_id, max_id):
    """Return a copy of ``data`` restricted to categories with ids in [min_id, max_id].

    Filtering cascades: categories are selected by their raw ``id`` value
    (both bounds inclusive), annotations are kept when their ``category_id``
    is one of the selected ids, and images are kept when at least one kept
    annotation references them. Any other top-level keys are carried over.

    Args:
        data: Dict with ``'categories'``, ``'annotations'`` and ``'images'``
            lists of dicts (``cat['id']``, ``anno['category_id']``,
            ``anno['image_id']`` and ``img['id']`` are read).
        min_id: Smallest category id to keep (inclusive).
        max_id: Largest category id to keep (inclusive).

    Returns:
        A new dict with the same keys, in the same order, as ``data``. It
        shares no mutable objects with ``data``, and ``data`` is not modified.

    Raises:
        KeyError: If one of the three required top-level keys is missing.
    """
    # Filter categories
    kept_categories = [cat for cat in data['categories'] if min_id <= cat['id'] <= max_id]
    valid_ids = {cat['id'] for cat in kept_categories}

    # Filter annotations based on valid category_ids
    kept_annotations = [anno for anno in data['annotations'] if anno['category_id'] in valid_ids]
    valid_image_ids = {anno['image_id'] for anno in kept_annotations}

    # Filter images based on remaining valid image_ids
    kept_images = [img for img in data['images'] if img['id'] in valid_image_ids]

    kept = {
        'categories': kept_categories,
        'annotations': kept_annotations,
        'images': kept_images,
    }

    # Deep-copy only what survives the filter. Copying the whole dataset up
    # front (as before) pays for every annotation that is about to be dropped.
    # Iterating over `data` keeps the original top-level key order.
    return copy.deepcopy({key: kept.get(key, value) for key, value in data.items()})

In [16]:
# MAX is 79
def gen_dataset(start_category_id, end_category_id, filepath = "./instances_val2017.json", output_path=None):
    """Load an annotation file, keep one category-id range, and save the result.

    Args:
        start_category_id: Smallest category id to keep (inclusive).
        end_category_id: Largest category id to keep (inclusive).
        filepath: Path of the JSON annotation file to read.
        output_path: Where to write the filtered JSON. Defaults to
            ``filtered_data_{start_category_id}-{end_category_id}.json`` in
            the current working directory, which is the name this function
            has always used.

    Side effects:
        Writes the filtered dataset to ``output_path`` (overwriting it if it
        exists) and prints ``saved``.
    """
    if output_path is None:
        output_path = f'filtered_data_{start_category_id}-{end_category_id}.json'

    # JSON text is UTF-8; name the encoding instead of relying on the
    # platform's locale default.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    filtered_data = filter_data(data, start_category_id, end_category_id)

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(filtered_data, file)
    print('saved')
    

## Generate IN dataset - include train and test, split it later

In [18]:
# Only category ids 0-59 are used in this experiment:
#   - in-distribution (ID) categories:      ids 0-39
#   - out-of-distribution (OOD) categories: ids 40-59
# NOTE(review): both ranges are inclusive bounds on the raw category `id`
# field, so the number of categories actually selected equals the range width
# only if the ids in the annotation file are contiguous -- confirm.

ID_RANGE = [0, 39]
OOD_RANGE = [40, 59]

# The ID data is split into training and testing later; the OOD data is used
# for testing only.

# Generate the ID ("IN") dataset and give it its final name.
import os
gen_dataset(start_category_id=ID_RANGE[0], end_category_id=ID_RANGE[1])
current_file_name = f'filtered_data_{ID_RANGE[0]}-{ID_RANGE[1]}.json'
new_file_name = f'IN_data_{ID_RANGE[0]}-{ID_RANGE[1]}.json'
# os.replace overwrites an existing destination on every platform, so this
# cell can be re-run; os.rename raises on Windows when the target exists.
os.replace(current_file_name, new_file_name)


saved


## Generate OOD test dataset

In [19]:
# Generate the OOD test dataset and rename it to
# OOD_TEST_data_{OOD_RANGE[0]}-{OOD_RANGE[1]}.json
import os
gen_dataset(start_category_id=OOD_RANGE[0], end_category_id=OOD_RANGE[1])
current_file_name = f'filtered_data_{OOD_RANGE[0]}-{OOD_RANGE[1]}.json'
new_file_name = f'OOD_TEST_data_{OOD_RANGE[0]}-{OOD_RANGE[1]}.json'
# os.replace overwrites an existing destination on every platform, so this
# cell can be re-run; os.rename raises on Windows when the target exists.
os.replace(current_file_name, new_file_name)

saved


## Split the IN data into train and test

In [20]:
import json
import random

def load_json(filename):
    """Read the JSON document stored at ``filename`` and return the parsed object.

    The file is decoded as UTF-8 (the encoding JSON text is defined in) rather
    than with the platform's locale default.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_json(data, filename):
    """Write ``data`` to ``filename`` as JSON, replacing any existing content."""
    with open(filename, 'w') as sink:
        json.dump(data, fp=sink)

def split_data(data, ratio=0.8, seed=None):
    """Randomly split a dataset into train and test parts at the image level.

    Image ids are shuffled and the first ``int(len(images) * ratio)`` of them
    go to the training set; the rest go to the test set. Each annotation
    follows the image named by its ``image_id``.

    Args:
        data: Dict with ``'images'``, ``'annotations'`` and ``'categories'``
            lists (``img['id']`` and ``anno['image_id']`` are read).
        ratio: Fraction of images assigned to the training set.
        seed: Optional seed for a reproducible split. When ``None`` (the
            default) the module-level ``random`` generator is used, exactly
            as before, so a prior ``random.seed(...)`` call still controls
            the outcome.

    Returns:
        ``(train_data, test_data)``: two dicts with keys ``'images'``,
        ``'annotations'`` and ``'categories'``. Images and annotations keep
        their original relative order; both dicts reference the same
        ``data['categories']`` list. ``data`` itself is not modified.
    """
    # Extract image IDs
    image_ids = [img['id'] for img in data['images']]

    # Shuffle image IDs to randomize the split. A private generator is used
    # when a seed is given so the global RNG state is left untouched.
    if seed is None:
        random.shuffle(image_ids)
    else:
        random.Random(seed).shuffle(image_ids)

    # Calculate the split index
    split_index = int(len(image_ids) * ratio)

    # Split image IDs into training and testing sets
    train_image_ids = set(image_ids[:split_index])
    test_image_ids = set(image_ids[split_index:])

    def _subset(selected_ids):
        # Build one dataset holding only the selected images and their annotations.
        return {
            'images': [img for img in data['images'] if img['id'] in selected_ids],
            'annotations': [anno for anno in data['annotations'] if anno['image_id'] in selected_ids],
            'categories': data['categories']
        }

    return _subset(train_image_ids), _subset(test_image_ids)



In [21]:
# Seed the global RNG so the shuffle inside split_data -- and therefore the
# train/test membership -- is the same on every Restart & Run All.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Derive the file names from ID_RANGE so they stay in sync with the cell that
# produced the IN dataset.
in_tag = f'{ID_RANGE[0]}-{ID_RANGE[1]}'
data = load_json(f'IN_data_{in_tag}.json')

# Split the data with a specific ratio
train_data, test_data = split_data(data, ratio=0.8)

# Save the split data
save_json(train_data, f'IN_TRAIN_data_{in_tag}.json')
save_json(test_data, f'IN_TEST_data_{in_tag}.json')

print("Data has been split into training and testing sets.")

Data has been split into training and testing sets.


Now you have the following datasets:
- training data:
    - IN_TRAIN_data_0-39.json
- testing data:
    - IN_TEST_data_0-39.json
    - OOD_TEST_data_40-59.json