In [1]:
from nuimages import NuImages
import os
import json
from tqdm import tqdm
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool
from itertools import groupby
import numpy as np

In [2]:
# Root directory of the nuImages data used by every cell below.
# Defaults to the original location; set the NUIMAGES_DATAROOT environment
# variable to run the notebook elsewhere without editing this cell.
dataroot = os.environ.get('NUIMAGES_DATAROOT', '/usr0/tma1/datasets/nuimages')

In [3]:
# Build (or reload) `category_id`, the mapping from category name to integer id.
# The mapping is cached in <dataroot>/category_id.json and is only regenerated
# from v1.0-train/category.json when that cache file does not exist yet.
print('Creating category id files...')
category_file = os.path.join(dataroot, 'category_id.json')
if not os.path.exists(category_file):
    # `with` closes the handles even if parsing or writing raises.
    with open(os.path.join(dataroot, 'v1.0-train/category.json'), 'r') as fd:
        category = json.load(fd)

    # A category's id is its position in category.json.
    category_id = {cat['name']: i for i, cat in enumerate(category)}

    with open(category_file, 'w') as fd2:
        json.dump(category_id, fd2)
else:
    # Cache hit: reuse the ids written by an earlier run.
    with open(category_file, 'r') as fd:
        category_id = json.load(fd)
print('Done!')

Creating category id files...
Done!


In [4]:
# Tag inserted into the names of the filtered detectron JSON files that the
# later cells read and write (e.g. 'v1.0-val_<tag>_detectron.json').
category_file_name = 'construction_vehicle'

In [5]:
# Pick the categories to keep and remap their original ids to 0..N-1.
construction_names = ['vehicle.construction']

# Original ids, in the same order as `construction_names`.
construction_org_ids = [category_id[label] for label in construction_names]

# original id -> new compact id (the position within `construction_names`).
construction_new_ids = {
    org_id: new_id for new_id, org_id in enumerate(construction_org_ids)
}

# Show the name -> new id mapping, then display both lookup structures.
print(dict(zip(construction_names, range(len(construction_names)))))
(construction_org_ids, construction_new_ids)

{'vehicle.construction': 0}


([18], {18: 0})

In [7]:
# Filters the train, val and test detectron files so that only the categories
# of interest are retained. Images left without any matching annotation are
# dropped; kept annotations get their category id remapped through
# `construction_new_ids`. Each result is written next to its input as
# '<split>_<category_file_name>_detectron.json'.
detectron_dir = os.path.join(dataroot, 'detectron_data')  # loop-invariant
splits = ['v1.0-train', 'v1.0-val', 'v1.0-test']
for split in splits:
    print("Filter detectron data for {}".format(split))
    path = os.path.join(detectron_dir, split + '_detectron.json')
    # Close the input as soon as it is parsed instead of holding it open
    # for the whole filter + write.
    with open(path, 'r') as fd:
        data = json.load(fd)

    filtered_data = []
    for d in data:
        filtered_anns = []
        for ann in d['annotations']:
            # The remap dict's keys are exactly the original ids of interest,
            # so one O(1) membership test guards the lookup below.
            if ann['category_id'] in construction_new_ids:
                # In-place edit is safe: `data` was freshly loaded above.
                ann['category_id'] = construction_new_ids[ann['category_id']]
                filtered_anns.append(ann)

        if filtered_anns:
            d['annotations'] = filtered_anns
            filtered_data.append(d)

    path = os.path.join(detectron_dir, split + '_{}_detectron.json'.format(category_file_name))
    with open(path, 'w') as fd2:
        json.dump(filtered_data, fd2)
    print("... done")

Filter detectron data for v1.0-train
... done
Filter detectron data for v1.0-val
... done
Filter detectron data for v1.0-test
... done


In [8]:
# Split validation data into validation and test.
# The filtered v1.0-val records are shuffled and cut in half; with an odd
# number of records the test half receives the extra one.
SPLIT_SEED = 0  # fixed seed so the val/test partition is the same on every run

path = os.path.join(dataroot, 'detectron_data', 'v1.0-val_{}_detectron.json'.format(category_file_name))
print(path)
with open(path, 'r') as fd:
    data = json.load(fd)
print(len(data))
num_instances = len(data)

# A private RandomState keeps the split reproducible without touching
# NumPy's global random state.
indices = np.random.RandomState(SPLIT_SEED).permutation(num_instances)
split_index = num_instances // 2

# Index the Python list directly; no need to round-trip the dicts through
# an object ndarray.
data_val = [data[i] for i in indices[:split_index]]
data_test = [data[i] for i in indices[split_index:]]

val_path = os.path.join(dataroot, 'detectron_data', 'val_val_{}_detectron.json'.format(category_file_name))
test_path = os.path.join(dataroot, 'detectron_data', 'val_test_{}_detectron.json'.format(category_file_name))

# `with` guarantees both outputs are flushed and closed, even on error.
with open(val_path, 'w') as fd_val:
    json.dump(data_val, fd_val)

with open(test_path, 'w') as fd_test:
    json.dump(data_test, fd_test)

#Done!

/usr0/tma1/datasets/nuimages/detectron_data/v1.0-val_construction_vehicle_detectron.json
1111


In [9]:
# Switch from list to dictionary format indexing by image_id as key
def _index_by_image_id(list_path, dict_path):
    """Re-key a JSON list of records by ``image_id`` and write the result.

    Args:
        list_path: path of a JSON file holding a list of dicts, each with an
            'image_id' entry.
        dict_path: path of the JSON file to (over)write.

    Returns:
        The dict mapping each record's 'image_id' to the record. If an id
        occurs more than once, the last record wins.
    """
    with open(list_path, 'r') as fd:
        records = json.load(fd)

    by_image_id = {}
    for record in records:
        by_image_id[record['image_id']] = record

    # `with` ensures the output is flushed and closed before anything reads it.
    with open(dict_path, 'w') as fd:
        # NOTE(review): the payload is JSON-encoded twice (a JSON string that
        # itself contains JSON), so readers need json.loads(json.load(f)).
        # Kept as-is because consumers of this file may rely on that format;
        # confirm before simplifying to a single json.dump.
        json.dump(json.dumps(by_image_id), fd)
    return by_image_id


val_path = os.path.join(dataroot, 'detectron_data', 'val_val_{}_detectron.json'.format(category_file_name))
val_path_data = os.path.join(dataroot, 'detectron_data', 'v1.0-val_val_{}_detectron.json'.format(category_file_name))
val_data = _index_by_image_id(val_path, val_path_data)

test_path = os.path.join(dataroot, 'detectron_data', 'val_test_{}_detectron.json'.format(category_file_name))
test_path_data = os.path.join(dataroot, 'detectron_data', 'v1.0-val_test_{}_detectron.json'.format(category_file_name))
test_data = _index_by_image_id(test_path, test_path_data)