# Convert annotations to MS COCO format with additional meta fields

Convert datasets: train 500k, validation and test

The following extra fields are added to each annotation:
- 'IsOccluded'
- 'IsTruncated'
- 'IsDepiction'
- 'IsInside'

Prefiltering applied during the conversion:
- images with any dimension larger than 2000 pixels or smaller than 100 pixels are ignored
- images without any valid bounding box are not written

In [1]:
!ls ../input/

as_mscoco
challenge2018
class-descriptions-boxable.csv
lost+found
output-OpenImagesObjectDetections
test
test-annotations-bbox.csv
test-annotations-human-imagelabels-boxable.csv
train
train-annotations-bbox.csv
train-annotations-human-imagelabels-boxable.csv
train-images-boxable-with-rotation.csv
validation
validation-annotations-bbox.csv
validation-annotations-human-imagelabels-boxable.csv


In [2]:
!ls ../input/train | wc -l
!ls ../input/test | wc -l
!ls ../input/validation | wc -l

505563
125436
41620


In [3]:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path

import numpy as np
import pandas as pd

import json
from PIL import Image

import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [4]:
# All dataset files live in the "input" folder located next to the folder
# this notebook is run from.
INPUT_PATH = Path(".").resolve().parent / "input"

# Train split
TRAIN_IMAGES_PATH = INPUT_PATH / "train"
TRAIN_ANNOTATIONS_CSV_PATH = INPUT_PATH / "train-annotations-bbox.csv"
TRAIN_CONFIDENCE_CSV_PATH = INPUT_PATH / "train-annotations-human-imagelabels-boxable.csv"
# The file present in ../input/ is the "boxable" variant of the image info CSV
TRAIN_IMGINFO_CSV_PATH = INPUT_PATH / "train-images-boxable-with-rotation.csv"

# Validation split
VALIDATION_IMAGES_PATH = INPUT_PATH / "validation"
VALIDATION_ANNOTATIONS_CSV_PATH = INPUT_PATH / "validation-annotations-bbox.csv"
VALIDATION_CONFIDENCE_CSV_PATH = INPUT_PATH / "validation-annotations-human-imagelabels-boxable.csv"
# NOTE(review): this file does not show up in the `ls ../input/` listing -
# confirm it has been downloaded before reading it.
VALIDATION_IMGINFO_CSV_PATH = INPUT_PATH / "validation-images-with-rotation.csv"

# Class descriptions shared by all splits
LABELS_DESCRIPTION_CSV_PATH = INPUT_PATH / "class-descriptions-boxable.csv"

In [5]:
# Test split: image folder, bounding boxes and image-level labels
TEST_IMAGES_PATH = Path(".").resolve().parent.joinpath("input", "test")
TEST_ANNOTATIONS_CSV_PATH = Path(".").resolve().parent.joinpath("input", "test-annotations-bbox.csv")
TEST_CONFIDENCE_CSV_PATH = Path(".").resolve().parent.joinpath(
    "input", "test-annotations-human-imagelabels-boxable.csv"
)

In [6]:
# The class description file has no header row: column 0 holds the label code
# used in the annotation CSVs, column 1 the readable class name.
labels_description = pd.read_csv(LABELS_DESCRIPTION_CSV_PATH, header=None)
labels = labels_description[1].values.tolist()

# One COCO category per class; the id is the row position in the CSV
coco_categories = [
    {'id': idx, 'name': name, 'supercategory': name}
    for idx, name in enumerate(labels)
]

# Reverse lookup: class name -> category id
categories = {cat['name']: cat['id'] for cat in coco_categories}

In [7]:
min(list(categories.values())), max(list(categories.values()))

(0, 600)

In [8]:
# Box corner columns; get_bboxes_labels_meta multiplies them by the image
# width/height, so they are presumably stored relative to the image size.
xyxy_cols = ['XMin', 'YMin', 'XMax', 'YMax']
# Per-box flags. The order matters: the annotation builders read the values
# by position (0: IsOccluded, 1: IsTruncated, 2: IsGroupOf -> 'iscrowd',
# 3: IsDepiction, 4: IsInside).
meta_cols = ['IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside']
# When True, 'iscrowd' is always written as 0 instead of the IsGroupOf flag
ignore_is_crowd = True


def to_pixels(xs, scale, vmin, vmax):
    """Multiply `xs` by `scale` and clamp the result into [vmin, vmax]."""
    scaled = xs * scale
    return np.clip(scaled, vmin, vmax)

def get_bboxes_labels_meta(canvas_size, image_id, annotations):
    """Fetch the boxes, labels and flags annotated on one image.

    Args:
        canvas_size: (width, height) of the image in pixels.
        image_id: index value identifying the image in `annotations`.
        annotations: DataFrame indexed by image id that provides the
            `xyxy_cols`, 'LabelName' and `meta_cols` columns.

    Returns:
        Tuple (bboxes, labels, meta) with one entry per annotated box:
        bboxes is a float array of shape (N, 4) in pixel (x, y, w, h) format,
        labels is a 1-D array with the N 'LabelName' values and meta is an
        (N, len(meta_cols)) array with the flag columns.

    Raises:
        KeyError: if `image_id` is not present in the index.
    """
    # A list indexer always yields a DataFrame, even when the image has a
    # single box, and the index is searched once instead of three times.
    rows = annotations.loc[[image_id]]
    # astype() copies, so the in-place arithmetic below works on private data
    bboxes = rows[xyxy_cols].values.astype(np.float64)
    labels = rows['LabelName'].values
    meta = rows[meta_cols].values

    width, height = canvas_size[0], canvas_size[1]

    # BBox format should be (x, y, w, h)
    bboxes[:, 0] = to_pixels(bboxes[:, 0], width, 0, width - 1)
    bboxes[:, 1] = to_pixels(bboxes[:, 1], height, 0, height - 1)

    # Width/height count both border pixels, hence the "+ 1"
    bboxes[:, 2] = to_pixels(bboxes[:, 2], width, 0, width - 1)
    bboxes[:, 2] -= bboxes[:, 0] - 1

    bboxes[:, 3] = to_pixels(bboxes[:, 3], height, 0, height - 1)
    bboxes[:, 3] -= bboxes[:, 1] - 1
    return bboxes, labels, meta


def compute_area(bbox):
    """Return the area (width times height) of an (x, y, w, h) box."""
    width = bbox[2]
    height = bbox[3]
    return width * height


In [9]:
import tqdm
from joblib import Parallel, delayed


def _task(images_path, image_id):
    p = images_path / "{}.jpg".format(image_id)
    if not p.exists():
        return None

    img = Image.open(images_path / "{}.jpg".format(image_id))
    
    if max(img.size) > 2000 or min(img.size) < 100:
        return None

    image_info = {
            "id": image_id,
            "file_name": "{}.jpg".format(image_id),
            "width": img.size[0],
            "height": img.size[1],
    }    
    return image_info



def create_annotations_json(images_path, annotations, coco_categories, output_mode,
                            image_ids=None, ignore_is_crowd=True,
                            n_jobs=16, batch_size=48):
    """Convert box annotations to a COCO-style JSON file.

    The result is written to "../input/as_mscoco/annotations/<output_mode>.json"
    (relative to the current working directory).

    Args:
        images_path: Path of the folder containing "<image_id>.jpg" files.
        annotations: DataFrame indexed by image id, with box corners, a
            'LabelName' column holding class names and the flag columns.
        coco_categories: list of COCO category dicts ('id', 'name', ...);
            written as-is and used to resolve class names to category ids.
        output_mode: name of the output file, without extension.
        image_ids: optional subset of image ids; defaults to every image id
            present in the `annotations` index.
        ignore_is_crowd: when True, 'iscrowd' is always 0; otherwise it takes
            the value of the third flag column (IsGroupOf).
        n_jobs: number of parallel workers used to read the image sizes.
        batch_size: number of images dispatched to the workers at once.

    Images that are missing, filtered out by `_task`, or left without any
    valid box are not written. Boxes smaller than one pixel and boxes whose
    label is not one of `coco_categories` are skipped.
    """
    if image_ids is None:
        image_ids = annotations.index.unique()

    # Resolve labels through the categories given as argument, so that the
    # ids always agree with the "categories" section written below.
    category_ids = {c['name']: c['id'] for c in coco_categories}

    # Read the image sizes in parallel, batch by batch to keep a progress bar
    image_infos = []
    with Parallel(n_jobs=n_jobs) as parallel:
        for start in tqdm.tqdm(range(0, len(image_ids), batch_size)):
            batch_image_ids = image_ids[start:start + batch_size]
            image_infos.extend(
                parallel(delayed(_task)(images_path, image_id) for image_id in batch_image_ids)
            )
    image_infos = [info for info in image_infos if info is not None]

    coco_images = []
    coco_annotations = []

    for image_info in tqdm.tqdm(image_infos):

        image_id = image_info['id']
        img_size = (image_info['width'], image_info['height'])

        bboxes, labels, meta = get_bboxes_labels_meta(img_size, image_id, annotations)

        if len(bboxes) == 0:
            print("No bboxes for image_id '{}'".format(image_id))
            continue

        num_added_annotations = 0

        for bbox, label, m in zip(bboxes, labels, meta):
            category_id = category_ids.get(label)
            if category_id is None:
                continue

            m = [int(v) for v in m]
            bbox = [int(v) for v in bbox.tolist()]

            if bbox[2] < 1 or bbox[3] < 1:
                continue

            annotation_info = {
                # Sequential ids are unique, positive and identical from one
                # run to the next, which hash() of a string does not guarantee.
                "id": len(coco_annotations) + 1,
                "image_id": image_id,
                "category_id": category_id,
                "IsOccluded": m[0],
                "IsTruncated": m[1],
                "iscrowd": m[2] if not ignore_is_crowd else 0,
                "IsDepiction": m[3],
                "IsInside": m[4],
                "area": int(compute_area(bbox)),
                "bbox": bbox,
                "segmentation": [],
            }
            coco_annotations.append(annotation_info)
            num_added_annotations += 1

        # Keep only the images that ended up with at least one annotation
        if num_added_annotations > 0:
            coco_images.append(image_info)

    output_coco_annotations = {
        "categories": coco_categories,
        "images": coco_images,
        "annotations": coco_annotations
    }

    output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations"
    output_folder.mkdir(parents=True, exist_ok=True)

    with open((output_folder / "{}.json".format(output_mode)).as_posix(), 'w') as h:
        json.dump(output_coco_annotations, h)

Create train dataset:

In [10]:
# Train split
output_mode = "train"
images_path = TRAIN_IMAGES_PATH
annotations_path = TRAIN_ANNOTATIONS_CSV_PATH

# Index the boxes by image id so that all rows of an image can be selected at once
annotations = pd.read_csv(annotations_path, index_col="ImageID")
# Replace the label codes (column 0 of the class descriptions) by class names (column 1)
code_to_name = labels_description.set_index(0)[1]
annotations['LabelName'] = annotations['LabelName'].map(code_to_name)

In [None]:
# Write as_mscoco/annotations/<output_mode>.json for the split loaded above
create_annotations_json(
    images_path=images_path,
    annotations=annotations,
    coco_categories=coco_categories,
    output_mode=output_mode,
    ignore_is_crowd=ignore_is_crowd,
)

 10%|▉         | 3457/36314 [01:04<10:10, 53.84it/s]

Create validation dataset:

In [None]:
# Validation split
output_mode = "val"
images_path = VALIDATION_IMAGES_PATH
annotations_path = VALIDATION_ANNOTATIONS_CSV_PATH

# Index the boxes by image id so that all rows of an image can be selected at once
annotations = pd.read_csv(annotations_path, index_col="ImageID")
# Replace the label codes (column 0 of the class descriptions) by class names (column 1)
code_to_name = labels_description.set_index(0)[1]
annotations['LabelName'] = annotations['LabelName'].map(code_to_name)

In [None]:
# Write as_mscoco/annotations/<output_mode>.json for the split loaded above
create_annotations_json(
    images_path=images_path,
    annotations=annotations,
    coco_categories=coco_categories,
    output_mode=output_mode,
    ignore_is_crowd=ignore_is_crowd,
)

Create test dataset:

In [None]:
# Test split
output_mode = "test"
images_path = TEST_IMAGES_PATH
annotations_path = TEST_ANNOTATIONS_CSV_PATH

# Index the boxes by image id so that all rows of an image can be selected at once
annotations = pd.read_csv(annotations_path, index_col="ImageID")
# Replace the label codes (column 0 of the class descriptions) by class names (column 1)
code_to_name = labels_description.set_index(0)[1]
annotations['LabelName'] = annotations['LabelName'].map(code_to_name)

In [None]:
# Write as_mscoco/annotations/<output_mode>.json for the split loaded above
create_annotations_json(
    images_path=images_path,
    annotations=annotations,
    coco_categories=coco_categories,
    output_mode=output_mode,
    ignore_is_crowd=ignore_is_crowd,
)

Create train dataset to check overfitting

- 10 images from test

In [16]:
# Tiny "train_overfit" set built from the test split
output_mode = "train_overfit"
images_path = TEST_IMAGES_PATH
annotations_path = TEST_ANNOTATIONS_CSV_PATH

# Index the boxes by image id so that all rows of an image can be selected at once
annotations = pd.read_csv(annotations_path, index_col="ImageID")
# Replace the label codes (column 0 of the class descriptions) by class names (column 1)
code_to_name = labels_description.set_index(0)[1]
annotations['LabelName'] = annotations['LabelName'].map(code_to_name)
# Keep the first 10 distinct image ids, in order of appearance in the CSV
image_ids = annotations.index.unique()[:10]

In [17]:
# Write as_mscoco/annotations/<output_mode>.json restricted to the selected image ids
create_annotations_json(
    images_path=images_path,
    annotations=annotations,
    coco_categories=coco_categories,
    output_mode=output_mode,
    image_ids=image_ids,
    ignore_is_crowd=ignore_is_crowd,
)

100%|██████████| 10/10 [00:00<00:00, 87.62it/s]


Create symlinks

In [22]:
!ls -all ../input/as_mscoco/

total 36
drwxr-xr-x 6 root root 4096 Aug  4 18:20 .
drwxrwxr-x 7 1000 1000 4096 Aug  4 13:13 ..
drwxr-xr-x 2 root root 4096 Aug  4 14:11 annotations
drwxr-xr-x 3 root root 4096 Jul 12 23:31 annotations_with_iscrowd
drwxr-xr-x 2 root root 4096 Jul  6 23:13 .ipynb_checkpoints
lrwxrwxrwx 1 root root   70 Jul  6 23:10 test -> /home/working_directory/ml/kaggle/OpenImagesObjectDetection/input/test
lrwxrwxrwx 1 root root   71 Aug  4 18:20 train -> /home/working_directory/ml/kaggle/OpenImagesObjectDetection/input/train
drwxr-xr-x 2 root root 4096 Jul 13 06:48 train_overfit
lrwxrwxrwx 1 root root   76 Jul  6 23:03 val -> /home/working_directory/ml/kaggle/OpenImagesObjectDetection/input/validation


In [20]:
# Split to link: image folder and the name it gets inside as_mscoco/
images_path = TRAIN_IMAGES_PATH
output_mode = "train"

In [21]:
output_images_folder = Path(".").resolve().parent / "input" / "as_mscoco" / output_mode
# exists() follows symlinks: a link whose target has moved looks "missing" and
# symlink_to() would then fail with FileExistsError, so drop such a stale link.
if output_images_folder.is_symlink() and not output_images_folder.exists():
    output_images_folder.unlink()
if not output_images_folder.exists():
    output_images_folder.symlink_to(images_path, target_is_directory=True)

In [13]:
# Split to link: image folder and the name it gets inside as_mscoco/
images_path = VALIDATION_IMAGES_PATH
output_mode = "val"

In [14]:
output_images_folder = Path(".").resolve().parent / "input" / "as_mscoco" / output_mode
# exists() follows symlinks: a link whose target has moved looks "missing" and
# symlink_to() would then fail with FileExistsError, so drop such a stale link.
if output_images_folder.is_symlink() and not output_images_folder.exists():
    output_images_folder.unlink()
if not output_images_folder.exists():
    output_images_folder.symlink_to(images_path, target_is_directory=True)

In [15]:
# Split to link: image folder and the name it gets inside as_mscoco/
images_path = TEST_IMAGES_PATH
output_mode = "test"

In [16]:
output_images_folder = Path(".").resolve().parent / "input" / "as_mscoco" / output_mode
# exists() follows symlinks: a link whose target has moved looks "missing" and
# symlink_to() would then fail with FileExistsError, so drop such a stale link.
if output_images_folder.is_symlink() and not output_images_folder.exists():
    output_images_folder.unlink()
if not output_images_folder.exists():
    output_images_folder.symlink_to(images_path, target_is_directory=True)

In [47]:
# Name of the sub-folder of as_mscoco/ that receives the per-image links.
# NOTE(review): the linking cell also reads images_path and image_ids, which
# are not set here - make sure they still refer to the split the overfit image
# ids were taken from.
output_mode = "train_overfit"

In [48]:
output_images_folder = Path(".").resolve().parent / "input" / "as_mscoco" / output_mode

if not output_images_folder.exists():
    output_images_folder.mkdir()

# Link each selected image individually. Using pathlib instead of a shell
# `ln -s` keeps paths with special characters safe and lets the cell be re-run.
for image_id in image_ids:
    file_name = "{}.jpg".format(image_id)
    link = output_images_folder / file_name
    # Skip entries left by a previous run (is_symlink also catches dangling links)
    if link.exists() or link.is_symlink():
        continue
    link.symlink_to(images_path / file_name)


In [49]:
!ls {output_images_folder}

000026e7ee790996.jpg  0002ab0af02e4a77.jpg  00045d609ca3f4eb.jpg
000062a39995e348.jpg  0002cc8afaf1b611.jpg  00068d5450f0358b.jpg
0000c64e1253d68f.jpg  0003d84e0165d630.jpg
000132c20b84269b.jpg  000411001ff7dd4f.jpg


In [21]:
!ls -all ../input/as_mscoco/

total 16
drwxr-xr-x 4 root root 4096 Aug  2 06:31 .
drwxrwxrwx 8 1000 1000 4096 Jul 28 18:47 ..
drwxr-xr-x 2 root root 4096 Aug  2 00:27 annotations
lrwxrwxrwx 1 root root   31 Jul 28 14:58 test -> /home/project/oiv4od/input/test
lrwxrwxrwx 1 root root   32 Aug  2 06:31 train -> /home/project/oiv4od/input/train
drwxr-xr-x 2 root root 4096 Jul 28 11:22 train_overfit
lrwxrwxrwx 1 root root   37 Jul 28 14:58 val -> /home/project/oiv4od/input/validation


Test with pycocotools

In [19]:
from pycocotools import coco

In [20]:
# Bind the class under its own name: calling `coco.COCO` and storing the result
# in `coco` would shadow the imported module and break this cell when re-run.
from pycocotools.coco import COCO

output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations"

# `coco` is the loaded dataset used by the following cells
coco = COCO((output_folder / "test.json").as_posix())

loading annotations into memory...
Done (t=6.48s)
creating index...
index created!


In [22]:
# All image records of the loaded dataset, ordered by image id
image_ids = sorted(coco.getImgIds())
roidb = coco.loadImgs(image_ids)

In [25]:
type(roidb), len(roidb) * 2

(list, 215976)

In [16]:
coco.loadImgs(['5840d582ce4fbe93', ])

[{'file_name': '5840d582ce4fbe93.jpg',
  'height': 683,
  'id': '5840d582ce4fbe93',
  'width': 1024}]

Check the complete datasets for errors:
- images without annotations
- annotations with zero or negative size
- annotations that are out of the image bounds

In [40]:
import json

# Read back the COCO file generated for the train split
output_folder = Path(".").resolve().parent.joinpath("input", "as_mscoco", "annotations")
annotations_file = output_folder.joinpath("train.json")

with annotations_file.open('r') as h:
    annotations = json.load(h)

In [41]:
# Image records keyed by image id, for constant-time lookup during the checks
annotations_images = {im['id']: im for im in annotations['images']}

In [42]:
# Boxes smaller than one pixel are collected; out-of-bounds boxes stop the
# check at the first offending annotation.
invalid_bboxes = []

for a in annotations['annotations']:
    bbox = a['bbox']
    x, y, box_w, box_h = bbox
    img_info = annotations_images[a['image_id']]
    w, h = img_info['width'], img_info['height']

    # Top-left corner inside the image
    assert 0 <= x <= w, "Problem with {}, {}".format(a, img_info)
    assert 0 <= y <= h, "Problem with {}, {}".format(a, img_info)
    # Bottom-right corner inside the image
    assert 0 <= x + box_w <= w, "Problem with {}, {}".format(a, img_info)
    assert 0 <= y + box_h <= h, "Problem with {}, {}".format(a, img_info)

    if min(box_w, box_h) < 1:
        invalid_bboxes.append((a['image_id'], a['id'], bbox))

In [43]:
len(annotations['annotations']), len(annotations['images'])

(3707911, 474169)

In [34]:
# Reload the validation boxes to inspect a single image by hand
annotations_path = VALIDATION_ANNOTATIONS_CSV_PATH
images_path = VALIDATION_IMAGES_PATH

# Index the boxes by image id so that all rows of an image can be selected at once
annotations = pd.read_csv(annotations_path, index_col="ImageID")
# Replace the label codes (column 0 of the class descriptions) by class names (column 1)
code_to_name = labels_description.set_index(0)[1]
annotations['LabelName'] = annotations['LabelName'].map(code_to_name)

In [35]:
image_id = "a2f7ab86fb274aa0"

# Only the size is needed, so the file is closed as soon as it has been read
with Image.open(images_path / "{}.jpg".format(image_id)) as img:
    img_size = img.size

# Same size filter as the one applied during the conversion
if max(img_size) > 2000 or min(img_size) < 100:
    raise RuntimeError(
        "Image '{}' with size {} is rejected by the size filter".format(image_id, img_size)
    )

image_info = {
        "id": image_id,
        "file_name": "{}.jpg".format(image_id),
        "width": img_size[0],
        "height": img_size[1],
}
# The annotations table is a required argument of get_bboxes_labels_meta
bboxes, labels, meta = get_bboxes_labels_meta(img_size, image_id, annotations)

In [39]:
# Build the annotation records of the inspected image, one per box
coco_annotations = []
for i, (bbox, label, m) in enumerate(zip(bboxes, labels, meta)):
    # Flags follow the order of meta_cols
    is_occluded, is_truncated, is_group_of, is_depiction, is_inside = [int(v) for v in m]
    bbox = [int(v) for v in bbox.tolist()]
    coco_annotations.append({
        "id": hash(image_id + "_{}".format(i)),
        "image_id": image_id,
        "category_id": categories[label],
        "IsOccluded": is_occluded,
        "IsTruncated": is_truncated,
        "iscrowd": 0 if ignore_is_crowd else is_group_of,
        "IsDepiction": is_depiction,
        "IsInside": is_inside,
        "area": int(compute_area(bbox)),
        "bbox": bbox,
        "segmentation": [],
    })

In [40]:
coco_annotations

[{'IsDepiction': 1,
  'IsInside': 0,
  'IsOccluded': 0,
  'IsTruncated': 0,
  'area': 132,
  'bbox': [127, 711, 11, 12],
  'category_id': 14,
  'id': -9011256856298810697,
  'image_id': 'a2f7ab86fb274aa0',
  'iscrowd': 0,
  'segmentation': []},
 {'IsDepiction': 1,
  'IsInside': 0,
  'IsOccluded': 0,
  'IsTruncated': 0,
  'area': 180,
  'bbox': [131, 461, 10, 18],
  'category_id': 14,
  'id': -9011256856298810698,
  'image_id': 'a2f7ab86fb274aa0',
  'iscrowd': 0,
  'segmentation': []},
 {'IsDepiction': 1,
  'IsInside': 0,
  'IsOccluded': 0,
  'IsTruncated': 0,
  'area': 96,
  'bbox': [133, 492, 8, 12],
  'category_id': 14,
  'id': -9011256856298810699,
  'image_id': 'a2f7ab86fb274aa0',
  'iscrowd': 0,
  'segmentation': []},
 {'IsDepiction': 1,
  'IsInside': 0,
  'IsOccluded': 0,
  'IsTruncated': 0,
  'area': 150,
  'bbox': [134, 738, 10, 15],
  'category_id': 14,
  'id': -9011256856298810700,
  'image_id': 'a2f7ab86fb274aa0',
  'iscrowd': 0,
  'segmentation': []},
 {'IsDepiction': 1,
  

In [12]:
import json

# Read back the COCO file generated for the small overfit set
output_folder = Path(".").resolve().parent.joinpath("input", "as_mscoco", "annotations")
annotations_file = output_folder.joinpath("train_overfit.json")

with annotations_file.open('r') as h:
    annotations = json.load(h)

In [14]:
# Index the image records once instead of scanning the whole list for every
# annotation; duplicated image ids are reported while building the index.
images_by_id = {}
for im in annotations['images']:
    assert im['id'] not in images_by_id, "Duplicated image id {}".format(im['id'])
    images_by_id[im['id']] = im

for a in annotations['annotations']:
    bbox = a['bbox']
    # Every annotation must refer to exactly one known image
    assert a['image_id'] in images_by_id, "No image record for {}".format(a)
    img_info = images_by_id[a['image_id']]
    w = img_info['width']
    h = img_info['height']
    # Both corners of the (x, y, w, h) box must lie inside the image
    assert 0 <= bbox[0] <= w, "Problem with {}, {}".format(a, img_info)
    assert 0 <= bbox[1] <= h, "Problem with {}, {}".format(a, img_info)
    assert 0 <= bbox[0] + bbox[2] <= w, "Problem with {}, {}".format(a, img_info)
    assert 0 <= bbox[1] + bbox[3] <= h, "Problem with {}, {}".format(a, img_info)