# Convert annotations to MS Coco format with more meta fields

Convert datasets: validation and test
```
test (108159) + 70% validation (25147) -> train
30% validation (10777) -> val
```
We added into annotations:
- 'IsOccluded'
- 'IsTruncated'
- 'IsDepiction'
- 'IsInside'

Apply prefiltering: 
- images with any dimension larger 2000 pixels are ignored
- do not write annotations without bboxes

In [1]:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path

import numpy as np
import pandas as pd

import json
from PIL import Image

import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

In [2]:
VALIDATION_IMAGES_PATH = Path(".").resolve().parent / "input" / "validation"
VALIDATION_ANNOTATIONS_CSV_PATH = Path(".").resolve().parent / "input" / "validation-annotations-bbox.csv"
VALIDATION_CONFIDENCE_CSV_PATH = Path(".").resolve().parent / "input" / "validation-annotations-human-imagelabels-boxable.csv"
VALIDATION_IMGINFO_CSV_PATH = Path(".").resolve().parent / "input" / "validation-images-with-rotation.csv"
LABELS_DESCRIPTION_CSV_PATH = Path(".").resolve().parent / "input" / "class-descriptions-boxable.csv"

In [3]:
TEST_IMAGES_PATH = Path(".").resolve().parent / "input" / "test"
TEST_ANNOTATIONS_CSV_PATH = Path(".").resolve().parent / "input" / "test-annotations-bbox.csv"
TEST_CONFIDENCE_CSV_PATH = Path(".").resolve().parent / "input" / "test-annotations-human-imagelabels-boxable.csv"

In [4]:
labels_description = pd.read_csv(LABELS_DESCRIPTION_CSV_PATH, header=None)
labels = labels_description[1].values.tolist()

coco_categories = []
for i, label in enumerate(labels):
    coco_categories.append({
        'id': i,
        'name': label,
        'supercategory': label
    })
    
categories = {}
for d in coco_categories:
    categories[d['name']] = d['id']    

In [5]:
xyxy_cols = ['XMin', 'YMin', 'XMax', 'YMax']
meta_cols = ['IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside']
ignore_is_crowd = True


def get_bboxes_labels_meta(canvas_size, image_id):
    bboxes = annotations.loc[image_id, xyxy_cols].values
    labels = annotations.loc[image_id, 'LabelName']
    meta = annotations.loc[image_id, meta_cols].values
    
    if bboxes.ndim == 1:
        bboxes = bboxes[None, :]
        meta = meta[None, :]
    
    if isinstance(labels, str):        
        labels = np.array([labels, ])
        
    # BBox format should be (x, y, w, h)
    bboxes[:, 0] *= canvas_size[0]
    bboxes[:, 1] *= canvas_size[1]
    
    bboxes[:, 2] *= canvas_size[0]
    bboxes[:, 2] -= bboxes[:, 0]
    
    bboxes[:, 3] *= canvas_size[1]
    bboxes[:, 3] -= bboxes[:, 1]
    return bboxes, labels, meta


def compute_area(bbox):
    return bbox[2] * bbox[3]


Create validation dataset:

- 30% validation (10777)


Create train dataset:

- 70% validation (25147)

In [6]:
images_path = VALIDATION_IMAGES_PATH
annotations_path = VALIDATION_ANNOTATIONS_CSV_PATH
output_mode = "val_0.1"


annotations = pd.read_csv(annotations_path, index_col="ImageID")
annotations['LabelName'] = annotations['LabelName'].map(labels_description.set_index(0)[1])
image_ids = annotations.index.unique()

In [7]:
# !pip install scikit-learn
from sklearn.model_selection import train_test_split


train_image_ids, val_image_ids = train_test_split(image_ids, test_size=0.1)

In [8]:
coco_images = []
coco_annotations = []


for image_id in val_image_ids:
    img = Image.open(images_path / "{}.jpg".format(image_id))
    
    if max(img.size) > 2000 or min(img.size) < 100:
        continue
    
    image_info = {
            "id": image_id,
            "file_name": "{}.jpg".format(image_id),
            "width": img.size[0],
            "height": img.size[1],
    }    
    bboxes, labels, meta = get_bboxes_labels_meta(img.size, image_id)
    
    if len(bboxes) == 0:
        print("No bboxes for image_id '{}'".format(image_id))
        continue

    coco_images.append(image_info)    
    for i, (bbox, label, m) in enumerate(zip(bboxes, labels, meta)):
        m = [int(v) for v in m]
        annotation_id = hash(image_id + "_{}".format(i))
        annotation_info = {
            "id": annotation_id,
            "image_id": image_id,
            "category_id": categories[label],
            "IsOccluded": m[0],
            "IsTruncated": m[1],
            "iscrowd": m[2] if not ignore_is_crowd else 0,
            "IsDepiction": m[3],
            "IsInside": m[4],            
            "area": int(compute_area(bbox)),
            "bbox": [int(v) for v in bbox.tolist()],
            "segmentation": [],
        } 
        coco_annotations.append(annotation_info)    

In [9]:
len(coco_images), len(coco_annotations)

(3587, 21081)

In [10]:
output_coco_annotations = {
    "categories": coco_categories,
    "images": coco_images,
    "annotations": coco_annotations
}

In [11]:
output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations" 
if not output_folder.exists():
    output_folder.mkdir(parents=True)

In [12]:
with open((output_folder / "{}.json".format(output_mode)).as_posix(), 'w') as h:
    json.dump(output_coco_annotations, h)

In [13]:
output_mode = "val_0.9"

In [14]:
coco_images = []
coco_annotations = []


for image_id in train_image_ids:
    img = Image.open(images_path / "{}.jpg".format(image_id))
    
    if max(img.size) > 2000 or min(img.size) < 100:
        continue
    
    image_info = {
            "id": image_id,
            "file_name": "{}.jpg".format(image_id),
            "width": img.size[0],
            "height": img.size[1],
    }    
    bboxes, labels, meta = get_bboxes_labels_meta(img.size, image_id)
    
    if len(bboxes) == 0:
        print("No bboxes for image_id '{}'".format(image_id))
        continue

    coco_images.append(image_info)    
    for i, (bbox, label, m) in enumerate(zip(bboxes, labels, meta)):
        m = [int(v) for v in m]
        annotation_id = hash(image_id + "_{}".format(i))
        annotation_info = {
            "id": annotation_id,
            "image_id": image_id,
            "category_id": categories[label],
            "IsOccluded": m[0],
            "IsTruncated": m[1],
            "iscrowd": m[2] if not ignore_is_crowd else 0,
            "IsDepiction": m[3],
            "IsInside": m[4],            
            "area": int(compute_area(bbox)),
            "bbox": [int(v) for v in bbox.tolist()],
            "segmentation": [],
        } 
        coco_annotations.append(annotation_info)    

In [15]:
len(coco_images), len(coco_annotations)

(32286, 183352)

In [16]:
output_coco_annotations = {
    "categories": coco_categories,
    "images": coco_images,
    "annotations": coco_annotations
}

In [17]:
output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations" 
if not output_folder.exists():
    output_folder.mkdir(parents=True)

In [18]:
with open((output_folder / "{}.json".format(output_mode)).as_posix(), 'w') as h:
    json.dump(output_coco_annotations, h)

In [19]:
output_mode = "val"

In [20]:
coco_images = []
coco_annotations = []


for image_id in image_ids:
    img = Image.open(images_path / "{}.jpg".format(image_id))
    
    if max(img.size) > 2000 or min(img.size) < 100:
        continue
    
    image_info = {
            "id": image_id,
            "file_name": "{}.jpg".format(image_id),
            "width": img.size[0],
            "height": img.size[1],
    }    
    bboxes, labels, meta = get_bboxes_labels_meta(img.size, image_id)
    
    if len(bboxes) == 0:
        print("No bboxes for image_id '{}'".format(image_id))
        continue

    coco_images.append(image_info)    
    for i, (bbox, label, m) in enumerate(zip(bboxes, labels, meta)):
        m = [int(v) for v in m]
        annotation_id = hash(image_id + "_{}".format(i))
        annotation_info = {
            "id": annotation_id,
            "image_id": image_id,
            "category_id": categories[label],
            "IsOccluded": m[0],
            "IsTruncated": m[1],
            "iscrowd": m[2] if not ignore_is_crowd else 0,
            "IsDepiction": m[3],
            "IsInside": m[4],            
            "area": int(compute_area(bbox)),
            "bbox": [int(v) for v in bbox.tolist()],
            "segmentation": [],
        } 
        coco_annotations.append(annotation_info)    

In [21]:
len(coco_images), len(coco_annotations)

(35873, 204433)

In [22]:
output_coco_annotations = {
    "categories": coco_categories,
    "images": coco_images,
    "annotations": coco_annotations
}

In [23]:
output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations" 
if not output_folder.exists():
    output_folder.mkdir(parents=True)

In [24]:
with open((output_folder / "{}.json".format(output_mode)).as_posix(), 'w') as h:
    json.dump(output_coco_annotations, h)

Create train dataset:

- test

In [25]:
images_path = TEST_IMAGES_PATH
annotations_path = TEST_ANNOTATIONS_CSV_PATH
output_mode = "test"

annotations = pd.read_csv(annotations_path, index_col="ImageID")
annotations['LabelName'] = annotations['LabelName'].map(labels_description.set_index(0)[1])
image_ids = annotations.index.unique()

In [26]:
coco_images = []
coco_annotations = []


for image_id in image_ids:    
    img = Image.open(images_path / "{}.jpg".format(image_id))
    
    if max(img.size) > 2000 or min(img.size) < 100:
        continue
    
    image_info = {
            "id": image_id,
            "file_name": "{}.jpg".format(image_id),
            "width": img.size[0],
            "height": img.size[1],
    }    
    bboxes, labels, meta = get_bboxes_labels_meta(img.size, image_id)
    
    if len(bboxes) == 0:
        print("No bboxes for image_id '{}'".format(image_id))
        continue

    coco_images.append(image_info)    
    for i, (bbox, label, m) in enumerate(zip(bboxes, labels, meta)):
        m = [int(v) for v in m]
        annotation_id = hash(image_id + "_{}".format(i))
        annotation_info = {
            "id": annotation_id,
            "image_id": image_id,
            "category_id": categories[label],
            "IsOccluded": m[0],
            "IsTruncated": m[1],
            "iscrowd": m[2] if not ignore_is_crowd else 0,
            "IsDepiction": m[3],
            "IsInside": m[4],            
            "area": int(compute_area(bbox)),
            "bbox": [int(v) for v in bbox.tolist()],
            "segmentation": [],
        } 
        coco_annotations.append(annotation_info)    



In [27]:
len(coco_images), len(coco_annotations)

(107988, 624169)

In [28]:
output_coco_annotations = {
    "categories": coco_categories,
    "images": coco_images,
    "annotations": coco_annotations
}


output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations" 
if not output_folder.exists():
    output_folder.mkdir(parents=True)
    
    
with open((output_folder / "{}.json".format(output_mode)).as_posix(), 'w') as h:
    json.dump(output_coco_annotations, h)    

Create train dataset to check overfitting

- 10 images from test

In [29]:
images_path = TEST_IMAGES_PATH
annotations_path = TEST_ANNOTATIONS_CSV_PATH
output_mode = "train_overfit"

annotations = pd.read_csv(annotations_path, index_col="ImageID")
annotations['LabelName'] = annotations['LabelName'].map(labels_description.set_index(0)[1])
image_ids = annotations.index.unique()
image_ids = image_ids[:10]

In [30]:
coco_images = []
coco_annotations = []


for image_id in image_ids:    
    img = Image.open(images_path / "{}.jpg".format(image_id))
    
    if max(img.size) > 2000 or min(img.size) < 100:
        continue
    
    image_info = {
            "id": image_id,
            "file_name": "{}.jpg".format(image_id),
            "width": img.size[0],
            "height": img.size[1],
    }    
    bboxes, labels, meta = get_bboxes_labels_meta(img.size, image_id)
    
    if len(bboxes) == 0:
        print("No bboxes for image_id '{}'".format(image_id))
        continue

    coco_images.append(image_info)    
    for i, (bbox, label, m) in enumerate(zip(bboxes, labels, meta)):
        m = [int(v) for v in m]
        annotation_id = hash(image_id + "_{}".format(i))
        annotation_info = {
            "id": annotation_id,
            "image_id": image_id,
            "category_id": categories[label],
            "IsOccluded": m[0],
            "IsTruncated": m[1],
            "iscrowd": m[2] if not ignore_is_crowd else 0,
            "IsDepiction": m[3],
            "IsInside": m[4],            
            "area": int(compute_area(bbox)),
            "bbox": [int(v) for v in bbox.tolist()],
            "segmentation": [],
        } 
        coco_annotations.append(annotation_info)    

In [31]:
len(coco_images), len(coco_annotations)

(10, 68)

In [32]:
output_coco_annotations = {
    "categories": coco_categories,
    "images": coco_images,
    "annotations": coco_annotations
}


output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations" 
if not output_folder.exists():
    output_folder.mkdir(parents=True)
    
    
with open((output_folder / "{}.json".format(output_mode)).as_posix(), 'w') as h:
    json.dump(output_coco_annotations, h)    

Create symlinks

In [None]:
output_mode = "val"

In [93]:
output_images_folder = Path(".").resolve().parent / "input" / "as_mscoco" / output_mode
if not output_images_folder.exists():
    output_images_folder.symlink_to(images_path, target_is_directory=True)

In [None]:
output_mode = "test"

In [93]:
output_images_folder = Path(".").resolve().parent / "input" / "as_mscoco" / output_mode
if not output_images_folder.exists():
    output_images_folder.symlink_to(images_path, target_is_directory=True)

In [None]:
output_mode = "train_overfit"

In [25]:
output_images_folder = Path(".").resolve().parent / "input" / "as_mscoco" / output_mode

if not output_images_folder.exists():
    output_images_folder.mkdir()

for image_id in image_ids:
    !ln -s {images_path.as_posix()}/{image_id}.jpg {output_images_folder}/{image_id}.jpg 


In [26]:
!ls {output_images_folder}

000026e7ee790996.jpg  0002ab0af02e4a77.jpg  00045d609ca3f4eb.jpg
000062a39995e348.jpg  0002cc8afaf1b611.jpg  00068d5450f0358b.jpg
0000c64e1253d68f.jpg  0003d84e0165d630.jpg
000132c20b84269b.jpg  000411001ff7dd4f.jpg


Test with pycocotools

In [8]:
from pycocotools import coco

In [10]:
output_folder = Path(".").resolve().parent / "input" / "as_mscoco" / "annotations" 

coco = coco.COCO((output_folder / "val.json").as_posix())

loading annotations into memory...
Done (t=0.37s)
creating index...
index created!


In [11]:
anns = coco.getAnnIds()

In [12]:
len(anns), anns[:2]

(60806, ['1e45fc409ab318ab_0', '1e45fc409ab318ab_1'])

In [13]:
imgs = coco.getImgIds()

In [14]:
len(imgs), imgs[:2]

(10759, ['5840d582ce4fbe93', '21494d2aaaf0d2c1'])

In [16]:
coco.loadImgs(['5840d582ce4fbe93', ])

[{'file_name': '5840d582ce4fbe93.jpg',
  'height': 683,
  'id': '5840d582ce4fbe93',
  'width': 1024}]