# Clean and combine COCO datasets

In [1]:
import os
import json
import pandas as pd

### Functions

In [86]:
def print_category_counts(cct):
    '''
    Prints a dataframe in which each row includes a category and its respective
    number of annotations

    Categories with zero annotations do not appear in the output. If the
    dataset has no annotations at all, an empty dataframe with the usual
    columns is printed instead of raising.

    Args:
        cct (dict): A COCO for Cameratraps JSON file parsed into dict

    Returns:
        n/a
    '''
    # category id -> category name (a later duplicate id overrides an earlier one)
    category_map = {cat['id']: cat['name'] for cat in cct['categories']}
    anns_df = pd.DataFrame(cct['annotations'])
    if 'category_id' not in anns_df.columns:
        # no annotations to group (an empty list yields a column-less frame)
        counts = pd.DataFrame(columns=['category_id', 'counts', 'category_name'])
    else:
        counts = anns_df.groupby(['category_id']).size().reset_index(name='counts')
        counts['category_name'] = counts['category_id'].map(category_map)
    print(counts)


def clean_cct(cct, categories):
    '''
    Clean and filter camera trap annotations in COCO for Cameratraps format

    An annotation is rejected (checks run in this order) when:
      - its category isn't included in the list of selected categories
      - it doesn't have a bounding box
      - its bbox is too large (>=95% of width OR >=95% of height of image)
    Annotations whose image_id has no matching entry in cct['images'] are
    skipped without being recorded as rejects. Only images that still have at
    least one kept annotation are returned.

    The returned image, annotation and category records are shallow copies, so
    editing top-level keys on the cleaned dataset (e.g. remapping category IDs)
    does not alter the input cct. The 'info' value is passed through as-is.

    Args:
        cct (dict): A COCO for Cameratraps JSON file parsed into dict
        categories (list): A list of category IDs to keep

    Returns:
        A tuple composed of a cleaned, filtered cct dict
        and a dict of rejects (key is reason, values are lists of file names,
        one entry per rejected annotation)
    '''
    print(f'No. annotations BEFORE cleaning: {len(cct["annotations"])}')

    image_lookup = {img['id']: img for img in cct['images']}
    wanted_categories = set(categories)  # O(1) membership tests in the loop

    anns_to_keep = []
    # image id -> image copy; a dict de-dupes images (images to annotations is
    # a one-to-many relationship) while keeping first-seen order
    images_by_id = {}
    categories_to_keep = [
        dict(cat) for cat in cct['categories'] if cat['id'] in wanted_categories
    ]
    rejects = {
        'bad_category': [],
        'no_bbox': [],
        'bbox_too_big': [],
    }

    for annotation in cct['annotations']:
        image_id = annotation['image_id']
        image = image_lookup.get(image_id)
        if image is None:
            # orphaned annotation: no image record to attribute a reject to
            continue
        if annotation.get('category_id') not in wanted_categories:
            rejects['bad_category'].append(image['file_name'])
            continue
        bbox = annotation.get('bbox')
        if bbox is None:
            rejects['no_bbox'].append(image['file_name'])
            continue
        # bbox[2] and bbox[3] are compared against the image width and height
        if bbox[2] >= 0.95 * image['width'] or bbox[3] >= 0.95 * image['height']:
            rejects['bbox_too_big'].append(image['file_name'])
            continue
        anns_to_keep.append(dict(annotation))
        if image_id not in images_by_id:
            images_by_id[image_id] = dict(image)

    images_to_keep = list(images_by_id.values())

    print(f'No. annotations AFTER cleaning: {len(anns_to_keep)}')
    print(f'No. images AFTER cleaning: {len(images_to_keep)}')
    for reason, anns in rejects.items():
        print(f'  - {len(anns)} annotations had {reason}')

    return {
      'info': cct['info'],
      'images': images_to_keep,
      'annotations': anns_to_keep,
      'categories': categories_to_keep
    }, rejects

### Animl Dataset (SCI Biosecurity)

In [3]:
# NOTE(review): absolute, machine-specific path to the Animl CCT annotations —
# update it when running this notebook elsewhere
animl_cct_annotations_path = "/home/nrindlaub/classifier-training/mdcache/v5.0a/sci-biosecurity_cct.json"

In [4]:
# parse the Animl CCT annotations into a dict; read as UTF-8 explicitly so the
# result doesn't depend on the machine's locale encoding
with open(animl_cct_annotations_path, 'r', encoding='utf-8') as f:
    animl = json.load(f)

In [90]:
# show per-category annotation counts for the Animl dataset
print_category_counts(animl)

   category_id  counts category_name
0            0       4         empty
1            7       3        animal
2            9   16922          bird
3           10    7821        lizard
4           13   11833           fox
5           14     195           rat
6           18       1         mouse
7           20   30590        rodent
8           21    2736     scrub jay
9           22    1289         skunk


#### Dataset-specific cleaning
For fixing bad CCT formatting or enriching CCT
(e.g. parsing `image['file_name']` to derive `image['location']`; modifying category IDs)

In [8]:
# animl CCT outputs its path in an "original_relative_path" field which is not
# part of the CCT spec, so copy those path values to "file_name" for consistency

def update_file_name(img):
    '''
    Move an image's "original_relative_path" value into its "file_name" field

    The image dict is modified in place. Images without an
    "original_relative_path" key are left untouched, so applying this more
    than once (e.g. re-running the cell) is harmless.

    Args:
        img (dict): An image record from a CCT dict's "images" list

    Returns:
        The same image dict that was passed in
    '''
    if 'original_relative_path' in img:
        img['file_name'] = img.pop('original_relative_path')
    return img

# rewrite every Animl image record so "file_name" holds its relative path
animl['images'] = [update_file_name(image) for image in animl['images']]

In [10]:
# spot-check the first image record after the file_name rewrite
animl['images'][0]

{'id': '3422ea65783a5775971eec489105eebe',
 'file_name': 'X811459F/p_000020_3422ea65783a5775971eec489105eebe.jpg',
 'datetime': '2021-04-03T04:44:20.000Z',
 'location': 'Walnut drainage',
 'width': 2048,
 'height': 1536}

In [31]:
# keep bird (9), lizard (10), fox (13), rat (14), rodent (20), scrub jay (21)
# and skunk (22); this drops empty (0), animal (7) and mouse (18)
animl_categories_to_keep = [9, 10, 13, 14, 20, 21, 22]
animl_clean, rejects = clean_cct(animl, animl_categories_to_keep)

No. annotations BEFORE cleaning: 71394
No. annotations AFTER cleaning: 71284
No. images AFTER cleaning: 69416
  - 0 annotations had no_bbox
  - 102 annotations had bbox_too_big
  - 8 annotations had bad_category


### LILA Data ('rats' from Island Conservation Cameratraps dataset)

In [38]:
# NOTE(review): absolute, machine-specific path to the Island Conservation CCT
# annotations — update it when running this notebook elsewhere
ic_cct_annotations_path = "/home/nrindlaub/images/island/metadata/island_conservation.json"

In [39]:
# parse the Island Conservation CCT annotations into a dict; the path variable
# for this dataset is `ic_cct_annotations_path`. Read as UTF-8 explicitly so
# the result doesn't depend on the machine's locale encoding
with open(ic_cct_annotations_path, 'r', encoding='utf-8') as f:
    ic = json.load(f)

In [92]:
# show per-category annotation counts for the Island Conservation dataset
print_category_counts(ic)

    category_id  counts               category_name
0             0   77670                       empty
1             1     150                         cow
2             2     845                      donkey
3             3    6284                      iguana
4             4     967                       raven
5             5    4737                         cat
6             6     160                         dog
7             7       3                         NaN
8             8    6237                       human
9             9     967                     unknown
10           10     232           white-winged_dove
11           11     147                        bird
12           12      38                   passerine
13           13       1                         owl
14           14   16341               burrowing_owl
15           15       1                  barred_owl
16           16       6                 green_heron
17           17       9            american_kestrel
18          

#### Dataset-specific cleaning
For fixing bad CCT formatting or enriching CCT 

e.g. parsing `image['file_name']` to derive `image['location']`

In [52]:
ic_categories_to_keep = [7]  # just rats
# NOTE: this rebinds `rejects`, replacing the Animl rejects from the earlier cell
ic_clean, rejects = clean_cct(ic, ic_categories_to_keep)

No. annotations BEFORE cleaning: 142341
No. annotations AFTER cleaning: 16335
No. images AFTER cleaning: 6883
  - 126003 annotations had bad_category
  - 0 annotations had no_bbox
  - 3 annotations had bbox_too_big


In [53]:
# derive image['location'] for IC images from the first two components of
# image['file_name'] (e.g. 'dominicanrepublic/camara116')
for img in ic_clean['images']:
    path = '/'.join(img['file_name'].split('/', 2)[:2])
    img['location'] = path

ic_clean['images'][0]


{'id': 'dominicanrepublic_camara116_cam11618mayo2017b_dominicanrepublic_cam11618mayo2017b_20170126_033638_img_0090',
 'file_name': 'dominicanrepublic/camara116/cam11618mayo2017b/dominicanrepublic_cam11618mayo2017b_20170126_033638_img_0090.jpg',
 'width': 1920,
 'height': 1080,
 'location': 'dominicanrepublic/camara116'}

In [56]:
# convert IC rat category ID to Animl's
ic_rat_id = 7      # 'rat' in the Island Conservation dataset
animl_rat_id = 14  # 'rat' in the Animl dataset (13 is fox)

# Build new records instead of editing them in place: clean_cct may hand back
# the very same annotation/category dicts that live in `ic`, and mutating
# those would silently rewrite the source dataset's category IDs as well.
ic_clean['annotations'] = [
    {**ann, 'category_id': animl_rat_id} if ann['category_id'] == ic_rat_id else ann
    for ann in ic_clean['annotations']
]
ic_clean['categories'] = [
    {**cat, 'id': animl_rat_id} if cat['id'] == ic_rat_id else cat
    for cat in ic_clean['categories']
]

ic_clean['annotations'][0]

{'id': '6e80a662-df2b-11ea-820f-000d3a74c7de',
 'image_id': 'dominicanrepublic_camara116_cam11618mayo2017b_dominicanrepublic_cam11618mayo2017b_20170126_033638_img_0090',
 'category_id': 14,
 'bbox': [982.0799999999999, 620.028, 294.912, 74.952]}

In [57]:
# the remaining IC category should now carry Animl's rat ID (14)
ic_clean['categories']

[{'id': 14, 'name': 'rat'}]

### Combine datasets

In [58]:
# merge datasets into one CCT dict: images and annotations are concatenated
# (Animl first, then IC); categories and info are taken from the Animl dataset
coco_out = {
    'images': animl_clean['images'] + ic_clean['images'],
    'annotations': animl_clean['annotations'] + ic_clean['annotations'],
    'categories': animl_clean['categories'],
    'info': animl_clean['info'],
}

print(f'combined dataset contains {len(coco_out["annotations"])} annotations in {len(coco_out["images"])} images of the following categories: \n {list(map(lambda x: x["name"], coco_out["categories"]))}')

In [89]:
# sanity check: the merged annotation and image lists must be exactly as long
# as the two cleaned datasets combined, then show the merged per-category counts
# NOTE(review): this does not detect image or annotation IDs that collide
# between the two datasets
assert len(coco_out["annotations"]) == len(animl_clean["annotations"]) + len(ic_clean["annotations"])
assert len(coco_out["images"]) == len(animl_clean["images"]) + len(ic_clean["images"])
print_category_counts(coco_out)

   category_id  counts category_name
0            9   16922          bird
1           10    7821        lizard
2           13   11733           fox
3           14   16530           rat
4           20   30590        rodent
5           21    2734     scrub jay
6           22    1289         skunk


In [60]:
# write the combined CCT dict to disk as a JSON file
with open('/home/nrindlaub/classifier-training/mdcache/v5.0a/coco_out.json', 'w') as f:
    json.dump(coco_out, f)
