In [15]:
import json
import os
import torch
import random
import xml.etree.ElementTree as ET
import torchvision.transforms.functional as FT

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Class names -> integer ids. Real classes are numbered from 1; id 0 is kept for 'background'.
voc_labels = ('with_mask', 'without_mask')
label_map = {class_name: class_id for class_id, class_name in enumerate(voc_labels, start=1)}
label_map['background'] = 0
# Integer ids -> class names (inverse of label_map)
rev_label_map = dict((class_id, class_name) for class_name, class_id in label_map.items())

# One drawing colour per entry of label_map, assigned in label_map's insertion order.
# Palette taken from https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
distinct_colors = ['#e6194b', '#3cb44b', '#ffe119']
label_color_map = {class_name: distinct_colors[position] for position, class_name in enumerate(label_map)}


def parse_annotation(annotation_path):
    """
    Parse one XML annotation file into bounding boxes, class labels and difficulty flags.

    Every <object> element in the file is inspected. Objects whose <name> (lower-cased and
    stripped) is not a key of `label_map` are skipped, as are objects without a <name>.

    :param annotation_path: path to the annotation XML file
    :return: dict with three parallel lists:
             'boxes' - [xmin, ymin, xmax, ymax] per object, each coordinate reduced by 1,
             'labels' - the `label_map` id of each object,
             'difficulties' - 1 if the object's <difficult> text is '1', else 0
    """
    root = ET.parse(annotation_path).getroot()

    boxes = list()
    labels = list()
    difficulties = list()
    # Named `obj` rather than `object` so the builtin is not shadowed
    for obj in root.iter('object'):

        # findtext() returns None for a missing tag; treat that like an unknown class
        label = (obj.findtext('name') or '').lower().strip()
        if label not in label_map:
            continue

        # Tolerate a missing or empty <difficult> tag: count it as "not difficult"
        # instead of raising AttributeError on `.text` of None
        difficult = int((obj.findtext('difficult') or '').strip() == '1')

        # Coordinates are shifted by one (presumably 1-based pixels -> 0-based; confirm against the dataset)
        bbox = obj.find('bndbox')
        xmin, ymin, xmax, ymax = (int(bbox.find(tag).text) - 1 for tag in ('xmin', 'ymin', 'xmax', 'ymax'))

        boxes.append([xmin, ymin, xmax, ymax])
        labels.append(label_map[label])
        difficulties.append(difficult)

    return {'boxes': boxes, 'labels': labels, 'difficulties': difficulties}


def _collect_split(split_path):
    """
    Walk one dataset split and gather image paths together with their parsed annotations.

    Every sub-directory of `split_path` is read as <sub-directory>/annotations/*.xml; the
    matching image path is built as <sub-directory>/images/<annotation stem>.jpg (the image
    file itself is not checked for existence). Annotations that yield no boxes are dropped.

    :param split_path: folder containing one sub-directory per data source
    :return: (images, objects, n_objects) - parallel lists of image paths and annotation dicts,
             plus the total number of bounding boxes kept
    """
    images = list()
    objects_per_image = list()
    n_objects = 0

    # os.listdir() order is arbitrary; sorting makes the generated JSONs reproducible
    for entry in sorted(os.listdir(split_path)):

        source_dir = os.path.join(split_path, entry)
        # Skip macOS '.DS_Store' and any other stray file: only directories hold data
        if entry == '.DS_Store' or not os.path.isdir(source_dir):
            continue
        print("PATH:", source_dir)

        ann_path = os.path.join(source_dir, 'annotations')
        img_path = os.path.join(source_dir, 'images')

        for ann_file in sorted(os.listdir(ann_path)):
            # Only XML files are annotations (this also filters out '.DS_Store' and the like)
            if not ann_file.lower().endswith('.xml'):
                continue
            print(ann_file)
            # splitext keeps dots inside the stem ('a.b.xml' -> 'a.b'), unlike cutting at the first '.'
            name = os.path.splitext(ann_file)[0]
            # Parse annotation's XML file
            objects = parse_annotation(os.path.join(ann_path, ann_file))
            if len(objects['boxes']) == 0:
                continue
            # Count boxes, not the keys of the annotation dict
            n_objects += len(objects['boxes'])
            objects_per_image.append(objects)
            images.append(os.path.join(img_path, name + '.jpg'))

    assert len(objects_per_image) == len(images)
    return images, objects_per_image, n_objects


def _dump_json(payload, output_folder, filename):
    """Serialise `payload` as JSON to `filename` inside `output_folder`."""
    with open(os.path.join(output_folder, filename), 'w') as j:
        json.dump(payload, j)


def create_data_lists(train_path, test_path, val_path, output_folder):
    """
    Create lists of images, the bounding boxes and labels of the objects in these images, and save these to file.

    Files written to `output_folder`: TRAIN_images.json, TRAIN_objects.json, label_map.json,
    CROWDS_images.json, CROWDS_objects.json (the test split), VAL_images.json, VAL_objects.json.
    The folder is created if it does not exist. The object totals printed for each split are
    the number of bounding boxes kept.

    :param train_path: path to training images and annotations (converted to an absolute path)
    :param test_path: path to test images and annotations (used as given)
    :param val_path: path to validation images and annotations (used as given)
    :param output_folder: folder where the JSONs must be saved
    """
    train_path = os.path.abspath(train_path)
    print("TRAIN PATH:", train_path)

    # Fail early (or create the folder) before spending time parsing annotations
    os.makedirs(output_folder, exist_ok=True)

    # Training data
    train_images, train_objects, n_objects = _collect_split(train_path)

    _dump_json(train_images, output_folder, 'TRAIN_images.json')
    _dump_json(train_objects, output_folder, 'TRAIN_objects.json')
    _dump_json(label_map, output_folder, 'label_map.json')  # save label map too

    print('\nThere are %d training images containing a total of %d objects. Files have been saved to %s.' % (
        len(train_images), n_objects, os.path.abspath(output_folder)))

    # Test data
    print("TEST PATH:", test_path)
    test_images, test_objects, n_objects = _collect_split(test_path)

    _dump_json(test_images, output_folder, 'CROWDS_images.json')
    _dump_json(test_objects, output_folder, 'CROWDS_objects.json')

    print('\nThere are %d test images containing a total of %d objects. Files have been saved to %s.' % (
        len(test_images), n_objects, os.path.abspath(output_folder)))

    # Validation data
    print("VAL PATH:", val_path)
    val_images, val_objects, n_objects = _collect_split(val_path)

    _dump_json(val_images, output_folder, 'VAL_images.json')
    _dump_json(val_objects, output_folder, 'VAL_objects.json')

    print('\nThere are %d validation images containing a total of %d objects. Files have been saved to %s.' % (
        len(val_images), n_objects, os.path.abspath(output_folder)))

In [None]:
# Colab only: mount Google Drive so the dataset folders below are reachable
from google.colab import drive
drive.mount('/content/drive')

# Build the train / test / validation lists and write the JSON files to the output folder on Drive
create_data_lists(
    train_path='/content/drive/My Drive/masked-face-detector/data/train',
    test_path='/content/drive/My Drive/masked-face-detector/data/test',
    val_path='/content/drive/My Drive/masked-face-detector/data/validation',
    output_folder='/content/drive/My Drive/masked-face-detector/output',
)