In [1]:
import os
import yaml
import urllib
from PIL import Image
from enum import Enum
from pycocotools.coco import COCO

import xml.etree.cElementTree as ET
import glob
import argparse
import numpy as np
import json
import numpy
import cv2
from collections import OrderedDict
import scipy.misc
from skimage import measure   
from shapely.geometry import Polygon, MultiPolygon, MultiPoint
import random
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
import shutil
import pickle
import pandas as pd

BASE_DIR = '/media/dean/datastore1/datasets/BerkeleyDeepDrive/'
WORKING_DIR = os.path.join(BASE_DIR, 'scalabel/darknet/')
IMAGE_LIST_DIR = os.path.join(BASE_DIR, 'bdd100k/images/100k/train/image_list.yml')
LABEL_LIST_DIR = os.path.join(BASE_DIR, 'bdd100k/labels/bdd100k_labels_images_train.json')
COCO_DIRECTORY = os.path.join(WORKING_DIR, 'data/coco')
DARKNET_TRAINING_DIR = os.path.join('/media/dean/datastore1/datasets/darknet/data/coco/images/train2014')
img_prefix = 'COCO_train2014_0000'
DEFAULT_IMG_EXTENSION = '.jpg'

BDD100K_ANNOTATIONS_FILE = os.path.join(COCO_DIRECTORY,'annotations/bdd100k_instances_train2014.json')
SCALABEL_FORMAT_ANNOTATIONS = os.path.join(COCO_DIRECTORY,'annotations/s3_bdd100k_scalabel.json')
SCALABEL_FORMAT_ANNOTATIONS_PT2 = os.path.join(COCO_DIRECTORY,'annotations/s3_bdd100k_scalabel_pt2.json')
SCALABEL_FORMAT_ANNOTATIONS_PT3 = os.path.join(COCO_DIRECTORY,'annotations/s3_bdd100k_scalabel_pt3.json')
S3_BUCKET = 'kache-scalabel/bdd100k/images/100k/train/'

In [2]:
def maybe_download(source_url, filename):
    os.makedirs(DARKNET_TRAINING_DIR, exist_ok = True)
    filepath = os.path.join(DARKNET_TRAINING_DIR, filename)
    if os.path.exists(source_url) and not os.path.exists(filepath):
        # Copy image into training directory
        print('Copying File', source_url, 'to file:', filepath)
        shutil.copyfile(source_url, filepath)
    elif not os.path.exists(filepath):
        filepath, _ = urllib.request.urlretrieve(source_url, filepath)
        statinfo = os.stat(filepath)
    return filepath

In [3]:
class Format(Enum):
    scalabel = 0
    coco = 1
    darknet = 2
    bdd = 3
    vgg = 4

In [4]:
class Dataset(object):
    def __init__(self, annotations_list, s3_bucket = None, image_list = None, data_format=Format.scalabel, output_path=WORKING_DIR, pickle_file = None):
        self._images = {}
        self._annotations = {}
        self.s3_bucket = s3_bucket
        
        # Check if pickle_file is None or does not exist
        if pickle_file and os.path.exists(pickle_file):
            self._pickle_file = pickle_file
            pickle_in = open(self._pickle_file,"rb")
            pickle_dict = pickle.load(pickle_in)
            self._images = pickle_dict['images']
            self._annotations = pickle_dict['annotations']
        else:
            path = os.path.normpath(image_list)
            self._pickle_file = "{}.pickle".format('_'.join(path.split(os.sep)[5:]))
        
            # Scalabel Data Handler 
            if data_format == Format.scalabel:
                with open(image_list, 'r') as stream:
                    image_data = yaml.load(stream)
                    if image_data:
                        for img in image_data:
                            img_url = img['url']
                            fname = os.path.split(img_url)[-1]
                            full_path = maybe_download(img_url, img_prefix+fname)
                            if s3_bucket:
                                self.send_to_s3(os.path.join(DARKNET_TRAINING_DIR, img_prefix+fname))
                                
                            im = Image.open(full_path)
                            width, height = im.size
                            self._images[img_prefix+fname] = {'url': img_url, 'coco_path': full_path,
                                                 'width': width, 'height': height}


                # Import Labels            
                with open(annotations_list, 'r') as f:
                    data = json.load(f)

                    for ann in data:
                        fname = os.path.split(ann['url'])[-1]
                        self._annotations[img_prefix+fname] = ann['labels']
                        img_data = self._images[img_prefix+fname]
                        img_data['attributes'] = ann['attributes']
                        img_data['videoName'] = ann['videoName']
                        img_data['timestamp'] = ann['timestamp']
                        img_data['index'] = ann['index']
                        
                        self._images[img_prefix+fname] = img_data

                        
            # BDD100K Data Handler 
            elif data_format == Format.bdd:
                with open(image_list, 'r') as stream:
                    image_data = yaml.load(stream)
                    start_idx = int(1e6)
                    if image_data:
                        for idx, img in enumerate(image_data, start=start_idx):
                            img_url = img['url']
                            fname = os.path.split(img_url)[-1]
                            full_path = maybe_download(img_url, img_prefix+fname)
                            im = Image.open(full_path)
                            width, height = im.size
                            
                            if s3_bucket:
                                img_url = self.send_to_s3(os.path.join(DARKNET_TRAINING_DIR, fname))
                                
                            self._images[img_prefix+fname] = {'url': img_url, 'coco_path': full_path,
                                                              'width': width, 'height': height, 'labels': [], 
                                                              'index': idx, 'timestamp': 10000}
                    print('Image Length:', len(self._images))
                # Get labels
                with open(annotations_list, 'r') as f:
                    data = json.load(f)
                    ann_idx = 0
                    for img_label in data:
                        fname = img_label['name']
                        img_key = img_prefix+fname
                        self._annotations[img_key] = []
                        img_data = self._images[img_key]
                        img_data['attributes'] = img_label.get('attributes', None)

                        for label in img_label['labels']:
                            label['id'] = ann.get('id', ann_idx)
                            label['manual'] =  ann.get('manualShape', True)
                            label['manualAttributes'] = ann.get('manualAttributes', True)
                            label['poly2d'] = ann.get('poly2d', None)
                            label['box3d'] = ann.get('box3d', None)
                            label['box2d'] = ann['box2d']

                            label['category'] = ann['category']

                            img_data['labels'].append(label)
                            ann_idx +=1

                        self._images[img_key] = img_data
                        self._annotations[img_key].extend(img_data['labels'])

            
            # VGG Data Handler (Legacy system)
            elif data_format == Format.vgg:
                HEADER_ROW=['filename', 'file_size', 'file_attributes', 'region_count', 'region_id', 'region_shape_attributes', 'region_attributes']
                vgg_annotations = pd.read_csv(annotations_list, names=HEADER_ROW, skiprows=1)
                img_paths = sorted(set(vgg_annotations['filename'].tolist()))

                num_imgs = len(img_paths)
                ann_idx = int(5e6)

                # loop through each image
                urlstofilepaths = {}
                img = {}
                start_idx = int(1e6)
                for idx, img_url in enumerate(img_paths, start=start_idx):
                    img = {}
                    # Download Image if not exist
                    fname = '_'.join(img_url.split('/')[-2:])
                    urlstofilepaths[img_url] = maybe_download(img_url, os.path.join(DARKNET_TRAINING_DIR, img_prefix+fname))
                    if s3_bucket:
                        s3_path = self.send_to_s3(urlstofilepaths[img_url])
                    
                    # Get Image Size in Bytes
                    img_file_size =  os.stat(urlstofilepaths[img_url]).st_size
                    img['name'] = img_prefix+fname
                    img['url'] = s3_path
                    img['videoName'] = ''
                    img['file_size'] = img_file_size
                    img['index'] = idx
                    img['timestamp'] = 10000                    
                    img['labels'] = []
                    img['attributes'] = {'weather': 'clear',
                                         'scene': 'highway',
                                         'timeofday': 'night'}                    
                    self._images[img_prefix+fname] = img
                    self._annotations[img_prefix+fname] = []
                    
                    for annotation in [x for x in vgg_annotations.as_matrix() if x[0].lower() == img_url.lower()]:
                        ann = {}
                        ann['id'] = ann_idx
                        ann['attributes'] = {'Occluded': False, 'Truncated': False}
                        ann['manual'] = True
                        ann['poly2d'] = None
                        ann['box3d'] = None
                        ann['box2d'] = None
                        d = ast.literal_eval(annotation[5])
        
                        if d:
                            if float(d['x']) < 0.0:
                                d['x'] = 0.0
                            if float(d['y']) < 0.0:
                                d['y'] = 0.0
                            if float(d['height']) <= 0.0:
                                d['height'] = 1.0

                            if float(d['width']) <= 0.0:
                                d['width'] = 1.0   
                
                            ann['box2d'] = {'x1': d['x'],
                                            'x2': d['x'] + d['width'],
                                            'y1': d['y'],
                                            'y2': d['y'] + d['height']}
                        
                        
                        cls = ast.literal_eval(annotation[6])
                        cat = None
                        if cls:
                            cat = cls['type'].lower().strip()
                        if not cat or cat == '' or cat == 'fire hydrant':
                            continue
                        elif cat == 'tlr':
                            ann['attributes']['Traffic Light Color'] = [2, 'R']
                            ann['category'] = 'traffic light'
                        elif cat == 'tlg':
                            ann['attributes']['Traffic Light Color'] = [1, 'G']
                            ann['category'] = 'traffic light'
                        elif cat == 'tla':
                            ann['attributes']['Traffic Light Color'] = [3, 'Y']
                            ann['category'] = 'traffic light'
                        elif cat == 'tlna' or cat == 'traffic light':
                            ann['attributes']['Traffic Light Color'] = [0, 'NA']
                            ann['category'] = 'traffic light'
                        elif cat == 'motorbike':
                            ann['category'] = 'motor bike'
                        elif cat == 'speedlimitsign' or cat == 'stop sign' or cat == 'cone' or cat == 'clock':
                            cat = 'traffic sign'
                        elif cat not in category_names:
                            continue
                        else: # Verify category exists
                            ann['category'] =  ids2cats[cats2ids[cat]]
                            
                        
                        img['labels'].append(ann)
                        ann_idx += 1
                    self._annotations[img_prefix+fname].extend(img['labels'])
                        
                        
            # Save object to picklefile
            pickle_dict = {'images':self._images,'annotations':self._annotations}
            with open(self._pickle_file,"wb") as pickle_out:
                pickle.dump(pickle_dict, pickle_out)            
            
        print(len(self._images))
    
    def send_to_s3(self, img_path):
        s3_path = os.path.join(self.s3_bucket,os.path.split(img_path)[-1])
        
#         exists = !aws s3 ls $s3_path
#         if not exists:
#             s3_bucket = 's3://'+self.s3_bucket
#             res = !aws s3 cp $img_path $s3_bucket
#             print(res)
        return os.path.join('https://s3-us-west-2.amazonaws.com', s3_path)


SyntaxError: invalid syntax (<ipython-input-4-87f35a926dfd>, line 83)

In [None]:
example_set = Dataset(image_list = IMAGE_LIST_DIR, annotations_list = LABEL_LIST_DIR, data_format = Format.bdd, s3_bucket=S3_BUCKET)

In [None]:
with open(SCALABEL_FORMAT_ANNOTATIONS, 'w') as output_json_file:
    imgs_list = list(example_set._images.values())[:250]
    json.dump(imgs_list, output_json_file)
# with open(SCALABEL_FORMAT_ANNOTATIONS_PT2, 'w') as output_json_file:
#     imgs_list = list(example_set._images.values())[25001:50000]
#     json.dump(imgs_list, output_json_file)
# with open(SCALABEL_FORMAT_ANNOTATIONS_PT3, 'w') as output_json_file:
#     imgs_list = list(example_set._images.values())[50001:]
#     json.dump(imgs_list, output_json_file)

In [None]:
night_images = [example_set._images[img] for img in example_set._images if 'night' in example_set._images[img]['attributes']['timeofday']]
print('There are {} night images in this dataset.'.format(len(night_images)))
print(night_images[:10])

In [None]:
BDD100K_LABELS_PATH = os.path.join('/media/dean/datastore1/datasets/Scripts/','BDD100k_Classes.csv')
BDD100K_HEADER_ROW = ['class', 'super-category', 'special', 'description']

In [None]:
# Get RoadCOCO Labels to Use as Ground Truth
gt_labels = pd.read_csv(BDD100K_LABELS_PATH, names=BDD100K_HEADER_ROW, skiprows=1)
gt_labels.head()

In [None]:
# Represent Category IDs using RoadCOCO Labels
cats2ids = {}
for i, label in enumerate(gt_labels['class'].tolist()):
    cats2ids[str(label).lower()] = i
ids2cats = {i: v for v, i in cats2ids.items()}
    


# Build Categories List in MS RoadCOCO Format
categories = [] 
for label in gt_labels.as_matrix():
    category = str(label[0]).lower()
    cat_id = cats2ids[category]
    
    
    
    sup_cat = ids2cats[cats2ids[str(label[1]).lower()]]
    
    categories.append({"id": cat_id, "name": category, "supercategory":sup_cat})   
print (categories)

In [None]:
category_names = [category['name'] for category in categories]
print('Custom BDD100k categories:\n{}\n'.format('\n'.join(category_names)))

In [None]:
images, anns = [], []
img_offset, ann_index = 10000001, 100000000
num_imgs = len(example_set._annotations.keys())
    
for img_id, fname in enumerate(example_set._annotations.keys()):
    width, height = example_set._images[fname]['width'], example_set._images[fname]['height'] 
    
    if not fname.startswith(img_prefix):
        fname = img_prefix+fname
    dic = {'file_name': fname, 'id': img_offset+img_id, 'height': height, 'width': width}
    images.append(dic)
    
    # xy coords: [xstart, ystart, xstop, ystop] -> bbox = [x,y,width,height]
    for annotation in [x for x in example_set._annotations[fname] if x['category'] in category_names]:
        bbox = annotation['box2d']

        if bbox:
            # xy coords: [xstart, ystart, xstop, ystop] -> bbox = [x,y,width,height]
            xstart, ystart, xstop, ystop = float(bbox['x1']),float(bbox['y1']),float(bbox['x2']),float(bbox['y2'])

            if xstart < 0:
                xstart = 0.0
            if ystart < 0:
                ystart = 0.0
            if ystop <= 0:
                ystop = 3.0
            if xstop <= 0:
                xstop = 3.0

            # Get Points from Bounding Box
            pts = []
            pts.append((xstart , xstop))
            pts.append((xstop , ystart))
            pts.append((xstop , ystop))
            pts.append((xstart , ystop))

            segmentations = []
            segmentations.append([])  
            width = xstop - xstart
            height = ystop - ystart
            bbox = (xstart, ystart, width, height)
            area = float(width*height)

            annotation = {
                'segmentation': segmentations,
                'iscrowd': 0,
                'image_id': img_offset+img_id, # Don't want to conflict with existing dataset
                'category_id': cats2ids[annotation['category']],
                'id': ann_index,
                'bbox': bbox,
                'area': area
            }
            ann_index+=1
            anns.append(annotation)
        

In [None]:
print(len(anns))

In [None]:
from datetime import datetime

INFO = {
    "description": "Road Object-Detections Dataset based on MS COCO",
    "url": "https://kache.ai",
    "version": "0.0.1",
    "year": 2018,
    "contributor": "deanwebb",
    "date_created": datetime.utcnow().isoformat(' ')
}

LICENSES = [
    {
        "id": 1,
        "name": "The MIT License (MIT)",
        "url": "https://opensource.org/licenses/MIT",
        "description":  """
                        The MIT License (MIT)
                        Copyright (c) 2017 Matterport, Inc.

                        Permission is hereby granted, free of charge, to any person obtaining a copy
                        of this software and associated documentation files (the "Software"), to deal
                        in the Software without restriction, including without limitation the rights
                        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
                        copies of the Software, and to permit persons to whom the Software is
                        furnished to do so, subject to the following conditions:

                        The above copyright notice and this permission notice shall be included in
                        all copies or substantial portions of the Software.

                        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
                        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
                        THE SOFTWARE.
                        """
    }
]

coco_output = {'info': INFO, 'licenses': LICENSES, 'images':images, 'annotations':anns, 'categories': categories}
with open(BDD10K_ANNOTATIONS_FILE, 'w') as output_json_file:
    json.dump(coco_output, output_json_file)

In [None]:
testing_coco = COCO(BDD10K_ANNOTATIONS_FILE)
category_ids = testing_coco.getCatIds(catNms=list(category_names))
image_ids = testing_coco.getImgIds()
image_data = testing_coco.loadImgs(image_ids[np.random.randint(0, len(image_ids))])[0]
print(image_data)

In [None]:
# load and display instance annotations
image = io.imread(os.path.join(DATACACHE ,image_data['file_name']))
plt.imshow(image); plt.axis('off')
pylab.rcParams['figure.figsize'] = (128.0, 180.0)
annotation_ids = testing_coco.getAnnIds( catIds=category_ids, iscrowd=None)


annotations = testing_coco.loadAnns(annotation_ids)
print(len(annotations))


In [None]:
# Get Dataset Distribution

dataset = {}

for cat in category_ids:
    annotation_ids = testing_coco.getAnnIds(catIds=[cat])
    image_ids = testing_coco.getImgIds(catIds=[cat])
    cat_nm = testing_coco.loadCats(ids=[cat])[0]['name']
    dataset[cat] = (len(annotation_ids), len(image_ids))
    
    print(cat_nm.upper(), '| Annotations:', dataset[cat][0], ' | Images: ',  dataset[cat][1])

In [None]:
# Prepare Annotations for Darknet training
WORKING_DIRECTORY ='/media/dean/datastore1/datasets/darknet_evaluate'
COCO_DIRECTORY = os.path.join(WORKING_DIRECTORY, 'data/coco')
BDD10K_COCO_ANNOTATIONS_FILE = os.path.join(COCO_DIRECTORY, 'annotations', 'bdd10k_instances_train2014.json')
IMAGES_DIRECTORY = os.path.join(COCO_DIRECTORY, 'images', 'train2014')
LABELS_DIRECTORY = os.path.join(COCO_DIRECTORY, 'labels','train2014')
CATEGORY_NAMES = os.path.join(WORKING_DIRECTORY, 'data', 'coco.bdd100k.names')


if not os.path.exists(os.path.join(COCO_DIRECTORY, 'labels/train2014/manifast.txt')):
    yolo_convert_output = os.path.join(COCO_DIRECTORY, 'labels','convert2yolo_results.txt')
    !python3 $WORKING_DIRECTORY/convert2Yolo/example.py --datasets COCO --img_path "{IMAGES_DIRECTORY}" --label "{BDD10K_COCO_ANNOTATIONS_FILE}" --convert_output_path "{LABELS_DIRECTORY}" --img_type [".jpg"] --manipast_path $LABELS_DIRECTORY --cls_list_file $CATEGORY_NAMES &>> $yolo_convert_output
        