In [1]:
import pandas as pd
import random
import urllib3
import os
import shutil
from PIL import Image

In [2]:
def download(url, path, chunk_size=48000, http=None):
    """Stream the resource at ``url`` into the local file ``path``.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    path : str
        Destination file; it is created/overwritten only after the server
        answered with status 200.
    chunk_size : int, optional
        Number of bytes requested from the response per read.
    http : optional
        Object exposing ``request(method, url, preload_content=False)`` such
        as a shared ``urllib3.PoolManager``. Passing one lets many downloads
        reuse connections; when omitted a fresh ``PoolManager`` is created
        for this call.

    Raises
    ------
    OSError
        If the server responds with a status other than 200. Nothing is
        written to ``path`` in that case, so an error page can never be
        mistaken for image data.
    """
    if http is None:
        http = urllib3.PoolManager()

    r = http.request('GET', url, preload_content=False)
    try:
        # Check the status before touching the filesystem so a failed request
        # does not leave a bogus file behind.
        if r.status != 200:
            raise OSError(f'GET {url} failed with HTTP status {r.status}')

        with open(path, 'wb') as out:
            for data in r.stream(chunk_size):
                out.write(data)
    finally:
        # Always hand the connection back, even if opening or writing the
        # file raised.
        r.release_conn()

In [3]:
# Human-readable class names; they become the sub-directory names of the dataset.
labels = ['airplane', 'car', 'horse']

# Values matched against the CSV 'LabelName' column, aligned with `labels` by position.
labels_ids = ['/m/0cmf2', '/m/0k4j', '/m/03k3r']

# Images sampled per class for the test split.
test_count = 10

# Images sampled per class for the train split (also part of the output folder name),
# and the smallest box area (width * height in the CSV's box coordinates) an object
# must cover to be considered.
train_count, min_area = 100, 0.2

In [4]:
# Test Images
# Test images include a segmentation image
#
# For every label: keep the segmentation rows whose box covers at least
# `min_area` and whose image has exactly one "clean" bounding box, sample up to
# `test_count` distinct images, download each image and store its mask (resized
# to the image size) next to it.

metadata_dir = "/Users/racoon/Desktop/open-images-v5-metadata"

segmentation = pd.read_csv(f"{metadata_dir}/test-annotations-object-segmentation.csv")
bbox = pd.read_csv(f"{metadata_dir}/test-annotations-bbox.csv")

test_subsets = [segmentation[(segmentation['LabelName']==labels_id)] for labels_id in labels_ids]

# The "clean box" criteria do not depend on the segmentation row, so evaluate
# them once for the whole bbox table instead of re-filtering it per row.
clean_bbox = bbox[(bbox["Confidence"] == 1) & (bbox["IsDepiction"] == 0) & (bbox["IsGroupOf"] == 0) & (bbox["IsInside"] == 0) & (bbox["IsTruncated"] == 0)]
clean_counts = clean_bbox["ImageID"].value_counts()
# Images that have exactly one clean box (of any label).
single_box_ids = set(clean_counts[clean_counts == 1].index)

test_groups = []
for rows in test_subsets:
    print(len(rows.index))
    area = (rows["BoxXMax"] - rows["BoxXMin"]) * (rows["BoxYMax"] - rows["BoxYMin"])
    keep = rows[(area >= min_area) & rows["ImageID"].isin(single_box_ids)]
    # Output files are keyed by ImageID, so a second mask for the same image
    # would only overwrite the first; keep one row per image.
    keep = keep.drop_duplicates(subset="ImageID")
    test_groups.append(list(zip(keep["ImageID"], keep["MaskPath"])))

# Never ask for more samples than there are candidates (random.sample would raise).
test_groups = [random.sample(g, min(test_count, len(g))) for g in test_groups]

for name, group in zip(labels, test_groups):
    out_dir = f'dataset_{train_count}/test/{name}'
    os.makedirs(out_dir, exist_ok=True)
    for (image_id, mask_path) in group:
        image_url = f'http://s3.amazonaws.com/open-images-dataset/test/{image_id}.jpg'
        image_path = f'{out_dir}/{image_id}.jpg'
        print(image_url, image_path)
        download(image_url, image_path)

        # Mask files live in a folder chosen by the first character of the mask file name.
        mask_source = f'{metadata_dir}/test-masks-{mask_path[0]}/{mask_path}'
        # Context managers close both image files once the resized mask is written.
        with Image.open(image_path) as pil_image, Image.open(mask_source) as pil_mask:
            pil_mask.resize(pil_image.size).save(f'{out_dir}/{image_id}.mask.png')

852
850
860
http://s3.amazonaws.com/open-images-dataset/test/fe0ae5dc7e00a8ef.jpg dataset_100/test/airplane/fe0ae5dc7e00a8ef.jpg
http://s3.amazonaws.com/open-images-dataset/test/0ee3373f0e9e0dce.jpg dataset_100/test/airplane/0ee3373f0e9e0dce.jpg
http://s3.amazonaws.com/open-images-dataset/test/ab3f3a7ed32ff541.jpg dataset_100/test/airplane/ab3f3a7ed32ff541.jpg
http://s3.amazonaws.com/open-images-dataset/test/f56252f29c3c7d01.jpg dataset_100/test/airplane/f56252f29c3c7d01.jpg
http://s3.amazonaws.com/open-images-dataset/test/de9f30cf60b071e9.jpg dataset_100/test/airplane/de9f30cf60b071e9.jpg
http://s3.amazonaws.com/open-images-dataset/test/e18e82b74db0b7dd.jpg dataset_100/test/airplane/e18e82b74db0b7dd.jpg
http://s3.amazonaws.com/open-images-dataset/test/78fc8c7e31b963ff.jpg dataset_100/test/airplane/78fc8c7e31b963ff.jpg
http://s3.amazonaws.com/open-images-dataset/test/573ed7b5e31efa7c.jpg dataset_100/test/airplane/573ed7b5e31efa7c.jpg
http://s3.amazonaws.com/open-images-dataset/test/d8e

In [5]:
# Train Images
#
# Build one list of candidate image ids per label plus a final "other" class,
# sample up to `train_count` distinct images from each and download them into
# dataset_{train_count}/train/<class>/.
validation = pd.read_csv("/Users/racoon/Desktop/open-images-v5-metadata/validation-annotations-bbox.csv")
subsets = [validation[(validation['LabelName'] == labels_id) & (validation["Confidence"] == 1) & (validation["IsDepiction"] == 0) & (validation["IsGroupOf"] == 0) & (validation["IsInside"] == 0) & (validation["IsTruncated"] == 0)] for labels_id in labels_ids]
other_rows = validation[(~validation['LabelName'].isin(labels_ids)) & (validation["Confidence"] == 1)]

candidates = [[] for _ in range(len(labels)+1)]
u = set()  # every image id selected as a candidate for one of the labels

for i, rows in enumerate(subsets):
    area = (rows["XMax"] - rows["XMin"]) * (rows["YMax"] - rows["YMin"])
    # unique() keeps first-seen order and removes repeated ids, so the same
    # image cannot be sampled (and downloaded) twice for one class.
    label_image_ids = rows.loc[area >= min_area, "ImageID"].unique().tolist()
    u.update(label_image_ids)
    candidates[i] = label_image_ids

# An image belongs in "other" only if it carries no box of any target label at
# all - including small, truncated or grouped ones that never became candidates.
target_image_ids = set(validation.loc[validation['LabelName'].isin(labels_ids), "ImageID"])
other_pool = [image_id for image_id in other_rows["ImageID"].unique() if image_id not in target_image_ids]

# Sampling without replacement from the de-duplicated pool gives distinct
# images and always terminates, even when the pool is smaller than train_count.
candidates[-1] = random.sample(other_pool, min(train_count, len(other_pool)))

members = [random.sample(c, min(train_count, len(c))) for c in candidates]

for i, m in enumerate(members):
    name = labels[i] if i < len(subsets) else 'other'
    out_dir = f'dataset_{train_count}/train/{name}'
    os.makedirs(out_dir, exist_ok=True)
    for image_id in m:
        image_url = f'http://s3.amazonaws.com/open-images-dataset/validation/{image_id}.jpg'
        image_path = f'{out_dir}/{image_id}.jpg'
        print(image_url, image_path)
        download(image_url, image_path)

http://s3.amazonaws.com/open-images-dataset/validation/36d2f7a33a9562e1.jpg dataset_100/train/airplane/36d2f7a33a9562e1.jpg
http://s3.amazonaws.com/open-images-dataset/validation/f59901b3f57aeda5.jpg dataset_100/train/airplane/f59901b3f57aeda5.jpg
http://s3.amazonaws.com/open-images-dataset/validation/2bea809efce9d52e.jpg dataset_100/train/airplane/2bea809efce9d52e.jpg
http://s3.amazonaws.com/open-images-dataset/validation/70317bda96f2b16a.jpg dataset_100/train/airplane/70317bda96f2b16a.jpg
http://s3.amazonaws.com/open-images-dataset/validation/bedc6631a0e48ad7.jpg dataset_100/train/airplane/bedc6631a0e48ad7.jpg
http://s3.amazonaws.com/open-images-dataset/validation/2f1d74f018ee3047.jpg dataset_100/train/airplane/2f1d74f018ee3047.jpg
http://s3.amazonaws.com/open-images-dataset/validation/69130d22dfb443e4.jpg dataset_100/train/airplane/69130d22dfb443e4.jpg
http://s3.amazonaws.com/open-images-dataset/validation/751a667898ca6190.jpg dataset_100/train/airplane/751a667898ca6190.jpg
http://s