In [37]:
import pandas as pd
import random
import urllib3
import os
import shutil
from PIL import Image

In [38]:
def download(url, path):
    """Stream the resource at ``url`` into the local file ``path``.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET request.
    path : str
        Destination file; opened in binary write mode, so an existing file
        is overwritten.

    Raises
    ------
    urllib3.exceptions.HTTPError
        If the server answers with a status other than 200. Nothing is
        written to ``path`` in that case, so an error page never ends up
        on disk under an image file name.
    """
    http = urllib3.PoolManager()
    # preload_content=False keeps the body un-read so it can be streamed
    # to disk in chunks instead of being held in memory all at once.
    r = http.request('GET', url, preload_content=False)

    try:
        if r.status != 200:
            raise urllib3.exceptions.HTTPError(
                f'GET {url} returned HTTP status {r.status}'
            )

        with open(path, 'wb') as out:
            for data in r.stream(48000):
                out.write(data)
    finally:
        # Always hand the connection back, even if the write above failed.
        r.release_conn()

In [43]:
# Class names; used as the per-class directory names under the dataset folder.
labels = ['airplane', 'car', 'horse']

# Values matched against the 'LabelName' column of the annotation CSVs.
# Position i here corresponds to labels[i].
labels_ids = ['/m/0cmf2', '/m/0k4j', '/m/03k3r']

# Number of images sampled per class for the test and the train split.
test_count, train_count = 10, 100

# Smallest accepted bounding-box area, computed as
# (XMax - XMin) * (YMax - YMin) from the annotation columns.
# NOTE(review): presumably a fraction of the image area - confirm that the
# CSV coordinates are normalised to [0, 1].
min_area = 0.2

In [45]:
# Test Images
# Test images include a segmentation image
#
# For every label, collect the images that have a segmentation mask whose box
# covers at least `min_area` and that carry exactly one "clean" bounding box,
# then download a random sample of them together with the resized mask.

metadata_dir = "/Users/racoon/Desktop/open-images-v5-metadata"

segmentation = pd.read_csv(f"{metadata_dir}/test-annotations-object-segmentation.csv")
bbox = pd.read_csv(f"{metadata_dir}/test-annotations-bbox.csv")

test_subsets = [segmentation[(segmentation['LabelName']==labels_id)] for labels_id in labels_ids]

# Work out once which images have exactly one box with Confidence == 1 and
# all of IsDepiction / IsGroupOf / IsInside / IsTruncated equal to 0.
# Doing this inside the row loop would rescan the whole bbox frame per row.
clean_bbox = bbox[
    (bbox["Confidence"] == 1)
    & (bbox["IsDepiction"] == 0)
    & (bbox["IsGroupOf"] == 0)
    & (bbox["IsInside"] == 0)
    & (bbox["IsTruncated"] == 0)
]
clean_box_counts = clean_bbox["ImageID"].value_counts()
single_box_images = set(clean_box_counts[clean_box_counts == 1].index)

test_groups = [[] for _ in range(len(labels))]

for i, rows in enumerate(test_subsets):
    print(len(rows.index))
    # The mask is saved as '{image_id}.mask.png', so a second mask for the
    # same image would overwrite the first; keep one entry per image.
    seen_images = set()
    for index, row in rows.iterrows():
        image_id = row["ImageID"]
        area = (row["BoxXMax"]-row["BoxXMin"]) * (row["BoxYMax"]-row["BoxYMin"])
        if area >= min_area and image_id in single_box_images and image_id not in seen_images:
            seen_images.add(image_id)
            test_groups[i].append((image_id, row["MaskPath"]))

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size at the number of candidates actually found.
test_groups = [random.sample(g, min(test_count, len(g))) for g in test_groups]

for i, m in enumerate(test_groups):
    name = labels[i]
    target_dir = f'dataset_{train_count}/test/{name}'
    os.makedirs(target_dir, exist_ok=True)
    for (image_id, mask_path) in m:
        image_url = f'http://s3.amazonaws.com/open-images-dataset/test/{image_id}.jpg'
        image_file = f'{target_dir}/{image_id}.jpg'
        print(image_url, image_file)
        download(image_url, image_file)

        # Mask files are looked up in a directory named after the first
        # character of the mask file name (test-masks-<char>/).
        mask_file = f'{metadata_dir}/test-masks-{mask_path[0]}/{mask_path}'

        # Context managers close both image files once the mask is written.
        with Image.open(image_file) as pil_image, Image.open(mask_file) as pil_mask:
            # Bring the mask to the pixel size of the downloaded image.
            resized_mask = pil_mask.resize(pil_image.size)
            resized_mask.save(f'{target_dir}/{image_id}.mask.png')

852
850
860
http://s3.amazonaws.com/open-images-dataset/test/0ee3373f0e9e0dce.jpg dataset_100/test/airplane/0ee3373f0e9e0dce.jpg
http://s3.amazonaws.com/open-images-dataset/test/e715de66587cedfc.jpg dataset_100/test/airplane/e715de66587cedfc.jpg
http://s3.amazonaws.com/open-images-dataset/test/d6798d59123c83f3.jpg dataset_100/test/airplane/d6798d59123c83f3.jpg
http://s3.amazonaws.com/open-images-dataset/test/4fbc98f9ca7e8c2b.jpg dataset_100/test/airplane/4fbc98f9ca7e8c2b.jpg
http://s3.amazonaws.com/open-images-dataset/test/d13c901cb397c0d5.jpg dataset_100/test/airplane/d13c901cb397c0d5.jpg
http://s3.amazonaws.com/open-images-dataset/test/e419cbc6e9d6f6ad.jpg dataset_100/test/airplane/e419cbc6e9d6f6ad.jpg
http://s3.amazonaws.com/open-images-dataset/test/5ed121557f6fe96b.jpg dataset_100/test/airplane/5ed121557f6fe96b.jpg
http://s3.amazonaws.com/open-images-dataset/test/dc2b372bb1051e14.jpg dataset_100/test/airplane/dc2b372bb1051e14.jpg
http://s3.amazonaws.com/open-images-dataset/test/147

In [41]:
# Train Images
#
# For every label, collect the validation-set images that have a clean,
# sufficiently large box of that label, plus an extra 'other' class made of
# images that were not selected for any label, and download a random sample
# of each class.
validation = pd.read_csv("/Users/racoon/Desktop/open-images-v5-metadata/validation-annotations-bbox.csv")

# Every condition must come from `validation` itself: a mask built from a
# different DataFrame is aligned by row index and selects unrelated rows.
subsets = [
    validation[
        (validation['LabelName'] == labels_id)
        & (validation["Confidence"] == 1)
        & (validation["IsDepiction"] == 0)
        & (validation["IsGroupOf"] == 0)
        & (validation["IsInside"] == 0)
        & (validation["IsTruncated"] == 0)
    ]
    for labels_id in labels_ids
]
other_rows = validation[(~validation['LabelName'].isin(labels_ids)) & (validation["Confidence"] == 1)]

# One candidate list per label, plus a final one for the 'other' class.
candidates = [[] for _ in range(len(labels)+1)]
# Every image id selected for any label; these are excluded from 'other'.
u = set()

for i, rows in enumerate(subsets):
    # An image with several qualifying boxes of the same label should be
    # listed once, otherwise it can be sampled (and downloaded) twice.
    seen_images = set()
    for index, row in rows.iterrows():
        image_id = row["ImageID"]
        area = (row["XMax"]-row["XMin"]) * (row["YMax"]-row["YMin"])
        if area >= min_area and image_id not in seen_images:
            seen_images.add(image_id)
            u.add(image_id)
            candidates[i].append(image_id)

# Draw the 'other' images without replacement from the distinct image ids
# that were not selected for any label. This replaces a loop that sampled
# rows one at a time until `n` ids were collected: `n` is not defined in the
# notebook (train_count is assumed to be the intended target), the loop could
# pick the same image repeatedly, and it never terminated when fewer than the
# requested number of eligible images existed.
other_ids = [image_id for image_id in other_rows["ImageID"].unique() if image_id not in u]
candidates[-1] = random.sample(other_ids, min(train_count, len(other_ids)))

# Cap the sample size so a class with few candidates does not raise ValueError.
members = [random.sample(c, min(train_count, len(c))) for c in candidates]

for i, m in enumerate(members):
    name = labels[i] if i < len(subsets) else 'other'
    target_dir = f'dataset_{train_count}/train/{name}'
    os.makedirs(target_dir, exist_ok=True)
    for image_id in m:
        image_url = f'http://s3.amazonaws.com/open-images-dataset/validation/{image_id}.jpg'
        image_file = f'{target_dir}/{image_id}.jpg'
        print(image_url, image_file)
        download(image_url, image_file)

http://s3.amazonaws.com/open-images-dataset/validation/9d408f91e7f2842e.jpg dataset_100/train/airplane/9d408f91e7f2842e.jpg
http://s3.amazonaws.com/open-images-dataset/validation/425eaf0825070fd0.jpg dataset_100/train/airplane/425eaf0825070fd0.jpg
http://s3.amazonaws.com/open-images-dataset/validation/7496ff4511675481.jpg dataset_100/train/airplane/7496ff4511675481.jpg
http://s3.amazonaws.com/open-images-dataset/validation/2425a29e0ebe8425.jpg dataset_100/train/airplane/2425a29e0ebe8425.jpg
http://s3.amazonaws.com/open-images-dataset/validation/84a6467749c66076.jpg dataset_100/train/airplane/84a6467749c66076.jpg
http://s3.amazonaws.com/open-images-dataset/validation/2784564e72e0a9ea.jpg dataset_100/train/airplane/2784564e72e0a9ea.jpg
http://s3.amazonaws.com/open-images-dataset/validation/73f1c2451f29f739.jpg dataset_100/train/airplane/73f1c2451f29f739.jpg
http://s3.amazonaws.com/open-images-dataset/validation/28540fe4ba361e1c.jpg dataset_100/train/airplane/28540fe4ba361e1c.jpg
http://s