In [None]:
import pandas as pd
import random
import urllib3
import os
import shutil
from PIL import Image

In [None]:
def download(url, path):
    http = urllib3.PoolManager()
    r = http.request('GET', url, preload_content=False)

    with open(path, 'wb') as out:
        while True:
            data = r.read(48000)
            if not data:
                break
            out.write(data)

    r.release_conn()

In [31]:
labels = [
    'airplane',
    'car', 
    'truck',
    'motorcycle',
    'horse',
    'cat',
    'dog',
    'bird',
    'cake',
    'coffee_cup',
    'bottle',
    'boy',
    'girl',
    'guitar',
    'bus',
    
]
labels_ids = [
    '/m/0cmf2', 
    '/m/0k4j', 
    '/m/07r04',
    '/m/04_sv',
    '/m/03k3r',
    '/m/01yrx',
    '/m/0bt9lr',
    '/m/015p6',
    '/m/0fszt',
    '/m/02p5f1q',
    '/m/04dr76w',
    '/m/01bl7v',
    '/m/05r655',
    '/m/0342h',
    '/m/01bjv'
]
test_count = 100
train_count = 1000
min_area = 0.20

In [None]:
# Test Images
# Test images include a segmentation image

segmentation = pd.read_csv("/Users/racoon/Desktop/open-images-v5-metadata/test-annotations-object-segmentation.csv")
bbox = pd.read_csv("/Users/racoon/Desktop/open-images-v5-metadata/test-annotations-bbox.csv")

test_subsets = [segmentation[(segmentation['LabelName']==labels_id)] for labels_id in labels_ids]
test_groups = [[] for _ in range(len(labels))]

print('len(test_subsets), len(test_groups)', len(test_subsets), len(test_groups))
for i in range(len(test_subsets)):
    rows = test_subsets[i]
    print('len(rows.index)', len(rows.index))
    for index, row in rows.iterrows():
        area = (row["BoxXMax"]-row["BoxXMin"]) * (row["BoxYMax"]-row["BoxYMin"])
        if area >= min_area:
#             box = bbox[(bbox["ImageID"] == row["ImageID"]) & (bbox["Confidence"] == 1) & (bbox["IsDepiction"] == 0) & (bbox["IsGroupOf"] == 0) & (bbox["IsInside"] == 0) & (bbox["IsTruncated"] == 0)]
#             if box.shape[0] == 1:
            test_groups[i].append((row["ImageID"], row["MaskPath"]))
    
    print('len(test_groups[i])', len(test_groups[i]))

test_groups = [random.sample(g, test_count) for g in test_groups]

for i, m in enumerate(test_groups):
    name = labels[i]
    os.makedirs(f'dataset_{train_count}/test/{name}', exist_ok=True)
    for (image_id, mask_path) in m:
        print(f'http://s3.amazonaws.com/open-images-dataset/test/{image_id}.jpg', f'dataset_{train_count}/test/{name}/{image_id}.jpg')
        download(f'http://s3.amazonaws.com/open-images-dataset/test/{image_id}.jpg', f'dataset_{train_count}/test/{name}/{image_id}.jpg')
        
        pil_image = Image.open(f'dataset_{train_count}/test/{name}/{image_id}.jpg')
        pil_mask = Image.open(f'/Users/racoon/Desktop/open-images-v5-metadata/test-masks-{mask_path[0]}/{mask_path}')
        pil_mask = pil_mask.resize(pil_image.size)
        pil_mask.save(f'dataset_{train_count}/test/{name}/{image_id}.mask.png')

In [40]:
# Train Images
validation = pd.read_csv("/Users/racoon/Desktop/open-images-v5-metadata/train-annotations-bbox.csv")
subsets = [validation[(validation['LabelName'] == labels_id) & (validation["Confidence"] == 1) & (validation["IsDepiction"] == 0) & (validation["IsGroupOf"] == 0) & (validation["IsInside"] == 0)] for labels_id in labels_ids]

print('len(subsets)', len(subsets))

label_image_ids = validation[validation['LabelName'].isin(labels_ids)]["ImageID"]
other_rows = validation[~validation['ImageID'].isin(label_image_ids)]

print(validation.shape, other_rows.shape, label_image_ids.shape)

candidates = [[] for _ in range(len(labels)+1)]
u = set()

for i in range(len(subsets)):
    rows = subsets[i]
    for index, row in rows.iterrows():
        area = (row["XMax"]-row["XMin"]) * (row["YMax"]-row["YMin"])
        if area >= min_area:
            u.add(row["ImageID"])
            candidates[i].append(row["ImageID"])
    print('len(candidates[i])', len(candidates[i]))

print("start other")

while len(candidates[-1]) < train_count: 
    if len(candidates[-1]) % 100 == 0:
        print('len(candidates[-1])', len(candidates[-1]))
        
    row = other_rows.iloc[random.randint(0,other_rows.shape[0]-1)]

    if row["ImageID"] not in u:
        candidates[-1].append(row["ImageID"])
        
print("done other")

members = [random.sample(c, train_count) for c in candidates]

for i, m in enumerate(members):
    name = labels[i] if i < len(subsets) else 'other'
    os.makedirs(f'dataset_{train_count}/train/{name}', exist_ok=True)
    for id in m:
        print(f'http://s3.amazonaws.com/open-images-dataset/train/{id}.jpg', f'dataset_{train_count}/train/{name}/{id}.jpg')
        download(f'http://s3.amazonaws.com/open-images-dataset/train/{id}.jpg', f'dataset_{train_count}/train/{name}/{id}.jpg') 

len(subsets) 15
(14610229, 13) (11097917, 13) (773856,)
len(candidates[i]) 7827
len(candidates[i]) 40164
len(candidates[i]) 5110
len(candidates[i]) 4349
len(candidates[i]) 3639
len(candidates[i]) 9522
len(candidates[i]) 13264
len(candidates[i]) 6994
len(candidates[i]) 2137
len(candidates[i]) 1418
len(candidates[i]) 4798
len(candidates[i]) 27286
len(candidates[i]) 57977
len(candidates[i]) 7148
len(candidates[i]) 5362
start other
len(candidates[-1]) 0
len(candidates[-1]) 100
len(candidates[-1]) 200
len(candidates[-1]) 300
len(candidates[-1]) 400
len(candidates[-1]) 500
len(candidates[-1]) 600
len(candidates[-1]) 700
len(candidates[-1]) 800
len(candidates[-1]) 900
done other
http://s3.amazonaws.com/open-images-dataset/train/3c30c479a2994450.jpg dataset_1000/train/airplane/3c30c479a2994450.jpg
http://s3.amazonaws.com/open-images-dataset/train/7a2a303769b63a4d.jpg dataset_1000/train/airplane/7a2a303769b63a4d.jpg
http://s3.amazonaws.com/open-images-dataset/train/0c13c198c38c9dde.jpg dataset_

KeyboardInterrupt: 

In [None]:
from PIL import Image
import numpy as np
pil_mask = Image.open("dataset_100/test/car/455c29cd8db5b225.mask.png").convert('1')
mask = np.array(pil_mask)
print(np.sum(mask))

In [None]:
# segmentation = pd.read_csv("/Users/racoon/Desktop/open-images-v5-metadata/test-annotations-object-segmentation.csv")
# item_counts = segmentation["LabelName"].value_counts()
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(item_counts)