Convert ImageNet Directory to WebDataset Tar Files
==================================================

This converts the original ImageNet dataset to sharded tar files. The conversion makes it easy to use ImageNet data with the WebDataset input pipelines. It also associates image data directly with ground truth, making it generally easier to process and transform this data.

Most of the code in this conversion is really concerned with loading metadata from the Matlab metadata files; the actual writing of the sharded tar files is simple.

If you're happy manipulating the Matlab files directly, you can also simply create a tar file of the ImageNet .JPEG files (with a command like `find train -name '*.JPEG' -print | tar -cf - -T - > imagenet-images.tar`).

In [2]:
%cd /mdata/imagenet-raw

/mdata/imagenet-raw


In [4]:
import os, sys, glob, os.path, sqlite3
import random as pyr
import re
import PIL.Image
import numpy as np
import io
import xmltodict
import warnings
import simplejson
import itertools as itt
import random

def readfile(path, mode="rb"):
    """Return the full contents of the file at `path`.

    `mode` is passed straight to `open`, so the default "rb" yields bytes
    and a text mode such as "r" yields str.
    """
    with open(path, mode) as handle:
        contents = handle.read()
    return contents
def writefile(path, data):
    """Write `data` to the file at `path`, replacing any existing content.

    A str is written in text mode; any other value is written in binary mode.
    """
    if isinstance(data, str):
        open_mode = "w"
    else:
        open_mode = "wb"
    with open(path, open_mode) as handle:
        handle.write(data)
def pilreads(data):
    """Open the encoded image held in the bytes `data` and return it as a NumPy array."""
    image = PIL.Image.open(io.BytesIO(data))
    return np.array(image)

In [5]:
# List every training JPEG (two directory levels below the working directory,
# sorted so the order is reproducible), then report how many images and how
# many XML files were found in the same directories.
jpegs = sorted(glob.glob("train/*/*.JPEG"))
print(len(jpegs), len(glob.glob("train/*/*.xml")))

1281167 544546


In [7]:
import scipy.io
# Load the devkit's Matlab metadata file and keep only its "synsets" entry.
meta = scipy.io.loadmat("ILSVRC2012_devkit_t12/data/meta.mat")
meta = meta["synsets"]
def scalar(x):
    """Unwrap a nested single-element container down to its innermost value.

    Repeatedly replaces `x` with `x[0]`, at most 10 times, stopping early as
    soon as `x` is a str (so strings are not reduced to their first
    character) or `x` can no longer be indexed.

    Args:
        x: a value, possibly wrapped in nested sequences/arrays.

    Returns:
        The unwrapped value; `x` itself if it is a str, is empty, or is not
        indexable.
    """
    for _ in range(10):
        if isinstance(x, str):
            break
        try:
            x = x[0]
        except (IndexError, KeyError, TypeError):
            # Not indexable (or empty): we have reached the innermost value.
            # Only indexing failures are treated this way, so interrupts and
            # other unrelated errors still propagate.
            break
    return x
# Build two lookup tables keyed by WordNet ID. Each row of `meta` wraps one
# record whose field 0 is the integer class id, field 1 the WordNet ID and
# field 2 the class name.
wnid2id = {
    scalar(row[0][1]): int(scalar(row[0][0]))
    for row in meta
}
wnid2cname = {
    scalar(row[0][1]): str(scalar(row[0][2]))
    for row in meta
}
# Show the first few entries of each table as a sanity check.
print(list(wnid2id.items())[:5])
print(list(wnid2cname.items())[:5])

[('n02119789', 1), ('n02100735', 2), ('n02110185', 3), ('n02096294', 4), ('n02102040', 5)]
[('n02119789', 'kit fox, Vulpes macrotis'), ('n02100735', 'English setter'), ('n02110185', 'Siberian husky'), ('n02096294', 'Australian terrier'), ('n02102040', 'English springer, English springer spaniel')]


In [8]:
# Selects which filename layout pathinfo() expects: "train" or "val".
mode = "train"
def pathinfo(path):
    """Split an ImageNet image path into (directory name, image number).

    The expected layout is chosen by the module-level `mode`:

    * "train": `<dir>/<wnid>/<wnid>_<number>.JPEG`
    * "val":   `<dir>/<wnid>/ILSVRC2012_val_<number>.JPEG`

    Args:
        path: relative image path in one of the layouts above.

    Returns:
        A tuple `(wnid, number)`: the name of the image's directory and the
        numeric part of the file name as an int.

    Raises:
        ValueError: if `mode` is neither "train" nor "val", or if `path`
            does not match the layout for the current mode.
    """
    if mode == "val":
        pattern = r"^[a-z]*/([^/]+)/ILSVRC2012_val_(\d+)\.JPEG"
    elif mode == "train":
        # The \1 back-reference requires the file name to start with the
        # name of the directory that contains it.
        pattern = r"^[a-z]*/([^/]+)/\1_(\d+)\.JPEG"
    else:
        raise ValueError("unknown mode: {!r}".format(mode))
    match = re.search(pattern, path)
    if match is None:
        raise ValueError(
            "path does not match the {!r} layout: {!r}".format(mode, path)
        )
    return match.group(1), int(match.group(2))
# Spot-check the path parser on one of the listed image paths.
print(jpegs[3])
pathinfo(jpegs[3])

train/n01440764/n01440764_10040.JPEG


('n01440764', 10040)

In [9]:
def pathkey(path):
    """Return the sample key for an image path.

    The key is the file name with every leading directory removed and a
    trailing ".JPEG" extension stripped, e.g.
    "train/n01440764/n01440764_10040.JPEG" -> "n01440764_10040".
    """
    basename = re.sub(r'.*/', '', path)
    # The dot is escaped so only a literal ".JPEG" suffix is removed; an
    # unescaped "." would also eat any other character in front of "JPEG".
    return re.sub(r'\.JPEG$', '', basename)

# Spot-check: show the key derived from one of the listed image paths.
pathkey(jpegs[3])

'n01440764_10040'

In [10]:
def pathcls(path):
    """Return the integer class id for the image at `path`.

    The directory name extracted by `pathinfo` is looked up in `wnid2id`.
    """
    wnid, _ = pathinfo(path)
    return wnid2id[wnid]

# Spot-check: show the class id resolved for one of the listed image paths.
pathcls(jpegs[3])

449

In [11]:
def jpeginfo(path):
    """Build the metadata record for one image.

    If a file with the same name but an ".xml" extension exists next to the
    image, it is parsed with xmltodict and used as the base record;
    otherwise the record starts out empty. The class id ("cls") and class
    name ("cname") of the image's directory are then added.

    Args:
        path: image path in the layout understood by `pathinfo`.

    Returns:
        A dict-like record containing at least the keys "cls" and "cname".
    """
    # The dot is escaped so only a literal ".JPEG" suffix is swapped for
    # ".xml"; an unescaped "." would match any character before "JPEG".
    xmlpath = re.sub(r"\.JPEG$", ".xml", path)
    if os.path.exists(xmlpath):
        info = xmltodict.parse(readfile(xmlpath, "r"))
    else:
        info = {}
    wnid = pathinfo(path)[0]
    info["cls"] = wnid2id[wnid]
    info["cname"] = wnid2cname[wnid]
    return info

# Smoke-test jpeginfo on the first 100 images (fewer if the listing is
# shorter; slicing avoids an IndexError on small datasets) and pretty-print
# the first record.
infos = [jpeginfo(path) for path in jpegs[:100]]
infos = [info for info in infos if info is not None]
print(simplejson.dumps(infos[0], indent=4))

{
    "cls": 449,
    "cname": "tench, Tinca tinca"
}


In [12]:
# Remove any `writer` name left over from an earlier run of the notebook.
# Only the "name does not exist yet" case is ignored.
try:
    del writer
except NameError:
    pass
!rm -rf ../imagenet-shards
!mkdir ../imagenet-shards

In [19]:
from webdataset import writer
from importlib import reload
reload(writer)

def write_shards(dest, jpegs, maxsize=1e9):
    """Write images and their metadata to shards in shuffled order.

    For every image path a sample is written with these fields:

    * `__key__`: the key from `pathkey`
    * `jpg`: the raw file bytes
    * `json`: the `jpeginfo` record, JSON-encoded as UTF-8
    * `cls`: the integer class id as a UTF-8 decimal string

    A progress line is printed for every 1000th sample.

    Args:
        dest: output location handed to `writer.ShardWriter` (the notebook
            passes a pattern containing "%04d").
        jpegs: iterable of image paths; it is copied before shuffling, so
            the caller's sequence is left untouched.
        maxsize: forwarded to `writer.ShardWriter` as `maxsize`.
    """
    jpegs = list(jpegs)
    random.shuffle(jpegs)
    sink = writer.ShardWriter(dest, maxsize=maxsize, encoder=False)
    try:
        for i, fname in enumerate(jpegs):
            key = pathkey(fname)
            jpeg = readfile(fname)
            info = jpeginfo(fname)
            cls = pathcls(fname)
            # Guard kept from the original code for a jpeginfo that yields
            # no record.
            if info is None:
                info = dict(cls=cls)
            # Both values derive from the image's directory; a mismatch
            # means the lookup tables are inconsistent.
            assert cls == info["cls"]
            info_json = simplejson.dumps(info)
            if i % 1000 == 0:
                print(i, key, len(jpeg), info_json[:50])
            sample = dict(__key__=key,
                          jpg=jpeg,
                          json=info_json.encode("utf-8"),
                          cls=str(cls).encode("utf-8"))
            sink.write(sample)
    finally:
        # Always close the writer, even when the loop is interrupted or
        # fails, so it is not left open.
        sink.close()

In [20]:
# Shard the training images using write_shards' default maxsize (1e9).
write_shards("../imagenet-shards/imagenet_train-%04d.tgz", jpegs)

# writing ../imagenet-shards/imagenet_train-0000.tgz 0 0.0 GB 0
0 n02096437_3246 16426 {"cls": 27, "cname": "Dandie Dinmont, Dandie Dinmo
1000 n03594734_25561 152316 {"cls": 748, "cname": "jean, blue jean, denim"}
2000 n02281406_8284 102584 {"annotation": {"folder": "n02281406", "filename":
3000 n03814906_45698 135129 {"annotation": {"folder": "n03814906", "filename":
4000 n01641577_3923 128818 {"annotation": {"folder": "n01641577", "filename":
5000 n07684084_542 27894 {"annotation": {"folder": "n07684084", "filename":
6000 n04235860_2054 22025 {"cls": 943, "cname": "sleeping bag"}
7000 n03825788_16277 12638 {"cls": 915, "cname": "nipple"}
8000 n03935335_8263 23508 {"cls": 931, "cname": "piggy bank, penny bank"}
# writing ../imagenet-shards/imagenet_train-0001.tgz 8838 1.0 GB 8838
9000 n02823428_4503 160711 {"cls": 777, "cname": "beer bottle"}
10000 n02033041_7018 94882 {"cls": 436, "cname": "dowitcher"}
11000 n03527444_15413 75635 {"annotation": {"folder": "n03527444", "filename":
120

KeyboardInterrupt: 

In [147]:
# Re-point `jpegs` at the validation images (sorted for a reproducible
# order) and report how many images and XML files were found.
jpegs = sorted(glob.glob("val/*/*.JPEG"))
print(len(jpegs), len(glob.glob("val/*/*.xml")))

50000 0


In [157]:
# pathinfo() reads the global `mode`, so it must be switched to "val" before
# any validation path is parsed.
mode = "val"
# NOTE(review): maxsize is raised to 1e11 here, presumably so the whole
# validation set ends up in a single shard — confirm.
write_shards("../imagenet-shards/imagenet_val-%04d.tgz", jpegs, maxsize=1e11)

# writing ../imagenet-shards/imagenet_val-0000.tgz 0 0.0 GB 0
0 ILSVRC2012_val_00004977 143281 {"cls": 154, "cname": "Gordon setter"}
1000 ILSVRC2012_val_00025298 140138 {"cls": 526, "cname": "sundial"}
2000 ILSVRC2012_val_00009785 156178 {"cls": 326, "cname": "pomegranate"}
3000 ILSVRC2012_val_00024349 180064 {"cls": 65, "cname": "hartebeest"}
4000 ILSVRC2012_val_00014632 44827 {"cls": 551, "cname": "hand-held computer, hand-he
5000 ILSVRC2012_val_00024408 159608 {"cls": 883, "cname": "perfume, essence"}
6000 ILSVRC2012_val_00017938 138679 {"cls": 143, "cname": "pug, pug-dog"}
7000 ILSVRC2012_val_00005886 129133 {"cls": 197, "cname": "Pembroke, Pembroke Welsh co
8000 ILSVRC2012_val_00022434 210812 {"cls": 89, "cname": "toy terrier"}
9000 ILSVRC2012_val_00020606 40498 {"cls": 935, "cname": "Windsor tie"}
10000 ILSVRC2012_val_00027716 142984 {"cls": 531, "cname": "syringe"}
11000 ILSVRC2012_val_00028913 137064 {"cls": 133, "cname": "Leonberg"}
12000 ILSVRC2012_val_00038677 150307 {"cls"