In [11]:
from pathlib import Path

from gulpio2 import GulpDirectory
import numpy as np
import PIL.Image
from gulpio2.utils import img_to_jpeg_bytes

First we'll generate some random images and save them to disk to emulate having a real dataset.

In [12]:
def make_fake_dataset(n_images, images_per_class=10, image_shape=(224, 224, 3), seed=0):
    """Generate random uint8 images with evenly distributed integer class labels.

    Args:
        n_images: Total number of images; must be a multiple of ``images_per_class``.
        images_per_class: Number of consecutive images sharing one label.
        image_shape: Shape of a single image, ``(height, width, channels)``.
        seed: Seed for the random generator so re-runs produce the same data.

    Returns:
        ``(images, labels)``: ``images`` has shape ``(n_images, *image_shape)`` and
        dtype ``uint8``; ``labels`` has shape ``(n_images,)`` with values
        ``0 .. n_images // images_per_class - 1``.

    Raises:
        ValueError: If ``n_images`` is not a multiple of ``images_per_class``.
    """
    if n_images % images_per_class != 0:
        raise ValueError(
            f"n_images ({n_images}) must be a multiple of images_per_class ({images_per_class})"
        )
    rng = np.random.default_rng(seed)
    # The upper bound is exclusive, so 256 is needed to cover the full uint8 range.
    images = rng.integers(0, 256, size=(n_images, *image_shape), dtype=np.uint8)
    # Repeat each class id `images_per_class` times (not `n_classes` times) so that
    # there is exactly one label per image for any valid n_images.
    labels = np.repeat(np.arange(n_images // images_per_class), images_per_class)
    return images, labels


n_images = 100
images, labels = make_fake_dataset(n_images)

In [13]:
# Write every image as a JPEG to fake-dataset/<label>/<index>.jpg, giving one
# sub-directory per class label.
for i, (img, label) in enumerate(zip(images, labels)):
    img_path = Path(f"fake-dataset/{label}/{i}.jpg")
    # The class directory may already exist from an earlier image or an earlier run.
    img_path.parent.mkdir(parents=True, exist_ok=True)
    img_path.write_bytes(img_to_jpeg_bytes(img))

In [14]:
# Show the directory layout written by the loop above (needs the `tree` command-line tool).
!tree fake-dataset

fake-dataset
├── 0
│   ├── 0.jpg
│   ├── 1.jpg
│   ├── 2.jpg
│   ├── 3.jpg
│   ├── 4.jpg
│   ├── 5.jpg
│   ├── 6.jpg
│   ├── 7.jpg
│   ├── 8.jpg
│   └── 9.jpg
├── 1
│   ├── 10.jpg
│   ├── 11.jpg
│   ├── 12.jpg
│   ├── 13.jpg
│   ├── 14.jpg
│   ├── 15.jpg
│   ├── 16.jpg
│   ├── 17.jpg
│   ├── 18.jpg
│   └── 19.jpg
├── 2
│   ├── 20.jpg
│   ├── 21.jpg
│   ├── 22.jpg
│   ├── 23.jpg
│   ├── 24.jpg
│   ├── 25.jpg
│   ├── 26.jpg
│   ├── 27.jpg
│   ├── 28.jpg
│   └── 29.jpg
├── 3
│   ├── 30.jpg
│   ├── 31.jpg
│   ├── 32.jpg
│   ├── 33.jpg
│   ├── 34.jpg
│   ├── 35.jpg
│   ├── 36.jpg
… (listing truncated)

Now we'll "gulp" the dataset, i.e. pack the image folder into gulp chunks using the `gulp2_image_folder` command-line tool.

In [15]:
%%bash
# Start from a clean output directory: delete any gulp output left by a previous run.
rm -rf fake-dataset-gulp
# Ingest the images under fake-dataset/ into the gulp directory fake-dataset-gulp/.
# NOTE(review): --image_size 120 presumably resizes images during ingestion (the sample
# loaded later in this notebook has shape (120, 120, 3)) — confirm against the gulpio2 docs.
gulp2_image_folder \
    --images_per_chunk 100 \
    --num_workers 1 \
    --image_size 120 \
    --shuffle \
    fake-dataset/ \
    fake-dataset-gulp/

{'--help': False,
 '--image_size': '120',
 '--images_per_chunk': '100',
 '--num_workers': '1',
 '--shuffle': True,
 '--version': False,
 '<images_directory>': 'fake-dataset/',
 '<output_directory>': 'fake-dataset-gulp/'}


Chunks finished: 100%|██████████| 1/1 [00:00<00:00,  3.56chunk/s]


Now that we have ingested our images into the gulp directory `fake-dataset-gulp`, let's see what's inside.

In [16]:
# List the files produced by the ingestion step (needs the `tree` command-line tool).
!tree fake-dataset-gulp

fake-dataset-gulp
├── data_0.gulp
├── label2idx.json
└── meta_0.gmeta

0 directories, 3 files


Now we can load the data and write a torch-style dataset class on top of it.

In [17]:
# Open the gulp directory that the ingestion step wrote to disk.
gulp_dir = GulpDirectory('fake-dataset-gulp')

In [18]:
class GulpImageDataset:
    """Map-style dataset over the images stored in a gulp directory.

    Items are ``(image, meta)`` pairs: ``image`` is the first frame stored for the
    example, passed through ``transform``; ``meta`` is the metadata object that the
    gulp directory returns alongside the frames.

    Args:
        gulp_dir: Object whose ``merged_meta_dict`` keys are the example ids and
            whose ``gulp_dir[example_id]`` lookup returns ``(frames, meta)``.
        transform: Optional callable applied to each image before it is returned.
            When omitted, images are returned unchanged.
    """

    def __init__(self, gulp_dir: "GulpDirectory", transform=None):
        self.gulp_dir = gulp_dir
        self.transform = transform if transform is not None else lambda x: x
        # Snapshot the ids once so integer positions stay stable and __len__
        # always agrees with what __getitem__ can index.
        self.example_ids = list(gulp_dir.merged_meta_dict.keys())

    def __getitem__(self, idx):
        """Return ``(transformed_image, meta)`` for one example.

        Args:
            idx: Either an integer position into ``example_ids`` (Python ``int`` or
                any NumPy integer scalar), or an example id to look up directly.
        """
        # NumPy integer scalars (e.g. from np.random.permutation) are not `int`
        # instances, so they are checked for explicitly; otherwise they would be
        # mistaken for example ids.
        if isinstance(idx, (int, np.integer)):
            example_id = self.example_ids[idx]
        else:
            example_id = idx
        imgs, meta = self.gulp_dir[example_id]
        # Only the first frame is used as the image for the example.
        return self.transform(imgs[0]), meta

    def __len__(self):
        """Return the number of examples in the dataset."""
        return len(self.example_ids)

In [19]:
# Wrap the gulp directory in the dataset class; no transform is passed, so images
# come back unchanged. Printing the length shows how many examples were ingested.
dataset = GulpImageDataset(gulp_dir)
print(len(dataset))

100


In [20]:
# Fetch the first example by integer position and display its image shape and metadata.
img, meta = dataset[0]
img.shape, meta

((120, 120, 3),
 {'id': '5-59.jpg', 'label': '5', 'path': 'fake-dataset/5', 'idx': 5})