In [1]:
import os
import time
import numpy as np

import albumentations as A
import albumentations.pytorch
import kornia as K
import torch.nn as nn
import torch.utils.data
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader

from set_loader import CustomDataset, CustomAlbDataset, CustomKorDataset

import cv2

cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
p = 1.0
albumentations_transform = A.Compose([
    A.RandomCrop(256, 256, p=p),
    A.HorizontalFlip(p=p),
    A.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), p=p),
    A.dropout.Cutout(num_holes=8, max_h_size=8, max_w_size=8, fill_value=0, p=p),
    A.pytorch.ToTensorV2(),
])



In [3]:
# jpeg4py + albumentation 
root_path = '/home/aiteam/tykim/dataset/afhq/train'
custom_ds = CustomAlbDataset(root_path, loader_type='jpeg4py', transform=albumentations_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)

In [4]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jpeg4py_alb_time = time.time() - start_time
simple_load_times.append(jpeg4py_alb_time)
print(str(simple_load_times) + ' sec')

[11.15326189994812] sec
[15.743764162063599] sec
[32.75279474258423] sec
[19.616486072540283] sec
[18.629342555999756] sec
[11.323187112808228] sec
[15.001330375671387] sec
[15.834532976150513] sec
[12.909913539886475] sec
17 s ± 2.19 s per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [2]:
# FFCV - JPEG 100 % quality

from ffcv.fields.decoders import IntDecoder, SimpleRGBImageDecoder, RandomResizedCropRGBImageDecoder
from ffcv.loader import Loader, OrderOption
from ffcv.transforms import ToDevice, ToTensor, ToTorchImage, NormalizeImage, RandomHorizontalFlip, Cutout

In [5]:
beton_path = '/home/aiteam/tykim/scratch/data_loaders/afhq_io_test.beton'

# Random resized crop
decoder = RandomResizedCropRGBImageDecoder(output_size=(256, 256))# SimpleRGBImageDecoder()
mean = [0.5023*255, 0.4599*255, 0.3993*255]
std = [0.2553*255, 0.2457*255, 0.2503*255]


# Data decoding and augmentation
image_pipeline = [decoder,  RandomHorizontalFlip(flip_prob=1.0), Cutout(8, tuple(map(int, mean))),
                  ToTensor(), ToTorchImage(), 
                  ToDevice('cuda:0', non_blocking=True), NormalizeImage(np.array(mean), np.array(std), np.float32)]
                  
label_pipeline = [IntDecoder(), ToTensor(), ToDevice('cuda:0')]

# Pipeline for each data field
pipelines = {
    'image': image_pipeline,
    'label': label_pipeline
}

# Replaces PyTorch data loader (`torch.utils.data.Dataloader`)
loader = Loader(beton_path, batch_size=128, num_workers=8,
                order=OrderOption.QUASI_RANDOM, pipelines=pipelines, os_cache=True)

In [6]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for batch_idx, data in enumerate(loader):
    inputs, labels = data
    
ffcv_time = time.time() - start_time
simple_load_times.append(ffcv_time)
print(str(simple_load_times) + ' sec') 

[9.98087763786316] sec
[3.21522855758667] sec
[3.1909334659576416] sec
[3.099224805831909] sec
[3.1774439811706543] sec
[3.286438465118408] sec
[3.2160937786102295] sec
[3.2460453510284424] sec
[3.1289079189300537] sec
3.95 s ± 1.07 s per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [8]:
# DALI 
from nvidia.dali.pipeline import pipeline_def
import nvidia.dali.types as types
import nvidia.dali.fn as fn
from nvidia.dali.plugin.pytorch import DALIGenericIterator

root_path = '/home/aiteam/tykim/dataset/afhq/train'

@pipeline_def(batch_size=128, num_threads=8, device_id=0)
def get_dali_pipeline(data_dir, dali_cpu=False, crop=256):
  dali_device = 'cpu' if dali_cpu else 'gpu'
  decoder_device = 'cpu' if dali_cpu else 'mixed'
  # w, h = int(crop * 120), int(crop * 400)
  # imagebytes = w * h * 3 * 4
  
  img_files, labels = fn.readers.file(file_root=data_dir, random_shuffle=False, name="Reader")
  
  # Load and Crop
  # 이미지 사이즈 힌트
  preallocate_width_hint = 512 if decoder_device == 'mixed' else 0
  preallocate_height_hint = 512 if decoder_device == 'mixed' else 0
  
  # images = fn.decoders.image(img_files, device="mixed")
  # Decode and Random Crop
  images = fn.decoders.image_random_crop(img_files, device=decoder_device, preallocate_width_hint=preallocate_width_hint,
                                         preallocate_height_hint=preallocate_height_hint,
                                         output_type=types.RGB, random_aspect_ratio=[0.8, 1.25], random_area=[0.1, 1.0])
  
  # Resize
  images = fn.resize(images,device=dali_device,resize_x=crop, resize_y=crop,interp_type=types.INTERP_TRIANGULAR)
  # Horizontal F
  images = fn.flip(images, device=dali_device, horizontal=1)


  # Cutout
  axis_names="WH"
  nregions=8
  ndims = len(axis_names)
  args_shape=(ndims*nregions,)
  random_anchor = fn.random.uniform(range=(0., 1.), shape=args_shape)
  random_shape = fn.random.uniform(range=(20., 50), shape=args_shape)
  fn.erase(images, device=dali_device, anchor=random_anchor, shape=random_shape,
            axis_names=axis_names, normalized_anchor=True,
            normalized_shape=False)
  # Normalization
  images = fn.crop_mirror_normalize(images, device=dali_device,
                                    dtype=types.FLOAT,
                                    mean = [0.5023*255, 0.4599*255, 0.3993*255],
                                    std = [0.2553*255, 0.2457*255, 0.2503*255])
  return images, labels.gpu()

pipe = get_dali_pipeline(data_dir=root_path)
pipe.build()

dataloader = DALIGenericIterator(pipe, ['data', 'label'],reader_name='Reader')

In [9]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for i, data in enumerate(dataloader):
  x, y = data[0]['data'], data[0]['label']  

dali_time = time.time() - start_time
simple_load_times.append(dali_time)
print(str(simple_load_times) + ' sec') 

[1.1839418411254883] sec
[1.1245269775390625] sec
[1.1185104846954346] sec
[1.121791124343872] sec
[1.1238594055175781] sec
[1.1269352436065674] sec
[1.1241281032562256] sec
[1.1282238960266113] sec
[1.1090571880340576] sec
1.13 s ± 9.55 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


# CIFAR-10

In [12]:
p = 1.0
albumentations_transform = A.Compose([
    A.RandomCrop(16, 16, p=p),
    A.HorizontalFlip(p=p),
    A.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), p=p),
    A.dropout.Cutout(num_holes=8, max_h_size=8, max_w_size=8, fill_value=0, p=p),
    A.pytorch.ToTensorV2(),
])



In [13]:
root_path = '/home/aiteam/tykim/dataset/CIFAR-10-images/train'
custom_ds = CustomAlbDataset(root_path, loader_type='jpeg4py', transform=albumentations_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)

In [14]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jpeg4py_alb_time = time.time() - start_time
simple_load_times.append(jpeg4py_alb_time)
print(str(simple_load_times) + ' sec')

[4.6176698207855225] sec
[4.637690305709839] sec
[4.58872389793396] sec
[4.410037279129028] sec
[4.6040894985198975] sec
[4.494801759719849] sec
[4.464649438858032] sec
[4.90467381477356] sec
[5.105956554412842] sec
4.65 s ± 134 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [7]:
# DALI 
from nvidia.dali.pipeline import pipeline_def
import nvidia.dali.types as types
import nvidia.dali.fn as fn
from nvidia.dali.plugin.pytorch import DALIGenericIterator

root_path = '/home/aiteam/tykim/dataset/CIFAR-10-images/train'

@pipeline_def(batch_size=128, num_threads=8, device_id=0)
def get_dali_pipeline(data_dir, dali_cpu=False, crop=16):
  dali_device = 'cpu' if dali_cpu else 'gpu'
  decoder_device = 'cpu' if dali_cpu else 'mixed'
  # w, h = int(crop * 120), int(crop * 400)
  # imagebytes = w * h * 3 * 4
  
  img_files, labels = fn.readers.file(file_root=data_dir, random_shuffle=False, name="Reader")
  
  # Load and Crop
  # 이미지 사이즈 힌트
  preallocate_width_hint = 32 if decoder_device == 'mixed' else 0
  preallocate_height_hint = 32 if decoder_device == 'mixed' else 0
  
  # images = fn.decoders.image(img_files, device="mixed")
  # Decode and Random Crop
  images = fn.decoders.image_random_crop(img_files, device=decoder_device, preallocate_width_hint=preallocate_width_hint,
                                         preallocate_height_hint=preallocate_height_hint,
                                         output_type=types.RGB, random_aspect_ratio=[0.8, 1.25], random_area=[0.1, 1.0])
  
  # Resize
  images = fn.resize(images,device=dali_device,resize_x=crop, resize_y=crop,interp_type=types.INTERP_TRIANGULAR)
  # Horizontal F
  images = fn.flip(images, device=dali_device, horizontal=1)


  # Cutout
  axis_names="WH"
  nregions=8
  ndims = len(axis_names)
  args_shape=(ndims*nregions,)
  random_anchor = fn.random.uniform(range=(0., 1.), shape=args_shape)
  random_shape = fn.random.uniform(range=(20., 50), shape=args_shape)
  fn.erase(images, device=dali_device, anchor=random_anchor, shape=random_shape,
            axis_names=axis_names, normalized_anchor=True,
            normalized_shape=False)
  # Normalization
  images = fn.crop_mirror_normalize(images, device=dali_device,
                                    dtype=types.FLOAT,
                                    mean = [0.5023*255, 0.4599*255, 0.3993*255],
                                    std = [0.2553*255, 0.2457*255, 0.2503*255])
  return images, labels.gpu()

pipe = get_dali_pipeline(data_dir=root_path)
pipe.build()

dataloader = DALIGenericIterator(pipe, ['data', 'label'],reader_name='Reader')# DALI 


In [8]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for i, data in enumerate(dataloader):
  x, y = data[0]['data'], data[0]['label']  

dali_time = time.time() - start_time
simple_load_times.append(dali_time)
print(str(simple_load_times) + ' sec') 

[0.9557044506072998] sec
[0.8870923519134521] sec
[0.8955788612365723] sec
[0.9180748462677002] sec
[0.8786141872406006] sec
[0.8949985504150391] sec
[0.8926410675048828] sec
[0.8861455917358398] sec
[0.8754441738128662] sec
898 ms ± 11.5 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [9]:
# FFCV - JPEG 100 % quality

from ffcv.fields.decoders import IntDecoder, SimpleRGBImageDecoder, RandomResizedCropRGBImageDecoder
from ffcv.loader import Loader, OrderOption
from ffcv.transforms import ToDevice, ToTensor, ToTorchImage, NormalizeImage, RandomHorizontalFlip, Cutout

beton_path = '/home/aiteam/tykim/scratch/data_loaders/cifar10_io_test.beton'


In [10]:
# Random resized crop
decoder = RandomResizedCropRGBImageDecoder(output_size=(16, 16))# SimpleRGBImageDecoder()
mean = [0.5023*255, 0.4599*255, 0.3993*255]
std = [0.2553*255, 0.2457*255, 0.2503*255]


# Data decoding and augmentation
image_pipeline = [decoder,  RandomHorizontalFlip(flip_prob=1.0), Cutout(8, tuple(map(int, mean))),
                  ToTensor(), ToTorchImage(), 
                  ToDevice('cuda:0', non_blocking=True), NormalizeImage(np.array(mean), np.array(std), np.float32)]
                  
label_pipeline = [IntDecoder(), ToTensor(), ToDevice('cuda:0')]

# Pipeline for each data field
pipelines = {
    'image': image_pipeline,
    'label': label_pipeline
}

# Replaces PyTorch data loader (`torch.utils.data.Dataloader`)
loader = Loader(beton_path, batch_size=128, num_workers=8,
                order=OrderOption.QUASI_RANDOM, pipelines=pipelines, os_cache=True)

In [11]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for batch_idx, data in enumerate(loader):
    inputs, labels = data
    
ffcv_time = time.time() - start_time
simple_load_times.append(ffcv_time)
print(str(simple_load_times) + ' sec') 

[7.137673854827881] sec
[0.42787671089172363] sec
[0.3739004135131836] sec
[0.3023409843444824] sec
[0.31604552268981934] sec
[0.31195735931396484] sec
[0.419175386428833] sec
[0.34900712966918945] sec
[0.3009357452392578] sec
The slowest run took 8.53 times longer than the fastest. This could mean that an intermediate result is being cached.
1.1 s ± 1.09 s per loop (mean ± std. dev. of 3 runs, 3 loops each)
