In [1]:
import os
import time
import numpy as np

import albumentations as A
import albumentations.pytorch
import kornia as K
import torch.nn as nn
import torch.utils.data
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader

from set_loader import CustomDataset, CustomAlbDataset, CustomKorDataset

import cv2

cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
p = 1.0
albumentations_transform = A.Compose([
    A.RandomCrop(256, 256, p=p),
    A.ColorJitter(0.2, 0.2, 0.2, 0.125, p=p),
    A.GaussianBlur(3, (0.1, 3.0), p=p),
    A.Rotate((-10, 10), p=p),
    A.HorizontalFlip(p=p),
    A.VerticalFlip(p=p),
    A.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), p=p),
    A.pytorch.ToTensorV2(),
])
kornia_transform = nn.Sequential(
    K.augmentation.RandomCrop((256, 256), p=p),
    K.augmentation.ColorJitter(0.2, 0.2, 0.2, 0.125, p=p),
    K.augmentation.RandomGaussianBlur((3, 3), (0.1, 3.0), p=p),
    K.augmentation.RandomRotation([-10, 10], p=p),
    K.augmentation.RandomHorizontalFlip(p=p),
    K.augmentation.RandomVerticalFlip(p=p),
    K.augmentation.Normalize(torch.Tensor([0.5, 0.5, 0.5])*255, torch.Tensor([0.5, 0.5, 0.5])*255, p=p),
)


torchvision_transform = T.Compose([
    T.RandomCrop([256, 256]),
    T.ColorJitter(0.2, 0.2, 0.2, 0.125),  #Randomly change the brightness, contrast, saturation and hue of an image.
    T.GaussianBlur(3, (0.1, 3.0)),
    T.RandomRotation([-10, 10], T.InterpolationMode.BILINEAR),
    T.RandomHorizontalFlip(p=p),
    T.RandomVerticalFlip(p=p),
    T.ToTensor(),
    T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])


# AFHQ


In [4]:
# torchivison - PIL loader

root_path = '/home/aiteam/tykim/dataset/afhq/train'
custom_ds = CustomDataset(root_path, loader_type='pil', transform=torchvision_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)

In [11]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jpeg4py_alb_time = time.time() - start_time
simple_load_times.append(jpeg4py_alb_time)
print(str(simple_load_times) + ' sec')

[29.005223035812378] sec
[28.97568106651306] sec
[28.89587140083313] sec
[29.200836658477783] sec
[28.76229977607727] sec
[28.331727743148804] sec
[28.6800274848938] sec
[28.654899835586548] sec
[29.075637817382812] sec
28.8 s ± 83.9 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [5]:
# jpeg4py + albumentation 
root_path = '/home/aiteam/tykim/dataset/afhq/train'
custom_ds = CustomAlbDataset(root_path, loader_type='jpeg4py', transform=albumentations_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)
# custom_ds.transform(image=np.random.randn(512, 512,3).astype(np.float32))

In [8]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jpeg4py_alb_time = time.time() - start_time
simple_load_times.append(jpeg4py_alb_time)
print(str(simple_load_times) + ' sec')

torch.Size([128])
[1.623840093612671] sec
1.62 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [3]:
# JPEG4PY > PIL > torchivision
root_path = '/home/aiteam/tykim/dataset/afhq/train'
custom_ds = CustomDataset(root_path, loader_type='jp4pil', transform=torchvision_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)

In [4]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jp4pil_tv_time = time.time() - start_time
simple_load_times.append(jp4pil_tv_time)
print(str(simple_load_times) + ' sec')

[32.16102433204651] sec
[51.36891031265259] sec
[28.907237768173218] sec
[43.50913190841675] sec
[29.794763326644897] sec


: 

: 

In [3]:
# jpeg4py + kornia-cpu
root_path = '/home/aiteam/tykim/dataset/afhq/train'
custom_ds = CustomKorDataset(root_path, loader_type='jpeg4py', transform=kornia_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)
# custom_ds.transform(image=np.random.randn(512, 512,3).astype(np.float32))

In [4]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jpeg4py_kornia_time = time.time() - start_time
simple_load_times.append(jpeg4py_kornia_time)
print(str(simple_load_times) + ' sec')

[42.58431816101074] sec
[40.74731254577637] sec
[40.477243423461914] sec
[40.83574032783508] sec
[40.696980237960815] sec
[40.93867015838623] sec
[40.643062114715576] sec
[40.99645209312439] sec
[40.961018323898315] sec
41 s ± 201 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [22]:
# DALI 
from nvidia.dali.pipeline import pipeline_def
import nvidia.dali.types as types
import nvidia.dali.fn as fn
from nvidia.dali.plugin.pytorch import DALIGenericIterator

root_path = '/home/aiteam/tykim/dataset/afhq/train'

@pipeline_def(batch_size=128, num_threads=8, device_id=0)
def get_dali_pipeline(data_dir, dali_cpu=False, crop=256):
  dali_device = 'cpu' if dali_cpu else 'gpu'
  decoder_device = 'cpu' if dali_cpu else 'mixed'
  # w, h = int(crop * 120), int(crop * 400)
  # imagebytes = w * h * 3 * 4
  
  img_files, labels = fn.readers.file(file_root=data_dir, random_shuffle=False, name="Reader")
  
  # Load and Crop
  # 이미지 사이즈 힌트
  preallocate_width_hint = 512 if decoder_device == 'mixed' else 0
  preallocate_height_hint = 512 if decoder_device == 'mixed' else 0
  
  # images = fn.decoders.image(img_files, device="mixed")
  # Decode and Random Crop
  images = fn.decoders.image_random_crop(img_files, device=decoder_device, preallocate_width_hint=preallocate_width_hint,
                                         preallocate_height_hint=preallocate_height_hint,
                                         output_type=types.RGB, random_aspect_ratio=[0.8, 1.25], random_area=[0.1, 1.0])
  
  # Resize
  images = fn.resize(images,device=dali_device,resize_x=crop, resize_y=crop,interp_type=types.INTERP_TRIANGULAR)
  # Jitter 
  images = fn.color_twist(images, brightness=0.2, contrast=0.2, saturation=0.2, hue=0.125)
  # GaussianBlur
  images = fn.gaussian_blur(images, device = dali_device, # bytes_per_sample_hint = imagebytes, 
                   sigma = fn.random.uniform(range = (0.1, 3.0)), window_size = 3)
  # # Random Rotation
  images = fn.rotate(images, device=dali_device, angle=fn.random.uniform(range=(-10, 10)), keep_size=True) 
  # Horizontal F
  images = fn.flip(images, device=dali_device, horizontal=1)
  # Vertical F
  images = fn.flip(images, device=dali_device, horizontal=0)
  # Normalization
  images = fn.crop_mirror_normalize(images, device=dali_device,
                                    dtype=types.FLOAT,
                                    mean = [0.5023*255, 0.4599*255, 0.3993*255],
                                    std = [0.2553*255, 0.2457*255, 0.2503*255])
  return images, labels.gpu()

pipe = get_dali_pipeline(data_dir=root_path)
pipe.build()

dataloader = DALIGenericIterator(pipe, ['data', 'label'],reader_name='Reader')

In [25]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for i, data in enumerate(dataloader):
  x, y = data[0]['data'], data[0]['label']  

dali_time = time.time() - start_time
simple_load_times.append(dali_time)
print(str(simple_load_times) + ' sec') 

[1.2623844146728516] sec
[1.219261646270752] sec
[1.2167339324951172] sec
[1.2073204517364502] sec
[1.2053849697113037] sec
[1.2156667709350586] sec
[1.2006504535675049] sec
[1.2037441730499268] sec
[1.209874153137207] sec
1.22 s ± 12.2 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [28]:
# FFCV - JPEG 100 % quality

from ffcv.writer import DatasetWriter
from ffcv.fields import IntField, RGBImageField
from torchvision.datasets import ImageFolder

from ffcv.fields.decoders import IntDecoder, SimpleRGBImageDecoder, RandomResizedCropRGBImageDecoder
from ffcv.loader import Loader, OrderOption
from ffcv.transforms import ToDevice, ToTensor, ToTorchImage, NormalizeImage, RandomHorizontalFlip

In [47]:
beton_path = '/home/aiteam/tykim/scratch/data_loaders/afhq_io_test.beton'

# Random resized crop
decoder = RandomResizedCropRGBImageDecoder(output_size=(256, 256))# SimpleRGBImageDecoder()
mean = np.array([0.5023, 0.4599, 0.3993]) * 255
std = np.array([0.2553, 0.2457, 0.2503]) * 255
# Data decoding and augmentation
image_pipeline = [decoder,  RandomHorizontalFlip(flip_prob=1.0),
                  ToTensor(), ToTorchImage(), ToDevice('cuda:0', non_blocking=True),]
                  #NormalizeImage(mean, std)] #Cutout(),
label_pipeline = [IntDecoder(), ToTensor(), ToDevice('cuda:0')]

# Pipeline for each data field
pipelines = {
    'image': image_pipeline,
    'label': label_pipeline
}

# Replaces PyTorch data loader (`torch.utils.data.Dataloader`)
loader = Loader(beton_path, batch_size=128, num_workers=8,
                order=OrderOption.RANDOM, pipelines=pipelines, os_cache=True)


kornia_transform_ffcv = nn.Sequential(
    K.augmentation.ColorJitter(0.2, 0.2, 0.2, 0.125, p=1.0),
    K.augmentation.RandomGaussianBlur((3, 3), (0.1, 3.0), p=1.0),
    K.augmentation.RandomRotation([-10, 10], p=1.0),
    K.augmentation.RandomVerticalFlip(p=1.0),
    K.augmentation.Normalize(torch.tensor([0.5023*255, 0.4599*255, 0.3993*255]),
                             torch.tensor([0.2553*255, 0.2457*255, 0.2503*255]), p=1.0))

In [48]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for batch_idx, data in enumerate(loader):
    inputs, labels = data
    inputs = kornia_transform_ffcv(inputs.to(torch.float32))
ffcv_time = time.time() - start_time
simple_load_times.append(ffcv_time)
print(str(simple_load_times) + ' sec') 

[9.15502142906189] sec
[3.9004931449890137] sec
[3.6862740516662598] sec
[3.629631996154785] sec
[3.547180414199829] sec
[3.682373046875] sec
[3.4746298789978027] sec
[3.677720785140991] sec
[3.4508635997772217] sec
4.25 s ± 945 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


# CIFAR-10

In [3]:
p = 1.0
albumentations_transform = A.Compose([
    A.RandomCrop(16, 16, p=p),
    A.ColorJitter(0.2, 0.2, 0.2, 0.125, p=p),
    A.GaussianBlur(3, (0.1, 3.0), p=p),
    A.Rotate((-10, 10), p=p),
    A.HorizontalFlip(p=p),
    A.VerticalFlip(p=p),
    A.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), p=p),
    A.pytorch.ToTensorV2(),
])
kornia_transform = nn.Sequential(
    K.augmentation.RandomCrop((16, 16), p=p),
    K.augmentation.ColorJitter(0.2, 0.2, 0.2, 0.125, p=p),
    K.augmentation.RandomGaussianBlur((3, 3), (0.1, 3.0), p=p),
    K.augmentation.RandomRotation([-10, 10], p=p),
    K.augmentation.RandomHorizontalFlip(p=p),
    K.augmentation.RandomVerticalFlip(p=p),
    K.augmentation.Normalize(torch.Tensor([0.5, 0.5, 0.5])*255, torch.Tensor([0.5, 0.5, 0.5])*255, p=p),
)


torchvision_transform = T.Compose([
    T.RandomCrop([16, 16]),
    T.ColorJitter(0.2, 0.2, 0.2, 0.125),  #Randomly change the brightness, contrast, saturation and hue of an image.
    T.GaussianBlur(3, (0.1, 3.0)),
    T.RandomRotation([-10, 10], T.InterpolationMode.BILINEAR),
    T.RandomHorizontalFlip(p=p),
    T.RandomVerticalFlip(p=p),
    T.ToTensor(),
    T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])


In [4]:
# torchivison - PIL loader

root_path = '/home/aiteam/tykim/dataset/CIFAR-10-images/train'
custom_ds = CustomDataset(root_path, loader_type='pil', transform=torchvision_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)

In [37]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
pil_time = time.time() - start_time
simple_load_times.append(pil_time)
print(str(simple_load_times) + ' sec')

[8.245321273803711] sec
[8.256505012512207] sec
[8.146782159805298] sec
[8.263944387435913] sec
[8.702878475189209] sec
[8.481176137924194] sec
[8.087376117706299] sec
[8.03583288192749] sec
[8.608634948730469] sec
8.31 s ± 120 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [3]:
# jpeg4py + albumentation 
root_path = '/home/aiteam/tykim/dataset/CIFAR-10-images/train'
custom_ds = CustomAlbDataset(root_path, loader_type='jpeg4py', transform=albumentations_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)

In [4]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jpeg4py_alb_time = time.time() - start_time
simple_load_times.append(jpeg4py_alb_time)
print(str(simple_load_times) + ' sec')

[5.560781002044678] sec
[3.701923131942749] sec
[3.6266701221466064] sec
[3.5955018997192383] sec
[3.5598294734954834] sec
[3.582918167114258] sec
[3.7467784881591797] sec
[3.6714329719543457] sec
[3.7053630352020264] sec
3.86 s ± 312 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [40]:
# jpeg4py + kornia-cpu
root_path = '/home/aiteam/tykim/dataset/CIFAR-10-images/train'
custom_ds = CustomKorDataset(root_path, loader_type='jpeg4py', transform=kornia_transform)
dataloader = DataLoader(custom_ds, batch_size=128, shuffle=False, num_workers=8)

In [41]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for image, label in dataloader:
    image = image.cuda()
    label = label.cuda()
    pass
jpeg4py_kornia_time = time.time() - start_time
simple_load_times.append(jpeg4py_kornia_time)
print(str(simple_load_times) + ' sec')

[23.816331148147583] sec
[23.018821239471436] sec
[23.119860410690308] sec
[26.715492725372314] sec
[22.96969747543335] sec
[23.019802808761597] sec
[22.935904502868652] sec
[23.335483074188232] sec
[23.39503240585327] sec
23.6 s ± 456 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [42]:
# DALI 
from nvidia.dali.pipeline import pipeline_def
import nvidia.dali.types as types
import nvidia.dali.fn as fn
from nvidia.dali.plugin.pytorch import DALIGenericIterator

root_path = '/home/aiteam/tykim/dataset/CIFAR-10-images/train'

@pipeline_def(batch_size=128, num_threads=8, device_id=0)
def get_dali_pipeline(data_dir, dali_cpu=False, crop=16):
  dali_device = 'cpu' if dali_cpu else 'gpu'
  decoder_device = 'cpu' if dali_cpu else 'mixed'
  # w, h = int(crop * 120), int(crop * 400)
  # imagebytes = w * h * 3 * 4
  
  img_files, labels = fn.readers.file(file_root=data_dir, random_shuffle=False, name="Reader")
  
  # Load and Crop
  # 이미지 사이즈 힌트
  preallocate_width_hint = 32 if decoder_device == 'mixed' else 0
  preallocate_height_hint = 32 if decoder_device == 'mixed' else 0
  
  # images = fn.decoders.image(img_files, device="mixed")
  # Decode and Random Crop
  images = fn.decoders.image_random_crop(img_files, device=decoder_device, preallocate_width_hint=preallocate_width_hint,
                                         preallocate_height_hint=preallocate_height_hint,
                                         output_type=types.RGB, random_aspect_ratio=[0.8, 1.25], random_area=[0.1, 1.0])
  
  # Resize
  images = fn.resize(images,device=dali_device,resize_x=crop, resize_y=crop,interp_type=types.INTERP_TRIANGULAR)
  # Jitter 
  images = fn.color_twist(images, brightness=0.2, contrast=0.2, saturation=0.2, hue=0.125)
  # GaussianBlur
  images = fn.gaussian_blur(images, device = dali_device, # bytes_per_sample_hint = imagebytes, 
                   sigma = fn.random.uniform(range = (0.1, 3.0)), window_size = 3)
  # # Random Rotation
  images = fn.rotate(images, device=dali_device, angle=fn.random.uniform(range=(-10, 10)), keep_size=True) 
  # Horizontal F
  images = fn.flip(images, device=dali_device, horizontal=1)
  # Vertical F
  images = fn.flip(images, device=dali_device, horizontal=0)
  # Normalization
  images = fn.crop_mirror_normalize(images, device=dali_device,
                                    dtype=types.FLOAT,
                                    mean = [0.5*255, 0.5*255, 0.5*255],
                                    std = [0.5*255, 0.5*255, 0.5*255])
  return images, labels.gpu()

pipe = get_dali_pipeline(data_dir=root_path)
pipe.build()

dataloader = DALIGenericIterator(pipe, ['data', 'label'],reader_name='Reader')


In [43]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for i, data in enumerate(dataloader):
  x, y = data[0]['data'], data[0]['label']  

dali_time = time.time() - start_time
simple_load_times.append(dali_time)
print(str(simple_load_times) + ' sec') 

[1.595245599746704] sec
[1.5693223476409912] sec
[1.5602564811706543] sec
[1.576174020767212] sec
[1.5647480487823486] sec
[1.5459952354431152] sec
[1.4748666286468506] sec
[1.4407780170440674] sec
[1.4309241771697998] sec
1.53 s ± 56.6 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [44]:
from ffcv.writer import DatasetWriter
from ffcv.fields import IntField, RGBImageField
from torchvision.datasets import ImageFolder

from ffcv.fields.decoders import IntDecoder, SimpleRGBImageDecoder, RandomResizedCropRGBImageDecoder
from ffcv.loader import Loader, OrderOption
from ffcv.transforms import ToDevice, ToTensor, ToTorchImage, NormalizeImage, RandomHorizontalFlip



beton_path = '/home/aiteam/tykim/scratch/data_loaders/cifar10_io_test.beton'

In [45]:
# Random resized crop
decoder = RandomResizedCropRGBImageDecoder(output_size=(16, 16))# SimpleRGBImageDecoder()

# Data decoding and augmentation
image_pipeline = [decoder,  RandomHorizontalFlip(flip_prob=1.0),
                  ToTensor(), ToTorchImage(), ToDevice('cuda:0', non_blocking=True),]
                  #NormalizeImage(mean, std)] #Cutout(),
label_pipeline = [IntDecoder(), ToTensor(), ToDevice('cuda:0')]

# Pipeline for each data field
pipelines = {
    'image': image_pipeline,
    'label': label_pipeline
}

# Replaces PyTorch data loader (`torch.utils.data.Dataloader`)
loader = Loader(beton_path, batch_size=128, num_workers=8,
                order=OrderOption.RANDOM, pipelines=pipelines, os_cache=True)


kornia_transform_ffcv = nn.Sequential(
    K.augmentation.ColorJitter(0.2, 0.2, 0.2, 0.125, p=1.0),
    K.augmentation.RandomGaussianBlur((3, 3), (0.1, 3.0), p=1.0),
    K.augmentation.RandomRotation([-10, 10], p=1.0),
    K.augmentation.RandomVerticalFlip(p=1.0),
    K.augmentation.Normalize(torch.tensor([0.5*255, 0.5*255, 0.5*255]),
                             torch.tensor([0.5*255, 0.5*255, 0.5*255]), p=1.0))

In [46]:
%%timeit -r 3 -n 3
simple_load_times = []
start_time = time.time()
for batch_idx, data in enumerate(loader):
    inputs, labels = data
    inputs = kornia_transform_ffcv(inputs.to(torch.float32))
ffcv_time = time.time() - start_time
simple_load_times.append(ffcv_time)
print(str(simple_load_times) + ' sec') 

[9.300191879272461] sec
[4.742574214935303] sec
[4.702749729156494] sec
[4.74065637588501] sec
[4.718742847442627] sec
[4.7166428565979] sec
[4.7094902992248535] sec
[4.333574533462524] sec
[3.7038724422454834] sec
5.07 s ± 853 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
