In [1]:
import torch, math

import threading
from torch.multiprocessing import Event
from torch._six import queue
try:
    from nvidia.dali.plugin.pytorch import DALIClassificationIterator
    from nvidia.dali.pipeline import Pipeline
    import nvidia.dali.ops as ops
    import nvidia.dali.types as types
except ImportError:
    raise ImportError("Please install DALI from https://www.github.com/NVIDIA/DALI to run this example.")


class HybridTrainPipe(Pipeline):
    """
    DALI pipeline producing ImageNet-style training batches.

    Adapted from the official example:
    https://github.com/NVIDIA/DALI/blob/master/docs/examples/pytorch/resnet50/main.py
    Compared with that example, the CPU variant keeps more of the work on the CPU, reducing GPU load & memory use.

    Preprocessing applied:
    -random resized crop
    -random horizontal flip

    Args:
        batch_size (int): how many samples per batch to load
        num_threads (int): how many DALI workers to use for data loading
        device_id (int): GPU device ID
        data_dir (str): dataset directory, laid out as class subfolders (same format as the torchvision dataloader)
        crop (int): output image size (typically 224 for ImageNet)
        mean (tuple): per-channel mean (only applied inside the pipeline by the GPU variant)
        std (tuple): per-channel standard deviation (only applied inside the pipeline by the GPU variant)
        local_rank (int, optional, default = 0): id of the shard to read
        world_size (int, optional, default = 1): number of shards the data is split into (multi-GPU training)
        dali_cpu (bool, optional, default = False): use the DALI CPU operators instead of the GPU ones
        shuffle (bool, optional, default = True): shuffle the dataset
        fp16 (bool, optional, default = False): output fp16 instead of fp32 (GPU variant only)
        min_crop_size (float, optional, default = 0.08): lower bound of the random crop area
    """

    def __init__(self, batch_size, num_threads, device_id, data_dir, crop,
                 mean, std, local_rank=0, world_size=1, dali_cpu=False, shuffle=True, fp16=False,
                 min_crop_size=0.08):

        # The pipeline is recreated at every epoch, so the seed must be -1 (random seed)
        super().__init__(batch_size, num_threads, device_id, seed=-1)

        # Enabling read_ahead slowed down processing ~40%
        self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size,
                                    random_shuffle=shuffle)

        # Let user decide which pipeline works best with the chosen model
        use_gpu = not dali_cpu
        self.dali_device = "gpu" if use_gpu else "cpu"
        decode_device = "mixed" if use_gpu else "cpu"

        if use_gpu:
            # GPU variant: crop, mirror, normalise and convert to NCHW in a single operator
            self.cmn = ops.CropMirrorNormalize(device="gpu",
                                               output_dtype=types.FLOAT16 if fp16 else types.FLOAT,
                                               output_layout=types.NCHW,
                                               crop=(crop, crop),
                                               image_type=types.RGB,
                                               mean=mean,
                                               std=std,)
        else:
            # CPU variant: only the flip happens here; define_graph applies no normalisation on this path
            self.flip = ops.Flip(device=self.dali_device)

        # To be able to handle all images from full-sized ImageNet, this padding sets the size of the internal
        # nvJPEG buffers without additional reallocations
        if decode_device == "mixed":
            device_memory_padding, host_memory_padding = 211025920, 140544512
        else:
            device_memory_padding, host_memory_padding = 0, 0

        self.decode = ops.ImageDecoderRandomCrop(device=decode_device, output_type=types.RGB,
                                                 device_memory_padding=device_memory_padding,
                                                 host_memory_padding=host_memory_padding,
                                                 random_aspect_ratio=[0.8, 1.25],
                                                 random_area=[min_crop_size, 1.0],
                                                 num_attempts=100)

        # Resize as desired.  To match torchvision data loader, use triangular interpolation.
        self.res = ops.Resize(device=self.dali_device, resize_x=crop, resize_y=crop,
                              interp_type=types.INTERP_TRIANGULAR)

        self.coin = ops.CoinFlip(probability=0.5)
        print('DALI "{0}" variant'.format(self.dali_device))

    def define_graph(self):
        """Wire the operators together and return the [images, labels] outputs."""
        mirror_flag = self.coin()
        self.jpegs, self.labels = self.input(name="Reader")

        # Combined decode & random crop, followed by the resize to the target size
        resized = self.res(self.decode(self.jpegs))

        if self.dali_device == "gpu":
            batch = self.cmn(resized, mirror=mirror_flag)
        else:
            # CPU backend uses torch to apply mean & std
            batch = self.flip(resized, horizontal=mirror_flag)

        self.labels = self.labels.gpu()
        return [batch, self.labels]


class HybridValPipe(Pipeline):
    """
    DALI pipeline producing ImageNet-style validation batches.

    Adapted from the official example:
    https://github.com/NVIDIA/DALI/blob/master/docs/examples/pytorch/resnet50/main.py
    Compared with that example, the CPU variant keeps more of the work on the CPU, reducing GPU load & memory use.

    Preprocessing applied:
    -resize (shorter side) to the specified size
    -center crop to the desired size

    Args:
        batch_size (int): how many samples per batch to load
        num_threads (int): how many DALI workers to use for data loading
        device_id (int): GPU device ID
        data_dir (str): dataset directory, laid out as class subfolders (same format as the torchvision dataloader)
        crop (int): output image size (typically 224 for ImageNet)
        size (int): resize size (typically 256 for ImageNet)
        mean (tuple): per-channel mean (only applied inside the pipeline by the GPU variant)
        std (tuple): per-channel standard deviation (only applied inside the pipeline by the GPU variant)
        local_rank (int, optional, default = 0): id of the shard to read
        world_size (int, optional, default = 1): number of shards the data is split into (multi-GPU training)
        dali_cpu (bool, optional, default = False): use the DALI CPU operators instead of the GPU ones
        shuffle (bool, optional, default = False): shuffle the dataset
        fp16 (bool, optional, default = False): output fp16 instead of fp32 (GPU variant only)
    """

    def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size,
                 mean, std, local_rank=0, world_size=1, dali_cpu=False, shuffle=False, fp16=False):

        # The pipeline is recreated at every epoch, so the seed must be -1 (random seed)
        super().__init__(batch_size, num_threads, device_id, seed=-1)

        # Enabling read_ahead slowed down processing ~40%
        # Note: initial_fill is for the shuffle buffer.  As we only want to see every example once, this is set to 1
        self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size,
                                    random_shuffle=shuffle, initial_fill=1)

        use_gpu = not dali_cpu
        self.dali_device = "gpu" if use_gpu else "cpu"
        decode_device = "mixed" if use_gpu else "cpu"

        if use_gpu:
            # GPU variant: crop, normalise and convert to NCHW in a single operator
            self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                                output_dtype=types.FLOAT16 if fp16 else types.FLOAT,
                                                output_layout=types.NCHW,
                                                crop=(crop, crop),
                                                image_type=types.RGB,
                                                mean=mean,
                                                std=std)
        else:
            # CPU variant: only the crop happens here; define_graph applies no normalisation on this path
            self.crop = ops.Crop(device="cpu", crop=(crop, crop))

        self.decode = ops.ImageDecoder(device=decode_device, output_type=types.RGB)

        # Resize to desired size.  To match torchvision dataloader, use triangular interpolation
        self.res = ops.Resize(device=self.dali_device, resize_shorter=size,
                              interp_type=types.INTERP_TRIANGULAR)

    def define_graph(self):
        """Wire the operators together and return the [images, labels] outputs."""
        self.jpegs, self.labels = self.input(name="Reader")
        resized = self.res(self.decode(self.jpegs))

        if self.dali_device == 'gpu':
            batch = self.cmnp(resized)
        else:
            # CPU backend uses torch to apply mean & std
            batch = self.crop(resized)

        self.labels = self.labels.gpu()
        return [batch, self.labels]


class DaliIterator():
    """
    Wrapper class to decode the DALI iterator output & provide iterator that functions the same as torchvision
    pipelines (Pipeline): DALI pipelines
    size (int): Number of examples in set
    Note: allow extra inputs to keep compatibility with CPU iterator
    """

    def __init__(self, pipelines, size, **kwargs):
        self._dali_iterator = DALIClassificationIterator(pipelines=pipelines, size=size)

    def __iter__(self):
        return self

    def __len__(self):
        return int(math.ceil(self._dali_iterator._size / self._dali_iterator.batch_size))


class DaliIteratorGPU(DaliIterator):
    """
    Iterator over the output of a DALI pipeline that already produces finished batches.

    Each step returns an ``(input, target)`` pair taken from the 'data' and 'label' entries of the
    first pipeline's output.  When the underlying DALI iterator is exhausted it is reset, so the
    same object can be iterated again for the next epoch.

    pipelines (Pipeline): DALI pipelines
    size (int): Number of examples in set
    Note: extra keyword arguments are accepted (and ignored) to keep compatibility with the CPU iterator
    """

    def __next__(self):
        try:
            data = next(self._dali_iterator)
        except StopIteration:
            print('Resetting DALI loader')
            self._dali_iterator.reset()
            raise

        # Decode the data output
        input = data[0]['data']
        # Only drop the trailing singleton dimension: a bare squeeze() would also remove the batch
        # dimension of a one-sample batch and hand back a 0-d target.
        # NOTE(review): assumes DALI emits labels shaped (batch, 1) - confirm
        target = data[0]['label'].squeeze(-1).long()

        return input, target


def _preproc_worker(dali_iterator, cuda_stream, fp16, mean, std, output_queue, proc_next_input, done_event, pin_memory):
    """
    Worker function to parse DALI output & apply final pre-processing steps
    """

    while not done_event.is_set():
        # Wait until main thread signals to proc_next_input -- normally once it has taken the last processed input
        proc_next_input.wait()
        proc_next_input.clear()

        if done_event.is_set():
            print('Shutting down preproc thread')
            break

        try:
            data = next(dali_iterator)

            # Decode the data output
            input_orig = data[0]['data']
            target = data[0]['label'].squeeze().long()  # DALI should already output target on device

            # Copy to GPU and apply final processing in separate CUDA stream
            with torch.cuda.stream(cuda_stream):
                input = input_orig
                if pin_memory:
                    input = input.pin_memory()
                    del input_orig  # Save memory
                input = input.cuda(non_blocking=True)

                input = input.permute(0, 3, 1, 2)

                # Input tensor is kept as 8-bit integer for transfer to GPU, to save bandwidth
                if fp16:
                    input = input.half()
                else:
                    input = input.float()

                input = input.sub_(mean).div_(std)

            # Put the result on the queue
            output_queue.put((input, target))

        except StopIteration:
            print('Resetting DALI loader')
            dali_iterator.reset()
            output_queue.put(None)


class DaliIteratorCPU(DaliIterator):
    """
    Wrapper class to decode the DALI iterator output & provide iterator that functions the same as torchvision
    Note that permutation to channels first, converting from 8 bit to float & normalization are all performed on GPU

    A daemon thread running ``_preproc_worker`` prepares one batch ahead on a dedicated CUDA stream;
    ``__next__`` hands that batch over and signals the worker to prepare the following one.

    pipelines (Pipeline): DALI pipelines
    size (int): Number of examples in set
    fp16 (bool): Use fp16 as output format, f32 otherwise
    mean (tuple): Image mean value for each channel
    std (tuple): Image standard deviation value for each channel
    pin_memory (bool): Transfer input tensor to pinned memory, before moving to GPU
    """
    def __init__(self, fp16=False, mean=(0., 0., 0.), std=(1., 1., 1.), pin_memory=True, **kwargs):
        super().__init__(**kwargs)
        print('Using DALI CPU iterator')
        # Side stream on which the worker issues the host-to-device copy & normalisation
        self.stream = torch.cuda.Stream()

        self.fp16 = fp16
        # Shaped (1, 3, 1, 1) so they broadcast over channels-first batches
        self.mean = torch.tensor(mean).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor(std).cuda().view(1, 3, 1, 1)
        self.pin_memory = pin_memory

        if self.fp16:
            self.mean = self.mean.half()
            self.std = self.std.half()

        self.proc_next_input = Event()
        self.done_event = Event()
        self.output_queue = queue.Queue(maxsize=5)
        self.preproc_thread = threading.Thread(
            target=_preproc_worker,
            kwargs={'dali_iterator': self._dali_iterator, 'cuda_stream': self.stream, 'fp16': self.fp16, 'mean': self.mean, 'std': self.std, 'proc_next_input': self.proc_next_input, 'done_event': self.done_event, 'output_queue': self.output_queue, 'pin_memory': self.pin_memory})
        self.preproc_thread.daemon = True
        self.preproc_thread.start()

        # Kick off processing of the first batch
        self.proc_next_input.set()

    def __next__(self):
        # Blocks until the worker has queued the batch, i.e. after it issued that batch's GPU work
        data = self.output_queue.get()
        # Make the consumer stream wait for that work.  This must follow get(): issued earlier, the
        # wait could miss kernels the worker had not submitted yet.  It precedes the signal below so
        # the next batch's work is not waited on as well.
        torch.cuda.current_stream().wait_stream(self.stream)
        self.proc_next_input.set()
        if data is None:
            # None is the worker's end-of-epoch sentinel
            raise StopIteration
        return data

    def __del__(self):
        # Ask the worker to stop, wake it up if it is waiting, then wait for it to finish
        self.done_event.set()
        self.proc_next_input.set()
        torch.cuda.current_stream().wait_stream(self.stream)
        self.preproc_thread.join()


class DaliIteratorCPUNoPrefetch(DaliIterator):
    """
    Wrapper class to decode the DALI iterator output & provide iterator that functions the same as torchvision
    Note that permutation to channels first, converting from 8 bit to float & normalization are all performed on GPU

    Unlike DaliIteratorCPU, every batch is processed synchronously inside ``__next__``; there is no
    background thread.  When the underlying DALI iterator is exhausted it is reset, so the same
    object can be iterated again for the next epoch.

    pipelines (Pipeline): DALI pipelines
    size (int): Number of examples in set
    fp16 (bool): Use fp16 as output format, f32 otherwise
    mean (tuple): Image mean value for each channel
    std (tuple): Image standard deviation value for each channel
    pin_memory (bool): Stored on the instance; __next__ does not currently use it
    """
    def __init__(self, fp16, mean, std, pin_memory=True, **kwargs):
        super().__init__(**kwargs)
        print('Using DALI CPU iterator')

        # Created for parity with DaliIteratorCPU; __next__ does not currently use it
        self.stream = torch.cuda.Stream()

        self.fp16 = fp16
        # Shaped (1, 3, 1, 1) so they broadcast over channels-first batches
        self.mean = torch.tensor(mean).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor(std).cuda().view(1, 3, 1, 1)
        self.pin_memory = pin_memory

        if self.fp16:
            self.mean = self.mean.half()
            self.std = self.std.half()

    def __next__(self):
        try:
            data = next(self._dali_iterator)
        except StopIteration:
            # Rewind for the next epoch, as the other DALI iterator wrappers do
            print('Resetting DALI loader')
            self._dali_iterator.reset()
            raise

        # Decode the data output
        input = data[0]['data']
        # Only drop the trailing singleton dimension: a bare squeeze() would also remove the batch
        # dimension of a one-sample batch and hand back a 0-d target.
        # NOTE(review): assumes DALI emits labels shaped (batch, 1), already on device - confirm
        target = data[0]['label'].squeeze(-1).long()

        # Copy to GPU & apply final processing (on the current CUDA stream)
        input = input.cuda(non_blocking=True)

        # Move the channel axis (last) in front of the two spatial axes
        input = input.permute(0, 3, 1, 2)

        # Input tensor is transferred to GPU as 8 bit, to save bandwidth
        if self.fp16:
            input = input.half()
        else:
            input = input.float()

        input = input.sub_(self.mean).div_(self.std)
        return input, target

ImportError: Please install DALI from https://www.github.com/NVIDIA/DALI to run this example.

In [6]:
import os, gc, time
import numpy as np
import torch
import importlib

import torchvision.transforms as transforms
import torchvision.datasets as datasets

def clear_memory(verbose=False):
    """
    Release cached CUDA memory (when CUDA is available) and run the Python garbage collector.

    verbose (bool): print how long the clean-up took
    """
    started_at = time.time()

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        # Wait for pending GPU work before handing cached blocks back
        torch.cuda.synchronize()
        torch.cuda.empty_cache()  # https://forums.fast.ai/t/clearing-gpu-memory-pytorch/14637
    gc.collect()

    if not verbose:
        return
    elapsed = time.time() - started_at
    print('Cleared memory.  Time taken was %f secs' % elapsed)


class Dataset():
    """
    Pytorch Dataloader, with torchvision or Nvidia DALI CPU/GPU pipelines.
    This dataloader implements ImageNet style training preprocessing, namely:
    -random resized crop
    -random horizontal flip
    And ImageNet style validation preprocessing, namely:
    -resize to specified size
    -center crop to desired size

    Validation loaders are only built when ``self.valdir`` is set; it is ``None`` by default, in which
    case ``get_nb_val()`` returns 0 and ``get_val_loader()`` raises ``RuntimeError``.

    data_dir (str): Directory to dataset (currently unused: training data is read from a fixed path)
    batch_size (int): how many samples per batch to load
    size (int): Output size (typically 224 for ImageNet)
    val_batch_size (int): Validation batch size (defaults to batch_size)
    val_size (int): Validation pipeline resize size (typically 256 for ImageNet)
    min_crop_size (float): Minimum random crop size
    workers (int): how many workers to use for data loading
    world_size (int, optional, default = 1) - Partition the data into this many parts (used for multiGPU training)
    cuda (bool): Output tensors on CUDA, CPU otherwise
    use_dali (bool): Use Nvidia DALI backend, torchvision otherwise
    dali_cpu (bool): Use Nvidia DALI cpu backend, GPU backend otherwise
    fp16 (bool, optional, default = False) - Output the data in fp16 instead of fp32
    mean (tuple): Image mean value for each channel
    std (tuple): Image standard deviation value for each channel
    pin_memory (bool): Transfer CPU tensor to pinned memory before transfer to GPU (torchvision only)
    pin_memory_dali (bool): Transfer CPU tensor to pinned memory before transfer to GPU (dali only)
    """

    def __init__(self,
                 data_dir,
                 batch_size,
                 size=224,
                 val_batch_size=None,
                 val_size=256,
                 min_crop_size=0.08,
                 workers=4,
                 world_size=1,
                 cuda=True,
                 use_dali=False,
                 dali_cpu=True,
                 fp16=False,
                 mean=(0.485 * 255, 0.456 * 255, 0.406 * 255),
                 std=(0.229 * 255, 0.224 * 255, 0.225 * 255),
                 pin_memory=True,
                 pin_memory_dali=False,
                 ):

        self.batch_size = batch_size
        self.size = size
        self.val_batch_size = val_batch_size
        self.min_crop_size = min_crop_size
        self.workers = workers
        self.world_size = world_size
        self.cuda = cuda
        self.use_dali = use_dali
        self.dali_cpu = dali_cpu
        self.fp16 = fp16
        self.mean = mean
        self.std = std
        self.pin_memory = pin_memory
        self.pin_memory_dali = pin_memory_dali

        self.val_size = val_size
        if self.val_size is None:
            self.val_size = self.size

        if self.val_batch_size is None:
            self.val_batch_size = self.batch_size

        # Data loading code
        # NOTE: data_dir is not used; training images are read from this fixed location
        self.traindir = "../../storage/inpainting-dataset"
        # No validation split is configured.  Assign a directory here (e.g. os.path.join(data_dir, 'val'))
        # to have the validation loaders built as well.
        self.valdir = None

        # Filled in by the _build_* methods; None means "not built"
        self.train_loader = None
        self.val_loader = None
        self.train_pipe = None
        self.val_pipe = None
        self.train_sampler = None
        self.val_sampler = None

        # DALI Dataloader
        if self.use_dali:
            print('Using Nvidia DALI dataloader')
            self._build_dali_pipeline()

        # Standard torchvision dataloader
        else:
            print('Using torchvision dataloader')
            self._build_torchvision_pipeline()

    def _release_loaders(self):
        """Drop every reference to loaders, pipelines & samplers so that they can be garbage collected."""
        for attr in ('train_loader', 'val_loader', 'train_pipe', 'val_pipe',
                     'train_sampler', 'val_sampler'):
            setattr(self, attr, None)

    def _build_torchvision_pipeline(self):
        """Create the torchvision DataLoaders (the validation one only when valdir is set)."""
        if self.world_size > 1:
            raise NotImplementedError('distributed support not tested yet...')

        preproc_train = [transforms.RandomResizedCrop(self.size, scale=(self.min_crop_size, 1.0)),
                         transforms.RandomHorizontalFlip(),
                         ]
        train_dataset = datasets.ImageFolder(self.traindir, transforms.Compose(preproc_train))

        self.train_sampler = None
        self.val_sampler = None

        self.train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=(self.train_sampler is None),
            num_workers=self.workers, pin_memory=self.pin_memory, sampler=self.train_sampler, collate_fn=fast_collate)

        self.val_loader = None
        if self.valdir is not None:
            preproc_val = [transforms.Resize(self.val_size),
                           transforms.CenterCrop(self.size),
                           ]
            val_dataset = datasets.ImageFolder(self.valdir, transforms.Compose(preproc_val))
            self.val_loader = torch.utils.data.DataLoader(
                val_dataset, batch_size=self.val_batch_size, shuffle=False, num_workers=self.workers,
                pin_memory=self.pin_memory, sampler=self.val_sampler, collate_fn=fast_collate)

    def _build_dali_pipeline(self, val_on_cpu=True):
        """
        Create the DALI pipelines & iterators (the validation ones only when valdir is set).

        val_on_cpu (bool): run the validation pipeline with the DALI CPU backend
        """
        assert self.world_size == 1, 'Distributed support not tested yet'

        iterator_train = DaliIteratorCPU if self.dali_cpu else DaliIteratorGPU

        self.train_pipe = HybridTrainPipe(batch_size=self.batch_size, num_threads=self.workers, device_id=0,
                                          data_dir=self.traindir, crop=self.size, dali_cpu=self.dali_cpu,
                                          mean=self.mean, std=self.std, local_rank=0,
                                          world_size=self.world_size, shuffle=True, fp16=self.fp16, min_crop_size=self.min_crop_size)

        self.train_pipe.build()
        # Integer division: the iterator size is an example count, not a float
        self.train_loader = iterator_train(pipelines=self.train_pipe, size=self.get_nb_train() // self.world_size, fp16=self.fp16, mean=self.mean, std=self.std, pin_memory=self.pin_memory_dali)

        self.val_pipe = None
        self.val_loader = None
        if self.valdir is not None:
            iterator_val = DaliIteratorCPU if val_on_cpu else DaliIteratorGPU

            self.val_pipe = HybridValPipe(batch_size=self.val_batch_size, num_threads=self.workers, device_id=0,
                                          data_dir=self.valdir, crop=self.size, size=self.val_size, dali_cpu=val_on_cpu,
                                          mean=self.mean, std=self.std, local_rank=0,
                                          world_size=self.world_size, shuffle=False, fp16=self.fp16)

            self.val_pipe.build()
            self.val_loader = iterator_val(pipelines=self.val_pipe, size=self.get_nb_val() // self.world_size, fp16=self.fp16, mean=self.mean, std=self.std, pin_memory=self.pin_memory_dali)

    def _get_torchvision_loader(self, loader):
        """Wrap a torchvision DataLoader in the iterator that performs the final preprocessing."""
        return TorchvisionIterator(loader=loader,
                                   cuda=self.cuda,
                                   fp16=self.fp16,
                                   mean=self.mean,
                                   std=self.std,
                                   )

    def get_train_loader(self):
        """
        Creates & returns an iterator for the training dataset
        :return: Dataset iterator object
        """
        if self.use_dali:
            return self.train_loader
        return self._get_torchvision_loader(loader=self.train_loader)

    def get_val_loader(self):
        """
        Creates & returns an iterator for the validation dataset
        :return: Dataset iterator object
        :raises RuntimeError: if no validation loader was built (valdir is not set)
        """
        if self.val_loader is None:
            raise RuntimeError('No validation loader available: valdir is not set')
        if self.use_dali:
            return self.val_loader
        return self._get_torchvision_loader(loader=self.val_loader)

    def get_nb_train(self):
        """
        :return: Number of training examples
        """
        if self.use_dali:
            return int(self.train_pipe.epoch_size("Reader"))
        return len(datasets.ImageFolder(self.traindir))

    def get_nb_val(self):
        """
        :return: Number of validation examples (0 when no validation data is configured)
        """
        if self.use_dali:
            if self.val_pipe is None:
                return 0
            return int(self.val_pipe.epoch_size("Reader"))
        if self.valdir is None:
            return 0
        return len(datasets.ImageFolder(self.valdir))

    def prep_for_val(self):
        """Rebuild the DALI loaders with the validation pipeline on the GPU backend (no-op without DALI)."""
        self.reset(val_on_cpu=False)

    # This is needed only for DALI
    def reset(self, val_on_cpu=True):
        """
        Tear down & rebuild the DALI loaders; does nothing for the torchvision backend.

        val_on_cpu (bool): run the rebuilt validation pipeline with the DALI CPU backend
        """
        if self.use_dali:
            clear_memory()

            # Currently we need to delete & rebuild the dali pipeline every epoch,
            # due to a memory leak somewhere in DALI
            print('Recreating DALI dataloaders to reduce memory usage')
            self._release_loaders()
            clear_memory()

            # taken from: https://stackoverflow.com/questions/1254370/reimport-a-module-in-python-while-interactive
            # Only reload when a standalone `dali` module has actually been imported; when the
            # pipeline classes are defined alongside this class there is nothing to reload.
            import sys
            dali_module = sys.modules.get('dali')
            if dali_module is not None:
                importlib.reload(dali_module)

            self._build_dali_pipeline(val_on_cpu=val_on_cpu)

    def set_train_batch_size(self, train_batch_size):
        """Change the training batch size and rebuild the loaders of the active backend."""
        self.batch_size = train_batch_size
        self._release_loaders()
        if self.use_dali:
            self._build_dali_pipeline()
        else:
            self._build_torchvision_pipeline()

    def get_nb_classes(self):
        """
        :return: The number of classes in the dataset - as indicated by the validation dataset when one
                 is configured, by the training dataset otherwise
        """
        class_root = self.valdir if self.valdir is not None else self.traindir
        return len(datasets.ImageFolder(class_root).classes)


def fast_collate(batch):
    """
    Collate (image, label) samples into a uint8 image tensor and an int64 label tensor.

    The output image tensor has shape (N, 3, H, W), with W and H read from the ``size`` attribute of
    the first image.  Images with fewer than three dimensions get a trailing channel axis, which is
    then broadcast across the three output channels.

    batch: sequence of (image, label) pairs
    :return: (images, labels) tuple of tensors
    """
    images = [sample[0] for sample in batch]
    labels = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)

    first_size = images[0].size
    width, height = first_size[0], first_size[1]
    collated = torch.zeros((len(images), 3, height, width), dtype=torch.uint8)

    for idx, image in enumerate(images):
        pixels = np.asarray(image, dtype=np.uint8)
        if pixels.ndim < 3:
            pixels = pixels[..., np.newaxis]
        # Channels-last -> channels-first
        channels_first = np.moveaxis(pixels, 2, 0)
        collated[idx] += torch.from_numpy(channels_first)

    return collated, labels


class TorchvisionIterator():
    """
    Iterator applying the last preprocessing steps to batches coming from a DataLoader:
    -transfer to device (done on 8 bit tensor to reduce bandwidth requirements)
    -convert to fp32/fp16 tensor
    -apply mean/std scaling

    loader (DataLoader): Torchvision Dataloader
    cuda (bool): Transfer tensor to CUDA device
    fp16 (bool): Convert tensor to fp16 instead of fp32
    mean (tuple): Image mean value for each channel
    std (tuple): Image standard deviation value for each channel
    """
    def __init__(self,
                 loader,
                 cuda=False,
                 fp16=False,
                 mean=(0., 0., 0.),
                 std=(1., 1., 1.),
                 ):
        print('Using Torchvision iterator')
        self.loader = iter(loader)
        self.cuda = cuda
        self.fp16 = fp16

        def _as_stat(values):
            # (1, 3, 1, 1) so the statistic broadcasts over channels-first batches
            stat = torch.tensor(values).view(1, 3, 1, 1)
            if cuda:
                stat = stat.cuda()
            if fp16:
                stat = stat.half()
            return stat

        self.mean = _as_stat(mean)
        self.std = _as_stat(std)

    def __iter__(self):
        return self

    def __next__(self):
        batch, labels = next(self.loader)

        if self.cuda:
            batch, labels = batch.cuda(), labels.cuda()

        batch = batch.half() if self.fp16 else batch.float()

        # Normalise in place
        batch = batch.sub_(self.mean).div_(self.std)
        return batch, labels

    def __len__(self):
        # Delegates to the wrapped loader iterator
        return len(self.loader)

In [7]:
# Build the data wrapper with the DALI CPU backend and fp16 output
dataset = Dataset(
    data_dir="./",
    batch_size=16,
    val_batch_size=16,
    workers=4,
    world_size=1,
    use_dali=True,
    dali_cpu=True,
    fp16=True,
)


Using Nvidia DALI dataloader
DALI "cpu" variant




Using DALI CPU iterator


In [17]:
# Iterator over the training set of the dataset built above
loader = dataset.get_train_loader()

In [None]:
# Time one full pass over the training loader, moving every batch to the GPU
start = time.perf_counter()
for i, data in enumerate(loader):
    images = data[0].cuda(non_blocking=True)
    labels = data[1].cuda(non_blocking=True)
# CUDA work is queued asynchronously: wait for it to finish before reading the clock,
# otherwise the measurement only covers the time needed to enqueue it
if torch.cuda.is_available():
    torch.cuda.synchronize()
end = time.perf_counter()
test_time = end - start
print('[DALI] end test dataloader iteration')
print('[DALI] iteration time: %fs [test]' % (test_time))