In [1]:
import argparse
import os
import shutil
import time
import math

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import numpy as np

from nvidia.dali.plugin.pytorch import DALIClassificationIterator, LastBatchPolicy
from nvidia.dali.pipeline import pipeline_def
import nvidia.dali.types as types
import nvidia.dali.fn as fn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
global DDP, amp, optimizers, parallel
from apex.parallel import DistributedDataParallel as DDP
from apex import amp, optimizers, parallel

In [3]:
def to_python_float(t):
    if hasattr(t, "item"):
        return t.item()
    else:
        return t[0]


@pipeline_def
def create_dali_pipeline(
    data_dir, crop, size, shard_id, num_shards, dali_cpu=False, is_training=True
):
    images, labels = fn.readers.caffe(
        path=data_dir,
        shard_id=shard_id,
        num_shards=num_shards,
        random_shuffle=is_training,
        pad_last_batch=True,
        name="Reader",
    )

    dali_device = "cpu" if dali_cpu else "gpu"
    decoder_device = "cpu" if dali_cpu else "mixed"
    # ask nvJPEG to preallocate memory for the biggest sample in ImageNet for CPU and GPU to avoid reallocations in runtime
    device_memory_padding = 211025920 if decoder_device == "mixed" else 0
    host_memory_padding = 140544512 if decoder_device == "mixed" else 0
    # ask HW NVJPEG to allocate memory ahead for the biggest image in the data set to avoid reallocations in runtime
    preallocate_width_hint = 5980 if decoder_device == "mixed" else 0
    preallocate_height_hint = 6430 if decoder_device == "mixed" else 0
    if is_training:
        images = fn.decoders.image_random_crop(
            images,
            device=decoder_device,
            output_type=types.RGB,
            device_memory_padding=device_memory_padding,
            host_memory_padding=host_memory_padding,
            preallocate_width_hint=preallocate_width_hint,
            preallocate_height_hint=preallocate_height_hint,
            random_aspect_ratio=[0.8, 1.25],
            random_area=[0.1, 1.0],
            num_attempts=100,
        )
        images = fn.resize(
            images,
            device=dali_device,
            resize_x=crop,
            resize_y=crop,
            interp_type=types.INTERP_TRIANGULAR,
        )
        mirror = fn.random.coin_flip(probability=0.5)
    else:
        images = fn.decoders.image(images, device=decoder_device, output_type=types.RGB)
        images = fn.resize(
            images,
            device=dali_device,
            size=size,
            mode="not_smaller",
            interp_type=types.INTERP_TRIANGULAR,
        )
        mirror = False

    images = fn.crop_mirror_normalize(
        images.gpu(),
        dtype=types.FLOAT,
        output_layout="CHW",
        crop=(crop, crop),
        mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
        std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
        mirror=mirror,
    )
    labels = labels.gpu()
    return images, labels

In [4]:
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    for i, data in enumerate(train_loader):
        input = data[0]["data"]
        target = data[0]["label"].squeeze(-1).long()
        train_loader_len = int(math.ceil(train_loader._size / batch_size))

        if prof >= 0 and i == prof:
            print("Profiling begun at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStart()

        if prof >= 0:
            torch.cuda.nvtx.range_push("Body of iteration {}".format(i))

        adjust_learning_rate(optimizer, epoch, i, train_loader_len, lr)
        if test:
            if i > 10:
                break

        # compute output
        if prof >= 0:
            torch.cuda.nvtx.range_push("forward")
        output = model(input)
        if prof >= 0:
            torch.cuda.nvtx.range_pop()
        loss = criterion(output, target)

        # compute gradient and do SGD step
        optimizer.zero_grad()

        if prof >= 0:
            torch.cuda.nvtx.range_push("backward")
        if opt_level is not None:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if prof >= 0:
            torch.cuda.nvtx.range_pop()

        if prof >= 0:
            torch.cuda.nvtx.range_push("optimizer.step()")
        optimizer.step()
        if prof >= 0:
            torch.cuda.nvtx.range_pop()

        if i % print_freq == 0:
            # Every print_freq iterations, check the loss, accuracy, and speed.
            # For best performance, it doesn't make sense to print these metrics every
            # iteration, since they incur an allreduce and some host<->device syncs.

            # Measure accuracy
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            # Average loss and accuracy across processes for logging
            if distributed:
                reduced_loss = reduce_tensor(loss.data)
                prec1 = reduce_tensor(prec1)
                prec5 = reduce_tensor(prec5)
            else:
                reduced_loss = loss.data

            # to_python_float incurs a host<->device sync
            losses.update(to_python_float(reduced_loss), input.size(0))
            top1.update(to_python_float(prec1), input.size(0))
            top5.update(to_python_float(prec5), input.size(0))

            torch.cuda.synchronize()
            batch_time.update((time.time() - end) / print_freq)
            end = time.time()

            if local_rank == 0:
                print(
                    "Epoch: [{0}][{1}/{2}]\t"
                    "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                    "Speed {3:.3f} ({4:.3f})\t"
                    "Loss {loss.val:.10f} ({loss.avg:.4f})\t"
                    "Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                    "Prec@5 {top5.val:.3f} ({top5.avg:.3f})".format(
                        epoch,
                        i,
                        train_loader_len,
                        world_size * batch_size / batch_time.val,
                        world_size * batch_size / batch_time.avg,
                        batch_time=batch_time,
                        loss=losses,
                        top1=top1,
                        top5=top5,
                    )
                )

        # Pop range "Body of iteration {}".format(i)
        if prof >= 0:
            torch.cuda.nvtx.range_pop()

        if prof >= 0 and i == prof + 10:
            print("Profiling ended at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStop()
            quit()

    return batch_time.avg


def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()

    for i, data in enumerate(val_loader):
        input = data[0]["data"]
        target = data[0]["label"].squeeze(-1).long()
        val_loader_len = int(val_loader._size / batch_size)

        # compute output
        with torch.no_grad():
            output = model(input)
            loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        if distributed:
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # TODO:  Change timings to mirror train().
        if local_rank == 0 and i % print_freq == 0:
            print(
                "Test: [{0}/{1}]\t"
                "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                "Speed {2:.3f} ({3:.3f})\t"
                "Loss {loss.val:.4f} ({loss.avg:.4f})\t"
                "Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                "Prec@5 {top5.val:.3f} ({top5.avg:.3f})".format(
                    i,
                    val_loader_len,
                    world_size * batch_size / batch_time.val,
                    world_size * batch_size / batch_time.avg,
                    batch_time=batch_time,
                    loss=losses,
                    top1=top1,
                    top5=top5,
                )
            )

    print(" * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}".format(top1=top1, top5=top5))

    return [top1.avg, top5.avg]


def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, "model_best.pth.tar")


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, step, len_epoch, lr):
    factor = epoch // 30

    if epoch >= 80:
        factor = factor + 1

    lr = lr * (0.1**factor)

    """Warmup"""
    if epoch < 5:
        lr = lr * float(1 + step + epoch * len_epoch) / (5.0 * len_epoch)

    for param_group in optimizer.param_groups:
        param_group["lr"] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= world_size
    return rt

In [5]:
opt_level = "O3"
keep_batchnorm_fp32 = True
loss_scale = "dynamic"
batch_size = 256
sync_bn = True
arch = "resnet50"
channels_last = False
start_epoch = 0
epochs = 100
lr = 0.01
prof = -1
local_rank = 0
workers = 4
momentum = 0.9
weight_decay = 1e-4
print_freq = 10
resume = ""
data_root = os.environ["DALI_EXTRA_PATH"]
traindir = os.path.join(data_root, "db", "lmdb")
valdir = os.path.join(data_root, "db", "lmdb")
test = False
distributed = False
dali_cpu = False

In [6]:
cudnn.benchmark = True
best_prec1 = 0
gpu = 1
world_size = 2
total_batch_size = world_size * batch_size

In [7]:
print("=> opt_level = {}".format(opt_level))
print(
    "=> keep_batchnorm_fp32 = {}".format(keep_batchnorm_fp32), type(keep_batchnorm_fp32)
)
print("=> loss_scale = {}".format(loss_scale), type(loss_scale))
print("=> CUDNN VERSION: {}".format(torch.backends.cudnn.version()))
print("=> creating model '{}'".format(arch))
model = models.__dict__[arch]()

if sync_bn:
    print("=> using apex synced BN")
    model = parallel.convert_syncbn_model(model)

if hasattr(torch, "channels_last") and hasattr(torch, "contiguous_format"):
    if channels_last:
        memory_format = torch.channels_last
    else:
        memory_format = torch.contiguous_format
    model = model.cuda().to(memory_format=memory_format)
else:
    model = model.cuda()

lr = lr * float(batch_size * world_size) / 256.0
optimizer = torch.optim.SGD(
    model.parameters(), lr, momentum=momentum, weight_decay=weight_decay
)

=> opt_level = O3
=> keep_batchnorm_fp32 = True <class 'bool'>
=> loss_scale = dynamic <class 'str'>
=> CUDNN VERSION: 8700
=> creating model 'resnet50'
=> using apex synced BN




In [8]:
model, optimizer = amp.initialize(
    model,
    optimizer,
    opt_level=opt_level,
    keep_batchnorm_fp32=keep_batchnorm_fp32,
    loss_scale=loss_scale,
)

criterion = nn.CrossEntropyLoss().cuda()

Selected optimization level O3:  Pure FP16 training.
Defaults for this optimization level are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : False
loss_scale             : dynamic




In [9]:
crop_size = 224
val_size = 256

In [10]:
pipe = create_dali_pipeline(
    batch_size=batch_size,
    num_threads=workers,
    device_id=local_rank,
    seed=12 + local_rank,
    data_dir=traindir,
    crop=crop_size,
    size=val_size,
    dali_cpu=False,
    shard_id=local_rank,
    num_shards=world_size,
    is_training=True,
)
pipe.build()
train_loader = DALIClassificationIterator(
    pipe, reader_name="Reader", last_batch_policy=LastBatchPolicy.PARTIAL
)

pipe = create_dali_pipeline(
    batch_size=batch_size,
    num_threads=workers,
    device_id=local_rank,
    seed=12 + local_rank,
    data_dir=valdir,
    crop=crop_size,
    size=val_size,
    dali_cpu=False,
    shard_id=local_rank,
    num_shards=world_size,
    is_training=False,
)
pipe.build()
val_loader = DALIClassificationIterator(
    pipe, reader_name="Reader", last_batch_policy=LastBatchPolicy.PARTIAL
)

In [11]:
a = 456.6489754575675
print("{0:.2f}%".format(a))

456.65%


In [12]:
total_time = AverageMeter()
for epoch in range(start_epoch, epochs):
    # train for one epoch
    avg_train_time = train(train_loader, model, criterion, optimizer, epoch)
    total_time.update(avg_train_time)
    if test:
        break

    # evaluate on validation set
    [prec1, prec5] = validate(val_loader, model, criterion)

    # remember best prec@1 and save checkpoint
    if local_rank == 0:
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "arch": arch,
                "state_dict": model.state_dict(),
                "best_prec1": best_prec1,
                "optimizer": optimizer.state_dict(),
            },
            is_best,
        )
        if epoch == epochs - 1:
            print(
                "##Top-1 {0:.2f}%\n"
                "##Top-5 {1:.2f}%\n"
                "##Perf  {2:,.2f}".format(
                    prec1, prec5, total_batch_size / total_time.avg
                )
            )

    train_loader.reset()
    val_loader.reset()

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Epoch: [0][0/1]	Time 0.130 (0.130)	Speed 3948.988 (3948.988)	Loss 7.0319161415 (7.0319)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Test: [0/0]	Time 0.011 (0.011)	Speed 48175.782 (48175.782)	Loss 7.0695 (7.0695)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
 * Prec@1 0.000 Prec@5 0.000
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Epoch: [1][0/1]	Time 0.084 (0.084)	Speed 6130.367 (6130.367)	Loss 6.9391155243 (6.9391)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Test: [0/0]	Time 0.011 (0.011)	Speed 46009.291 (46009.291)	Loss 6.8918 (6.8918)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
 * Prec@1 0.000 Prec@5 0.000
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Epoch: [2][0/1]	Time 0.006 (0.006)	Speed 80642.427 (80642.427)	Loss 7.0344219208 (7.0344)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Test: [0/0]	Time 0.011 (0.011)	Speed 47808.977 (47808.977)	Loss 6.8