In [1]:
import time
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [2]:
# device = torch.device("cpu")
device = torch.device("cuda")
def sync_time():
  if device == torch.device("cuda"):
    torch.cuda.synchronize()

print(device)

cuda


In [3]:
# Resnet Model taken from open source git repo 
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=True
        )
        # self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=True
        )
        # self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    self.expansion * planes,
                    kernel_size=1,
                    stride=stride,
                    bias=True,
                )
                # nn.BatchNorm2d(self.expansion * planes),
            )

    def forward(self, x):
        # out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.conv1(x))
        # out = self.bn2(self.conv2(out))
        out = self.conv2(out)
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=True)
        # self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        # out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.conv1(x))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


def dataloader_split(data_path, num_workers):
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    transform = transforms.Compose(
        [
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomCrop(32, padding=4),
            transforms.ToTensor(),
            normalize,
        ]
    )

    # Data
    data_train = datasets.CIFAR10(
        root=data_path, train=True, download=True, transform=transform
    )
    data_test = datasets.CIFAR10(
        root=data_path, train=False, download=True, transform=transform
    )
    # Loaders
    dataloader_train = DataLoader(data_train, batch_size=128, num_workers=0)
    dataloader_test = DataLoader(data_test, batch_size=100, num_workers=0)
    return dataloader_train, dataloader_test


def get_optimizer(name, params, lr, momentum, weight_decay):
    if name == "SGD":
        opt_kwargs = dict(lr=lr, momentum=momentum, weight_decay=weight_decay)
        return torch.optim.SGD(params, **opt_kwargs)
    elif name == "SGD_NESTEROV":
        opt_kwargs = dict(
            lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True
        )
        return torch.optim.SGD(params, **opt_kwargs)
    elif name == "ADAGRAD":
        opt_kwargs = dict(lr=lr, weight_decay=weight_decay)
        return torch.optim.Adagrad(params, **opt_kwargs)
    elif name == "ADADELTA":
        opt_kwargs = dict(lr=lr, weight_decay=weight_decay)
        return torch.optim.Adadelta(params, **opt_kwargs)
    elif name == "ADAM":
        opt_kwargs = dict(lr=lr, weight_decay=weight_decay)
        return torch.optim.Adam(params, **opt_kwargs)
    else:
        raise TypeError("Optimizer requested is not available")


def train_loop(use_cuda, device, model, loss_fn,
               optimizer, data_gen, steps, data_in_memory):
    data_loading_time = []
    training_time = []
    t0  = time.time()
    i = 0
    def _train_loop(images, labels):
        data_loading_time.append(time.time() - t_data_loading_start)
        # Training phase
        t_training_start = time.time()
        # Forward pass
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        _, preds = torch.max(outputs, 1)
        # Backwards pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        training_accurary = torch.sum(preds == labels) / len(preds)
        sync_time()
        training_time.append(time.time() - t_training_start)
        return training_accurary, loss

    if data_in_memory:
      t_data_loading_start = time.time()
      for images, labels in data_gen:
        training_accurary, loss = _train_loop(images, labels)
        t_data_loading_start = time.time()
        i += 1
        if i % 25 == 0:
          print("PCT done: ", i/steps)
    else:
      while True:
        # Dataloading phase
        t_data_loading_start = time.time()
        try:
            images, labels = next(data_gen)
        except StopIteration:
            break  
        images = images.to(device)
        labels = labels.to(device)
        sync_time()
        training_accurary, loss = _train_loop(images, labels)
        i += 1
        if i % 25 == 0:
          print("PCT done: ", i/steps)
        if i >= steps:
          break    

    return loss,training_accurary, sum(training_time), sum(data_loading_time), time.time() - t0


In [4]:
class Args():
  data_path = "~/"
  cuda = "TRUE"
  num_workers = 2
  opt = "SGD"
  num_epochs = 1

args = Args()
num_workers = int(args.num_workers)
data_path = args.data_path
dataloader_train, dataloader_test = dataloader_split(data_path, 0)

use_cuda = args.cuda
opt_name = args.opt
learning_rate = 0.1
momentum = 0.9
decay = 5e-4
loss_fn = nn.CrossEntropyLoss()

def train_no_opt(model, device):
    optimizer = get_optimizer(
      opt_name, model.parameters(), learning_rate, momentum, decay
    )
    total_steps = len(dataloader_train)
    accuracy = []
    losses = []
    total_time = []
    training_time = []
    data_loading_time = []
    
    # Training model
    data_gen = iter(dataloader_train)
    loss, training_accurary, t_time, d_time, tot_time = train_loop(
        use_cuda,
        device,
        model,
        loss_fn,
        optimizer,
        data_gen,
        total_steps,
        False
    )

    # Keeping track of metrics
    losses.append(loss.item())
    accuracy.append(training_accurary)

    # Time measurements
    total_time.append(tot_time)
    training_time.append(t_time)
    data_loading_time.append(d_time)
    
    print(
        f"Total time: {sum(total_time)}\nTraining time: {sum(training_time)} "
        f"\nData-loading time: {sum(data_loading_time)}\n"
    )

Files already downloaded and verified
Files already downloaded and verified


In [5]:
def eval(model: nn.Module, dataloader: DataLoader, data_in_memory, subset=None) -> float:
    model.eval()
    correct = 0
    total = 0

    data_time = []
    inference_time = []
    i = 0
    with torch.no_grad():
        t0 = time.time()
        for data in dataloader:
            images, labels = data
            if not data_in_memory:
              images = images.to(device)
              labels = labels.to(device)
            sync_time()
            data_time.append(time.time() - t0)
            t1 = time.time()
            outputs = model(images)
            sync_time()
            inference_time.append(time.time() - t1)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            t0 = time.time()
            i += 1
            if subset is not None:
                if i >= subset:
                    break
    print("Data loading took:", sum(data_time))
    print("Forward took:", sum(inference_time))
    return 100 * correct / total

Profiling training of non optimized model

In [6]:
print(device)
model = ResNet18().to(device)
train_no_opt(model, device)

cuda
PCT done:  0.0639386189258312
PCT done:  0.1278772378516624
PCT done:  0.1918158567774936
PCT done:  0.2557544757033248
PCT done:  0.319693094629156
PCT done:  0.3836317135549872
PCT done:  0.4475703324808184
PCT done:  0.5115089514066496
PCT done:  0.5754475703324808
PCT done:  0.639386189258312
PCT done:  0.7033248081841432
PCT done:  0.7672634271099744
PCT done:  0.8312020460358056
PCT done:  0.8951406649616368
PCT done:  0.959079283887468
Total time: 60.14378309249878
Training time: 34.743096113204956 
Data-loading time: 25.358655214309692



Profiling inference of non optimized model





In [7]:
acc = eval(model, iter(dataloader_test), False)
acc

Data loading took: 5.096791982650757
Forward took: 2.1158342361450195


30.67

## 2. Performance optimizations

## 2.1 Dataloader optimization
Loading data in parallel and pinning the memory for faster data transfer between CPU and GPU.

In [8]:
def dataloader_split_opt(data_path, num_workers, device):
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    transform = transforms.Compose(
        [
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomCrop(32, padding=4),
            transforms.ToTensor(),
            normalize,
        ]
    )

    # Data
    data_train = datasets.CIFAR10(
        root=data_path, train=True, download=True, transform=transform
    )
    data_test = datasets.CIFAR10(
        root=data_path, train=False, download=True, transform=transform
    )
    # Loaders
    dataloader_train = DataLoader(data_train,
                                  batch_size=128,
                                  num_workers=num_workers,
                                  pin_memory=True,
                                  )
    dataloader_test = DataLoader(data_test,
                                 batch_size=100,
                                 num_workers=num_workers,
                                 pin_memory=True,
                                 )
    return dataloader_train, dataloader_test

data_train_opt, data_test_opt = dataloader_split_opt(data_path, 4, device)

Files already downloaded and verified
Files already downloaded and verified


In [9]:
def train_dataloader_optimized(model, device, data):
    optimizer = get_optimizer(
      opt_name, model.parameters(), learning_rate, momentum, decay
    )
    total_steps = 391
    accuracy = []
    losses = []
    total_time = []
    training_time = []
    data_loading_time = []
    
    # Training model
    loss, training_accurary, t_time, d_time, tot_time = train_loop(
        use_cuda,
        device,
        model,
        loss_fn,
        optimizer,
        iter(data),
        total_steps,
        False
    )

    # Keeping track of metrics
    losses.append(loss.item())
    accuracy.append(training_accurary)

    # Time measurements
    total_time.append(tot_time)
    training_time.append(t_time)
    data_loading_time.append(d_time)
    
    print(
        f"Total time: {sum(total_time)}\nTraining time: {sum(training_time)} "
        f"\nData-loading time: {sum(data_loading_time)}\n"
    )

In [10]:
print(device)
opt_model = ResNet18().to(device)
train_dataloader_optimized(opt_model, device, data_train_opt)
acc = eval(model, iter(data_test_opt), False)
print(f"acc: {acc}%")

cuda
PCT done:  0.0639386189258312
PCT done:  0.1278772378516624
PCT done:  0.1918158567774936
PCT done:  0.2557544757033248
PCT done:  0.319693094629156
PCT done:  0.3836317135549872
PCT done:  0.4475703324808184
PCT done:  0.5115089514066496
PCT done:  0.5754475703324808
PCT done:  0.639386189258312
PCT done:  0.7033248081841432
PCT done:  0.7672634271099744
PCT done:  0.8312020460358056
PCT done:  0.8951406649616368
PCT done:  0.959079283887468
Total time: 34.125473737716675
Training time: 33.58079671859741 
Data-loading time: 0.4668581485748291

Data loading took: 0.15177607536315918
Forward took: 2.1219701766967773
acc: 30.82%


## 2.2 Torch.jit.script

In [11]:
torch.backends.cudnn.benchmark = True
# torch.jit.enable_onednn_fusion(True)
sample_data, _ = next(iter(dataloader_train))
sample_data = sample_data.to(device)
opt_model.eval()
opt_model_traced = torch.jit.trace(opt_model, sample_data)
opt_model_traced = torch.jit.freeze(opt_model_traced)


In [12]:
warmup_iters = 10
for i in range(warmup_iters):
    eval(opt_model_traced, iter(data_test_opt), False)

acc = eval(opt_model_traced, iter(data_test_opt), False)
print(f"acc: {acc}%")

Data loading took: 0.08617377281188965
Forward took: 0.055468082427978516
Data loading took: 0.08517956733703613
Forward took: 2.3111395835876465
Data loading took: 0.07864594459533691
Forward took: 0.02542567253112793
Data loading took: 0.08821630477905273
Forward took: 0.02588343620300293
Data loading took: 0.08326125144958496
Forward took: 0.024461984634399414
Data loading took: 0.08291435241699219
Forward took: 0.024697065353393555
Data loading took: 0.0662088394165039
Forward took: 0.02559661865234375
Data loading took: 0.07740545272827148
Forward took: 0.025318622589111328
Data loading took: 0.07469415664672852
Forward took: 0.02451491355895996
Data loading took: 0.07565593719482422
Forward took: 0.02480483055114746
Data loading took: 0.1486215591430664
Forward took: 2.0996038913726807
acc: 35.28%


## 2.3 Quantization

In [15]:
opt_model.eval()
quantized_model = torch.quantization.quantize_dynamic(
    opt_model, {torch.nn.Linear, torch.nn.Conv2d}, dtype=torch.qint8
)
quantized_model_traced = torch.jit.trace(quantized_model, sample_data)
quantized_model_traced = torch.jit.freeze(quantized_model_traced)

In [16]:
warmup_iters = 5
for i in range(warmup_iters):
    eval(quantized_model_traced, iter(data_test_opt), False)

acc = eval(quantized_model_traced, iter(data_test_opt), False)
print(f"acc: {acc}%")

NotImplementedError: Could not run 'quantized::linear_dynamic' with arguments from the 'CUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::linear_dynamic' is only available for these backends: [CPU, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PythonDispatcher].

CPU: registered at ../aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp:656 [kernel]
BackendSelect: fallthrough registered at ../aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:140 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:488 [backend fallback]
Functionalize: registered at ../aten/src/ATen/FunctionalizeFallbackKernel.cpp:291 [backend fallback]
Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at ../aten/src/ATen/ConjugateFallback.cpp:18 [backend fallback]
Negative: registered at ../aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at ../aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:64 [backend fallback]
AutogradOther: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:35 [backend fallback]
AutogradCPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:39 [backend fallback]
AutogradCUDA: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:47 [backend fallback]
AutogradXLA: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:51 [backend fallback]
AutogradMPS: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:59 [backend fallback]
AutogradXPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:43 [backend fallback]
AutogradHPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:68 [backend fallback]
AutogradLazy: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:55 [backend fallback]
Tracer: registered at ../torch/csrc/autograd/TraceTypeManual.cpp:296 [backend fallback]
AutocastCPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:482 [backend fallback]
AutocastCUDA: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:324 [backend fallback]
FuncTorchBatched: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:743 [backend fallback]
FuncTorchVmapMode: fallthrough registered at ../aten/src/ATen/functorch/VmapModeRegistrations.cpp:28 [backend fallback]
Batched: registered at ../aten/src/ATen/BatchingRegistrations.cpp:1064 [backend fallback]
VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at ../aten/src/ATen/functorch/TensorWrapper.cpp:189 [backend fallback]
PythonTLSSnapshot: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:148 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:484 [backend fallback]
PythonDispatcher: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:144 [backend fallback]
