In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import argparse
import time
import torch.backends.cudnn as cudnn

Model

In [2]:
## Basic block
class BasicBlock(nn.Module):
  expansion = 1

  def __init__(self, input_channels, out_channels, stride = 1):
    super(BasicBlock, self).__init__()
    self.conv1 = nn.Conv2d(input_channels, out_channels, kernel_size = 3, stride = stride, padding = 1, bias = False)
    self.bn1 = nn.BatchNorm2d(out_channels)
    self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1, bias = False)
    self.bn2 = nn.BatchNorm2d(out_channels)

    self.shortcut = nn.Sequential()
    # when stride != 1 or input_channels != out_channels, it means the width and height are different
    if stride != 1 or input_channels != self.expansion * out_channels:
      self.shortcut = nn.Sequential(
          nn.Conv2d(input_channels, self.expansion * out_channels, kernel_size = 1, stride = stride, bias = False),
          nn.BatchNorm2d(self.expansion * out_channels)
      )

  def forward(self, x):
    out = F.relu(self.bn1(self.conv1(x)))
    out = self.bn2(self.conv2(out))
    out += self.shortcut(x)
    out = F.relu(out)
    return out

## ResNet
class ResNet(nn.Module):
  def __init__(self, block, num_blocks, num_classes = 10):
    super(ResNet, self).__init__()
    self.input_channels = 64

    self.conv1 = nn.Conv2d(3, 64, kernel_size = 3, stride = 1, padding = 1, bias = False)
    self.bn1 = nn.BatchNorm2d(64)
    self.layer1 = self._make_layer(block, 64, num_blocks[0], stride = 1)
    self.layer2 = self._make_layer(block, 128, num_blocks[1], stride = 2)
    self.layer3 = self._make_layer(block, 256, num_blocks[2], stride = 2)
    self.layer4 = self._make_layer(block, 512, num_blocks[3], stride = 2)
    self.linear = nn.Linear(512 * block.expansion, num_classes)

  def _make_layer(self, block, out_channels, num_blocks, stride):
    strides = [stride] + [1] * (num_blocks - 1)
    layers = []
    for stride in strides:
      layers.append(block(self.input_channels, out_channels, stride))
      self.input_channels = out_channels * block.expansion
    return nn.Sequential(*layers)

  def forward(self, x):
    out = F.relu(self.bn1(self.conv1(x)))
    out = self.layer1(out)
    out = self.layer2(out)
    out = self.layer3(out)
    out = self.layer4(out)
    out = F.avg_pool2d(out, 4)
    out = out.view(out.size(0), -1)
    out = self.linear(out)
    return out

def ResNet18():
  return ResNet(BasicBlock, [2,2,2,2])

In [3]:
net = ResNet18()

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [5]:
net = net.to(device)

In [6]:
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

In [7]:
loss_fn = nn.CrossEntropyLoss()

Data

In [8]:
trainsform_train = transforms.Compose([
    transforms.RandomCrop(32, padding = 4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

trainsform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

train_set = torchvision.datasets.CIFAR10(root = './data', train=True, download=True, transform=trainsform_train)
test_set = torchvision.datasets.CIFAR10(root = './data', train=False, download=True, transform=trainsform_test)


classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


## Question 1

Measure how long does it take to compete 1 epoch training using different batch size on single GPU. Start from batch size 32, increase by 4-fold for each measurement (i.e., 32, 128, 512 ...) until single GPU memory cannot hold the batch size. For each run, run 2 epochs, the first epoch is used to warmup CPU/GPU cache; and you should report the training time ( excluding data I/O; but including data movement from CPU to GPU, gradients calculation and weights update) based on the 2nd epoch training. 

In [14]:
def train(epoch, batch_size, train_loss_history, train_acc_history):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    total_comm_time = 0
    total_compute_time = 0
    train_loader = torch.utils.data.DataLoader(train_set, batch_size = batch_size,shuffle = True, num_workers = 2)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size = batch_size, shuffle = True, num_workers = 2)

    total_train_time_start = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # t2 start
        comm_time_start = time.time()

        inputs, targets = inputs.to(device), targets.to(device)
        
        comm_time_end = time.time()
        total_comm_time += comm_time_end - comm_time_start
        
        
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        
        compute_time_end = time.time()
        total_compute_time += compute_time_end - comm_time_start
       
        train_loss += loss.item()
        train_loss_history.append(loss.item())
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        train_acc_history.append(100. * correct / total)
        '''
        print("\nThe batch index: {0:d}, len of train loader: {1:d}, Loss: {2:.3f}, acc: {3:.3f}".format(batch_idx,
                                                                                             len(train_loader),
                                                                                             train_loss / (batch_idx + 1),
                                                                                             100. * correct / total))
        '''
    
    total_train_time_end = time.time()   
    total_train_time = total_train_time_end - total_train_time_start
    print("{0:d} epoch, total_train_time = {1:.3f}, compute_time = {2:.3f}, comm_time = {3:.3f}".format(epoch, total_train_time, total_compute_time, total_comm_time))
    
    

In [16]:
batch = [32]#, 128, 512, 2048]
for batch_size in batch:
  print("The batch size is  %d" % batch_size)
  epoch = 2
  train_loss_history = []
  train_acc_history = []
  net = ResNet18()
  net = net.to(device)
  if device == 'cuda':
      net = torch.nn.DataParallel(net)
      cudnn.benchmark = True
  optimizer = optim.Adam(net.parameters(), lr=0.01, betas = (0.9, 0.99), weight_decay=5e-4)
  for i in range(epoch):
    train(i, batch_size, train_loss_history, train_acc_history)
   

The batch size is  32

Epoch: 0
0 epoch, total_train_time = 130.140, compute_time = 66.367, comm_time = 0.751

Epoch: 1


KeyboardInterrupt: ignored