<a href="https://colab.research.google.com/github/yeb2Binfang/ECE-GY9143HPML/blob/main/Lab/Lab2/lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load dataset

We will use CIFAR10, which contains 50K 32 x 32 color images

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import argparse
import time
%matplotlib inline

In [2]:
trainsform_train = transforms.Compose([
    transforms.RandomCrop(32, padding = 4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

In [3]:
trainsform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

In [4]:
train_set = torchvision.datasets.CIFAR10(root = './data', train=True, download=True, transform=trainsform_train)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data


In [5]:
test_set = torchvision.datasets.CIFAR10(root = './data', train=False, download=True, transform=trainsform_test)

Files already downloaded and verified


In [6]:
batch_size = 128
train_loader = torch.utils.data.DataLoader(train_set, batch_size = batch_size,shuffle = True, num_workers = 2)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = batch_size, shuffle = True, num_workers = 2)

In [7]:
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Build model

Create ResNet18.
Specifically, The first convolutional layer should have 3 input channels, 64 output channels, 3x3 kernel, with stride=1 and padding=1. Followed by 8 basic blocks in 4 subgroups (i.e. 2 basic blocks in each subgroup):
1. The first sub-group contains a convolutional layer with 64 output channels, 3x3 kernel, stride=1, padding=1.
2. The second sub-group contains a convolutional layer with 128 output channels, 3x3 kernel, stride=2, padding=1.
3. The third sub-group contains a convolutional layer with 256 output channels, 3x3 kernel, stride=2, padding=1.
4. The fourth sub-group contains a convolutional layer with 512 output channels, 3x3 kernel, stride=2, padding=1.
5. The final linear layer is of 10 output classes. For all convolutional layers, use RELU activation functions, and use batch normal layers to avoid covariant shift. Since batch-norm layers regularize the training, set the bias to 0 for all the convolutional layers. Use SGD optimizers with 0.1 as the learning rate, momentum 0.9, weight decay 5e-4. The loss function is cross-entropy.

For all convolutional layers, use RELU activation functions, and use batch normal layers to avoid covariant shift. Since batch-norm layers regularize the training, set the bias to 0 for all the convolutional layers. 


In [8]:
class BasicBlock(nn.Module):
  expansion = 1
  
  def __init__(self, input_channels, out_channels, stride = 1):
    super(BasicBlock, self).__init__()
    self.conv1 = nn.Conv2d(input_channels, out_channels, kernel_size = 3, stride = stride, padding = 1, bias = False)
    self.bn1 = nn.BatchNorm2d(out_channels)
    self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1, bias = False)
    self.bn2 = nn.BatchNorm2d(out_channels)

    self.shortcut = nn.Sequential()
    # when stride != 1 or input_channels != out_channels, it means the width and height are different
    if stride != 1 or input_channels != self.expansion * out_channels:
      self.shortcut = nn.Sequential(
          nn.Conv2d(input_channels, self.expansion * out_channels, kernel_size = 1, stride = stride, bias = False),
          nn.BatchNorm2d(self.expansion * out_channels)
      )
    
  def forward(self, x):
    out = F.relu(self.bn1(self.conv1(x)))
    out = self.bn2(self.conv2(out))
    out += self.shortcut(x)
    out = F.relu(out)
    return out    

In [9]:
class ResNet(nn.Module):
  def __init__(self, block, num_blocks, num_classes = 10):
    super(ResNet, self).__init__()
    self.input_channels = 64
    
    self.conv1 = nn.Conv2d(3, 64, kernel_size = 3, stride = 1, padding = 1, bias = False)
    self.bn1 = nn.BatchNorm2d(64)
    self.layer1 = self._make_layer(block, 64, num_blocks[0], stride = 1)
    self.layer2 = self._make_layer(block, 128, num_blocks[1], stride = 2)
    self.layer3 = self._make_layer(block, 256, num_blocks[2], stride = 2)
    self.layer4 = self._make_layer(block, 512, num_blocks[3], stride = 2)
    self.linear = nn.Linear(512 * block.expansion, num_classes)

  def _make_layer(self, block, out_channels, num_blocks, stride):
    strides = [stride] + [1] * (num_blocks - 1)
    layers = []
    for stride in strides:
      layers.append(block(self.input_channels, out_channels, stride))
      self.input_channels = out_channels * block.expansion
    return nn.Sequential(*layers)

  def forward(self, x):
    out = F.relu(self.bn1(self.conv1(x)))
    out = self.layer1(out)
    out = self.layer2(out)
    out = self.layer3(out)
    out = self.layer4(out)
    out = F.avg_pool2d(out, 4)
    out = out.view(out.size(0), -1)
    out = self.linear(out)
    return out



In [10]:
def ResNet18():
  return ResNet(BasicBlock, [2,2,2,2])

In [11]:
net = ResNet18()

In [None]:
num_params = sum(param.numel() for param in net.parameters())
num_params1 = sum(param.numel() for param in net.parameters() if param.requires_grad)
print(num_params)
print(num_params1)

11173962
11173962


In [None]:
from torchsummary import summary
summary(net, (3,32,32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,728
       BatchNorm2d-2           [-1, 64, 32, 32]             128
            Conv2d-3           [-1, 64, 32, 32]          36,864
       BatchNorm2d-4           [-1, 64, 32, 32]             128
            Conv2d-5           [-1, 64, 32, 32]          36,864
       BatchNorm2d-6           [-1, 64, 32, 32]             128
        BasicBlock-7           [-1, 64, 32, 32]               0
            Conv2d-8           [-1, 64, 32, 32]          36,864
       BatchNorm2d-9           [-1, 64, 32, 32]             128
           Conv2d-10           [-1, 64, 32, 32]          36,864
      BatchNorm2d-11           [-1, 64, 32, 32]             128
       BasicBlock-12           [-1, 64, 32, 32]               0
           Conv2d-13          [-1, 128, 16, 16]          73,728
      BatchNorm2d-14          [-1, 128,

In [None]:
print(net)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=

# C1 Train in Pytorch

Create a main function that creates the DataLoaders for the training set and the neural network, then runs 5 epochs with a complete training phase on all the mini-batches of the training set. Write the code as device-agnostic, use the ArgumentParser to be able to read parameters from input, such as the use of Cuda, the data_path, the number of data loader workers, and the optimizer (as string, eg: ‘sgd’).

For each minibatch calculate the training loss value, the top-1 training accuracy of the predictions, measured on training data.



In [None]:
# parse = argparse.ArgumentParser(description='ResNet training CIFAR10')
# args = parse.parse_args()

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [13]:
net = net.to(device)

In [14]:
lr = 1e-1
weight_decay = 5e-4
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9,weight_decay=weight_decay)
#optim.Adam()

## Train the model

In [15]:
epoch = 1
def train(epoch, train_loss_history, train_acc_history, data_loading_time, mini_training_time_total_epoch):
  print('\nEpoch: %d' % epoch)
  net.train()
  train_loss = 0 
  correct = 0
  total = 0
  data_loading_time_total = 0
  mini_training_time = []
  for batch_idx, (inputs, targets) in enumerate(train_loader):
    data_loading_time_start = time.time()
    inputs, targets = inputs.to(device), targets.to(device)
    data_loading_time_end = time.time()
    data_loading_time_total += (data_loading_time_end - data_loading_time_start)

    mini_training_time_start = time.time()
    optimizer.zero_grad()
    outputs = net(inputs)
    loss = loss_fn(outputs, targets)
    loss.backward()
    optimizer.step()
    mini_training_time_end = time.time()
    mini_training_time.append(mini_training_time_end - mini_training_time_start)

    train_loss += loss.item()
    train_loss_history.append(loss.item())
    _, predicted = outputs.max(1)
    total += targets.size(0)
    correct += predicted.eq(targets).sum().item()
    train_acc_history.append(100. * correct / total)
    print("\nThe batch index: {0:d}, len of train loader: {1:d}, Loss: {2:.3f}, acc: {3:.3f}".format(batch_idx,
                                                                                             len(train_loader), 
                                                                                             train_loss / (batch_idx + 1),
                                                                                             100. * correct / total)
          )
  data_loading_time.append(data_loading_time_total)
  mini_training_time_total_epoch.append(mini_training_time)


## Test the model

In [None]:

def test(epoch, test_loss_history, test_acc_history):
  global best_acc
  net.eval()
  test_loss = 0
  correct = 0
  total = 0
  
  with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(test_loader):     
      inputs, targets = inputs.to(device), targets.to(device)

      outputs = net(inputs)
      loss = loss_fn(outputs, targets)

      test_loss += loss.item()
      test_loss_history.append(loss.item())
      _, predicted = outputs.max(1)
      total += targets.size(0)
      correct += predicted.eq(targets).sum().item()
      test_acc_history.append(100. * correct / total)
      print("\nThe batch index: {0}, len of test loader: {1}, Loss: {2:.3f}, acc: {3:.3f}".format(batch_idx,
                                                                                             len(test_loader), 
                                                                                             test_loss / (batch_idx + 1),
                                                                                             100. * correct / total)
          )

In [16]:
test_loss_history = []
test_acc_history = []
train_loss_history = []
train_acc_history = []
total_train_time_epoch = []
data_loading_time = []
mini_training_time_total_epoch = []
for epo in range(epoch):
  train_time_start = time.time()
  train(epo, train_loss_history, train_acc_history, data_loading_time, mini_training_time_total_epoch)
  train_time_end = time.time()
  total_train_time_epoch.append(train_time_end - train_time_start)
  #test(epo, test_loss_history, test_acc_history)


Epoch: 0

The batch index: 0, len of train loader: 391, Loss: 2.427, acc: 12.500

The batch index: 1, len of train loader: 391, Loss: 2.955, acc: 12.891

The batch index: 2, len of train loader: 391, Loss: 3.376, acc: 10.417

The batch index: 3, len of train loader: 391, Loss: 4.181, acc: 11.523

The batch index: 4, len of train loader: 391, Loss: 4.459, acc: 11.719

The batch index: 5, len of train loader: 391, Loss: 4.438, acc: 11.589

The batch index: 6, len of train loader: 391, Loss: 4.319, acc: 11.161

The batch index: 7, len of train loader: 391, Loss: 4.150, acc: 11.816

The batch index: 8, len of train loader: 391, Loss: 4.016, acc: 11.806

The batch index: 9, len of train loader: 391, Loss: 4.041, acc: 11.875

The batch index: 10, len of train loader: 391, Loss: 3.944, acc: 11.790

The batch index: 11, len of train loader: 391, Loss: 3.959, acc: 11.654

The batch index: 12, len of train loader: 391, Loss: 3.922, acc: 11.538

The batch index: 13, len of train loader: 391, Los

In [None]:
print(train_loss_history)
print(train_acc_history)

[2.4798059463500977, 3.0533688068389893, 4.024833679199219, 5.3804097175598145, 5.666123390197754, 5.001592636108398, 2.787261724472046, 3.6739261150360107, 3.472196340560913, 2.9860050678253174, 3.952331781387329, 2.648815631866455, 2.863147497177124, 2.385802984237671, 2.390947103500366, 2.3660616874694824, 3.3050408363342285, 2.376126527786255, 2.4748082160949707, 2.122134208679199, 2.0512964725494385, 2.3324739933013916, 2.1150999069213867, 2.903667449951172, 2.217615842819214, 3.0273563861846924, 2.859140396118164, 2.6627681255340576, 2.840726137161255, 2.283576726913452, 2.308002471923828, 2.8420562744140625, 2.3007850646972656, 2.332951307296753, 2.279984474182129, 2.081688642501831, 1.9397892951965332, 2.002274751663208, 2.0432610511779785, 2.0615530014038086, 2.3094372749328613, 2.429448127746582, 2.491537094116211, 2.0833311080932617, 2.0744853019714355, 2.0577776432037354, 2.0882606506347656, 2.1125378608703613, 1.956144094467163, 1.9901010990142822, 2.0014491081237793, 2.01

# C2 Time measurement of code in C1

Report the running time (by using time.perf_counter() or other timers you find comfortable with) for the following sections of the code:
1. Data-loading time for each epoch
2. Training (i.e., mini-batch calculation) time for each epoch
3. Total running time for each epoch Run 5 epochs.


In [17]:
print(data_loading_time)
print(mini_training_time_total_epoch)
print(total_train_time_epoch)

[0.025910615921020508]
[[9.8288254737854, 9.229211568832397, 9.083222389221191, 9.254616260528564, 9.248799324035645, 8.93960690498352, 9.019183874130249, 8.874441146850586, 8.796023845672607, 8.84339690208435, 8.808263778686523, 8.769039630889893, 8.77456283569336, 8.892950296401978, 9.018012285232544, 9.10795783996582, 9.037927150726318, 8.96465516090393, 8.989676713943481, 8.940425872802734, 8.975197315216064, 9.015435695648193, 9.097066879272461, 9.020205020904541, 9.049197673797607, 9.114433526992798, 9.027028322219849, 8.920456409454346, 8.930718183517456, 8.92421269416809, 9.002710819244385, 8.89394760131836, 9.220908880233765, 9.317677974700928, 9.228999853134155, 9.336296796798706, 9.500483751296997, 9.271661043167114, 9.18738865852356, 9.037784337997437, 8.843547105789185, 8.799776792526245, 8.852999210357666, 8.889435052871704, 8.899832248687744, 9.089813470840454, 9.047771215438843, 8.829323053359985, 9.00752568244934, 8.918273210525513, 8.952221632003784, 10.05688166618347

# C3: I/O optimizaiton starting from code in C2
1. Report the total time spent for the Dataloader varying the number of workers starting from zero and increment the number of workers by 4 (0,4,8,12,16...) until the I/O time doesn’t decrease anymore.
2. Report how many workers are needed for best runtime performance.

In [None]:

def num_worker_time(num_worker_data_loading_time):
  data_loading_time_total = 0
  for batch_idx, (inputs, targets) in enumerate(train_loader):
    data_loading_time_start = time.time()
    inputs, targets = inputs.to(device), targets.to(device)
    data_loading_time_end = time.time()
    data_loading_time_total += (data_loading_time_end - data_loading_time_start)

  num_worker_data_loading_time.append(data_loading_time_total)

In [None]:
num_workers = [0,4,8,12,16]
batch_size = 128
num_worker_data_loading_time = []
# train_loss_history_c3 = []
# train_acc_history_c3 = []
for i in num_workers:
  print("new net: {}".format(i))
  train_loader = torch.utils.data.DataLoader(train_set, batch_size = batch_size,shuffle = True, num_workers = i)
  num_worker_time(num_worker_data_loading_time)
  # train(epo, train_loss_history_c3, train_acc_history_c3)


#test_loader = torch.utils.data.DataLoader(test_set, batch_size = batch_size, shuffle = True, num_workers = 2)
#print(len(train_loader))

new net: 0
new net: 4


  cpuset_checked))


new net: 8


  cpuset_checked))


new net: 12


  cpuset_checked))


new net: 16


  cpuset_checked))


In [None]:
print(num_worker_data_loading_time)

[0.18175649642944336, 0.28144383430480957, 0.3166325092315674, 0.38622260093688965, 0.4294719696044922]


# C4: Profiling starting from code in C3

Compare data-loading and computing time for runs using 1 worker and the number of workers needed for best performance found in C3 and explain (in a few words) the differences if there are any.

https://deeplizard.com/learn/video/kWVgvsejXsE

# C5: Training in GPUs V.S. CPUs

Report the average running time over 5 epochs using the GPU vs using the CPU (using the number of I/O workers found in C3.2)

From C3.2, we got the num_worker is 2

When I run on GPU, for eapoch, the running time is [15.246890783309937, 15.200483560562134, 15.430280685424805, 15.49782395362854, 15.517560005187988], so the average is 15.378607797622681

# C6: Experimenting with different optimizers

Run 5 epochs with the GPU-enabled code and the optimal number of I/O workers. For each epoch, report the average training time, training loss, and top-1 training accuracy using these Optimizers: SGD, SGD with Nesterov, Adagrad, Adadelta, and Adam. Note please use the same default hyper-parameters: learning rate 0.1, weight decay 5e-4, and momentum 0.9 (when it applies) for all these optimizers.

For SGD, the average training time for each epoch is





  [15.246890783309937, 15.200483560562134, 15.430280685424805, 15.49782395362854, 15.517560005187988]. The average training loss for each epoch is [1.988328962069948, 1.5172575635983205, 1.2595373507960679, 1.0342020011314041, 0.8647981308915121]. The average top-1 accuracy for each epoch is [20.432928652711716, 41.66062737103774, 52.627654280262696, 61.37652780733207, 68.19055114819494]

For sgd_nes
[28.066128730773926, 27.760812282562256, 27.782509565353394, 27.850111961364746, 27.89001965522766]

21.97610816966551
2.034897204860092
42.179984230383624
1.4966033151387559
53.260814863356245
1.2289859423856906
62.75977421860316
1.0070519291836282
68.43943285593723
0.8746370840865327

For Adagrad
[27.82266616821289, 27.819855451583862, 27.813787937164307, 27.76784348487854, 27.80567979812622]

20.429481381731158
2.2077978568918564
36.11329664655515
1.654606541709217
45.41344120363851
1.401751711850276
55.15370209333665
1.1763134882273272
61.52070716787579
1.035776418493227

For Adadelta
[28.552342891693115, 28.53111982345581, 28.53568124771118, 28.60840344429016, 28.68533706665039]

38.71571214083069
1.368905180707917
67.03340301211422
0.8550770942817258
75.50370191827194
0.6677756459664201
80.14308876379626
0.5646420402447586
82.27434027954197
0.4976608041302322

For Adam

[15.157033205032349, 14.685014009475708, 14.631178617477417, 14.67825698852539, 14.682486057281494]

22.925146263828232
1.8820709124245607
44.936456226266635
1.387173262093683
55.79855950928615
1.1834112846333047
60.312539668888135
1.0820588461883234
62.93866078717115
1.0340378732632494


In [None]:
'''
|optimizers | epoch |         time         |      train loss     |   train top-1 acc    |
|   SGD     |   1   | 15.246890783309937   |  1.988328962069948  |  20.432928652711716  |
|   SGD     |   2   | 15.200483560562134   |  1.5172575635983205 |  41.66062737103774   |
|   SGD     |   3   | 15.430280685424805   |  1.2595373507960679 |  52.627654280262696  |
|   SGD     |   4   | 15.49782395362854    |  1.0342020011314041 |  61.37652780733207   |
|   SGD     |   5   | 15.517560005187988   |  0.8647981308915121 |  68.19055114819494   |
_________________________________________________________________________________________
| SGD_nes   |   1   | 28.066128730773926   |  2.034897204860092  |  21.97610816966551   |
| SGD_nes   |   2   | 27.760812282562256   |  1.4966033151387559 |  42.179984230383624  |
| SGD_nes   |   3   | 27.782509565353394   |  1.2289859423856906 |  53.260814863356245  |
| SGD_nes   |   4   | 27.850111961364746   |  1.0070519291836282 |  62.75977421860316   |
| SGD_nes   |   5   | 27.89001965522766    |  0.8746370840865327 |  68.43943285593723   |
_________________________________________________________________________________________
| Adagrad   |   1   | 15.246890783309937   |  1.988328962069948  |  20.432928652711716  |
| Adagrad   |   2   | 15.200483560562134   |  1.5172575635983205 |  41.66062737103774   |
| Adagrad   |   3   | 15.430280685424805   |  1.2595373507960679 |  52.627654280262696  |
| Adagrad   |   4   | 15.49782395362854    |  1.0342020011314041 |  61.37652780733207   |
| Adagrad   |   5   | 15.517560005187988   |  0.8647981308915121 |  68.19055114819494   |
_________________________________________________________________________________________
| Adadelta  |   1   | 15.246890783309937   |  1.988328962069948  |  20.432928652711716  |
| Adadelta  |   2   | 15.200483560562134   |  1.5172575635983205 |  41.66062737103774   |
| Adadelta  |   3   | 15.430280685424805   |  1.2595373507960679 |  52.627654280262696  |
| Adadelta  |   4   | 15.49782395362854    |  1.0342020011314041 |  61.37652780733207   |
| Adadelta  |   5   | 15.517560005187988   |  0.8647981308915121 |  68.19055114819494   |
_________________________________________________________________________________________
|   Adam    |   1   | 15.246890783309937   |  1.988328962069948  |  20.432928652711716  |
| Adadelta  |   2   | 15.200483560562134   |  1.5172575635983205 |  41.66062737103774   |
| Adadelta  |   3   | 15.430280685424805   |  1.2595373507960679 |  52.627654280262696  |
| Adadelta  |   4   | 15.49782395362854    |  1.0342020011314041 |  61.37652780733207   |
| Adadelta  |   5   | 15.517560005187988   |  0.8647981308915121 |  68.19055114819494   |
'''

# C7: Experimenting without Batch Norm layer
With the GPU-enabled code and the optimal number of workers, report the average training loss, top-1 training accuracy for 5 epochs with the default SGD optimizer and its hyper-parameters but without batch norm layers.

In [3]:
import numpy as np
a = [15.246890783309937, 15.200483560562134, 15.430280685424805, 15.49782395362854, 15.517560005187988]
print(np.mean(a))

15.378607797622681


In [8]:
a = [1,2,3,4,5,6]
for i in range(0, 6, 2):
  print(a[i:i+2])



[1, 2]
[3, 4]
[5, 6]
