<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/DenseNet_121_CIFAR10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import time
from collections import OrderedDict

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Subset
import torch.utils.checkpoint as cp

from torchvision import datasets
from torchvision import transforms

# When a CUDA device is present, restrict cuDNN to deterministic algorithms
# so repeated runs are more reproducible.
if torch.cuda.is_available():
  torch.backends.cudnn.deterministic = True

In [2]:
# ------------------------------------------------------------------
# Experiment configuration
# ------------------------------------------------------------------

# Optimisation hyperparameters
RANDOM_SEED = 1          # seed applied right before the model is built
LEARNING_RATE = 0.001    # step size handed to the optimizer
BATCH_SIZE = 128         # mini-batch size shared by all data loaders
NUM_EPOCHS = 20          # number of passes over the training split

# Architecture: size of the classifier output
NUM_CLASSES = 10

# Runtime: first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Forwarded to DenseNet121; False selects a 3-channel input stem
grayscale = False

In [3]:
# ------------------------------------------------------------------
# CIFAR-10: datasets and data loaders
# ------------------------------------------------------------------

# Positional split of the training archive:
# indices 0..47999 are used for training, 48000..49999 for validation.
train_indices = torch.arange(0, 48000)
valid_indices = torch.arange(48000, 50000)

# The training archive is downloaded on demand; the test split reuses the
# same root directory and therefore does not trigger a download itself.
train_and_valid = datasets.CIFAR10(
    root='data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.CIFAR10(
    root="data", train=False, transform=transforms.ToTensor(), download=False)

train_dataset = Subset(train_and_valid, train_indices)
valid_dataset = Subset(train_and_valid, valid_indices)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

Files already downloaded and verified


In [4]:
# Smoke-test the training loader: take the first batch of two consecutive
# passes, report its size, and confirm it can be moved to the target device.
torch.manual_seed(0)

for epoch in range(2):
  for batch_idx, (features, label) in enumerate(train_loader):
    print("Epoch: %d | Batch: %d | Batch Size: %d" % (epoch + 1, batch_idx, label.shape[0]))

    features, label = features.to(device), label.to(device)
    # Only the first batch of each pass is inspected
    break

Epoch: 1 | Batch: 0 | Batch Size: 128
Epoch: 2 | Batch: 0 | Batch Size: 128


In [5]:
# Shuffling check: exhaust the training loader twice and print the first ten
# labels of the final batch after each pass. Because the loader shuffles,
# the two printed tensors are expected to differ.
for _pass in range(2):
  for images, labels in train_loader:
    pass
  print(labels[:10])

tensor([3, 0, 1, 3, 3, 5, 0, 4, 9, 4])
tensor([1, 0, 4, 1, 8, 2, 0, 3, 5, 3])


In [6]:
# Spot check of the held-out loaders: exhaust each one and print the first ten
# labels of its last batch. Seeing a mix of classes here is a quick visual
# hint of diversity; it does not verify that every class is present.
for loader in (valid_loader, test_loader):
  for images, labels in loader:
    pass
  print(labels[:10])

tensor([5, 0, 3, 6, 8, 7, 9, 5, 6, 6])
tensor([7, 5, 8, 0, 8, 2, 7, 0, 3, 5])


In [7]:
#######################
# Model
######################

def _bn_function_factory(norm, relu, conv):
  def bn_function(*inputs):
    # TODO Add dimension changes
    concated_features = torch.cat(inputs, 1)
    bottleneck_output = conv(relu(norm(concated_features)))
    return bottleneck_output
  return bn_function

class _DenseLayer(nn.Sequential):
  def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient=False):
    super(_DenseLayer, self).__init__()
    self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
    self.add_module('relu1', nn.ReLU(inplace=True)),
    self.add_module('conv1', nn.Conv2d(in_channels=num_input_features,
                                       out_channels=bn_size * growth_rate,
                                       kernel_size=1,
                                       stride=1,
                                       bias=False)),
    self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
    self.add_module('relu2', nn.ReLU(inplace=True)),
    self.add_module('conv2', nn.Conv2d(in_channels=bn_size * growth_rate,
                                       out_channels=growth_rate,
                                       kernel_size=3,
                                       stride=1,
                                       padding=1,
                                       bias=False)),
    self.drop_rate = drop_rate
    self.memory_efficient = memory_efficient
  
  def forward(self, *prev_features):
    # TODO Add dimensions
    bn_function = _bn_function_factory(self.norm1, self.relu1, self.conv1)
    if self.memory_efficient and any(prev_feature.requires_grad for prev_feature in prev_efatures):
      bottleneck_output = cp.checkpoint(bn_function, *prev_features)
    else:
      bottleneck_output = bn_function(*prev_features)
    new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))
    if self.drop_rate > 0:
      new_features = F.dropout(new_features, p=self.drop_rate,
                               training=self.training)
    return new_features

In [8]:
class _DenseBlock(nn.Module):
  def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, memory_efficient=False):
    super(_DenseBlock, self).__init__()
    # num_layers = 6 (i = 0)
    # num_layers = 12 (i = 1)
    # num_layers = 24 (i = 2)
    # num_layers = 16 (i = 3)
    for i in range(num_layers):
      layer = _DenseLayer(
          num_input_features + i * growth_rate,
          growth_rate=growth_rate,
          bn_size=bn_size,
          drop_rate=drop_rate,
          memory_efficient=memory_efficient
      )
      self.add_module('denselayer%d' % (i + 1), layer)
  
  def forward(self, init_features):
    # TODO Add dimension changes
    features = [init_features]
    for name, layer in self.named_children():
      new_features = layer(*features)
      features.append(new_features)
    return torch.cat(features, 1)

In [9]:
class _Transition(nn.Sequential):
  def __init__(self, num_input_features, num_output_features):
    super(_Transition, self).__init__()
    self.add_module('norm', nn.BatchNorm2d(num_input_features))
    self.add_module('relu', nn.ReLU(inplace=True))
    self.add_module('conv', nn.Conv2d(in_channels=num_input_features,
                                      out_channels=num_output_features,
                                      kernel_size=1,
                                      stride=1,
                                      bias=False))
    self.add_module('pool', nn.AvgPool2d(kernel_size=2,
                                         stride=2))

In [10]:
class DenseNet121(nn.Module):
  """DenseNet classifier assembled from dense blocks and transitions.

  Args:
    growth_rate: channels added by every dense layer.
    block_config: number of dense layers in each dense block.
    num_init_featuremaps: output channels of the initial convolution.
    bn_size: bottleneck multiplier forwarded to the dense blocks.
    drop_rate: dropout probability forwarded to the dense blocks.
    num_classes: size of the classifier output.
    memory_efficient: forwarded to the dense blocks.
    grayscale: if True the stem expects 1 input channel, otherwise 3.
  """

  def __init__(self,
               growth_rate=32,
               block_config=(6, 12, 24, 16),
               num_init_featuremaps=64,
               bn_size=4,
               drop_rate=0,
               num_classes=1000,
               memory_efficient=False,
               grayscale=False,
               ):
    super().__init__()

    in_channels = 1 if grayscale else 3

    # Stem: 7x7 stride-2 convolution, batch norm, ReLU, 3x3 stride-2 max pool
    stem = OrderedDict()
    stem['conv0'] = nn.Conv2d(in_channels=in_channels,
                              out_channels=num_init_featuremaps,
                              kernel_size=7,
                              stride=2,
                              padding=3,
                              bias=False)
    stem['norm0'] = nn.BatchNorm2d(num_features=num_init_featuremaps)
    stem['relu0'] = nn.ReLU(inplace=True)
    stem['pool0'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.features = nn.Sequential(stem)

    # Dense blocks; a transition halving the channel count follows every
    # block except the last one.
    channels = num_init_featuremaps
    last_stage = len(block_config) - 1
    for stage, depth in enumerate(block_config):
      self.features.add_module(
          'denseblock%d' % (stage + 1),
          _DenseBlock(
              num_layers=depth,
              num_input_features=channels,
              bn_size=bn_size,
              growth_rate=growth_rate,
              drop_rate=drop_rate,
              memory_efficient=memory_efficient,
          ),
      )
      channels = channels + depth * growth_rate
      if stage == last_stage:
        continue
      halved = channels // 2
      self.features.add_module(
          'transition%d' % (stage + 1),
          _Transition(num_input_features=channels,
                      num_output_features=halved),
      )
      channels = halved

    # Closing batch norm and the linear classifier
    self.features.add_module('norm5', nn.BatchNorm2d(channels))
    self.classifier = nn.Linear(channels, num_classes)

    # Parameter initialisation: Kaiming-normal conv weights, batch norm set to
    # scale 1 / shift 0, zero bias for linear layers.
    for module in self.modules():
      if isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(module.weight)
      elif isinstance(module, nn.BatchNorm2d):
        nn.init.constant_(module.weight, 1)
        nn.init.constant_(module.bias, 0)
      elif isinstance(module, nn.Linear):
        nn.init.constant_(module.bias, 0)

  def forward(self, x):
    """Return ``(logits, probas)`` for a batch of images.

    The feature extractor output goes through ReLU and global average
    pooling to 1x1, is flattened and fed to the classifier. ``probas`` is the
    softmax of ``logits`` over dim 1.
    """
    feature_maps = self.features(x)
    activated = F.relu(feature_maps, inplace=True)
    pooled = F.adaptive_avg_pool2d(activated, (1, 1))
    logits = self.classifier(torch.flatten(pooled, 1))
    return logits, F.softmax(logits, dim=1)

In [11]:
# Seed right before construction so the initial weights are reproducible,
# then place the network on the selected device.
torch.manual_seed(RANDOM_SEED)
model = DenseNet121(num_classes=NUM_CLASSES, grayscale=grayscale).to(device)

# Adam over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [12]:
# Training
def compute_accuracy(model, data_loader, device):
  """Return the classification accuracy of ``model`` in percent.

  The model is switched to eval mode (and left there) and evaluated with
  autograd disabled, so no graph is built even if the caller forgot to
  disable gradients.

  Args:
    model: callable module returning a ``(logits, probas)`` pair per batch.
    data_loader: iterable yielding ``(features, targets)`` batches.
    device: device the batches are moved to before the forward pass.

  Returns:
    A tensor holding ``correct / total * 100``.
  """
  correct_pred, num_examples = 0., 0
  model.eval()
  with torch.no_grad():
    for features, targets in data_loader:
      features = features.to(device)
      targets = targets.to(device)

      _, probas = model(features)
      # Predicted class = index of the largest probability along dim 1
      _, predicted_labels = torch.max(probas, 1)
      num_examples += targets.size(0)
      assert predicted_labels.size() == targets.size()

      correct_pred += (predicted_labels == targets).sum()
  return correct_pred / num_examples * 100

In [13]:
# Train for NUM_EPOCHS epochs; after every epoch report the accuracy on the
# training and validation splits.
start_time = time.time()

for epoch in range(NUM_EPOCHS):
  model.train()

  for batch_idx, (features, targets) in enumerate(train_loader):
    features, targets = features.to(device), targets.to(device)

    # Clear stale gradients, then forward pass and loss
    optimizer.zero_grad()
    logits, probas = model(features)
    cost = F.cross_entropy(logits, targets)

    # Backpropagate and update the parameters
    cost.backward()
    optimizer.step()

    # Progress line every 150 mini-batches
    if batch_idx % 150 == 0:
      print("Batch: %04d/%04d || Epoch: %04d/%04d || Cost: %.3f" % (batch_idx, len(train_loader), epoch+1, NUM_EPOCHS, cost.item()))

  # End-of-epoch evaluation without gradient tracking
  model.eval()
  with torch.no_grad():
    train_acc = compute_accuracy(model, train_loader, device)
    valid_acc = compute_accuracy(model, valid_loader, device)

    print("Train Accuracy: %.2f" % (train_acc))
    print("Valid Accuracy: %.2f" % (valid_acc))
  # Measured from start_time, so the value is cumulative across epochs
  elapsed_time = (time.time() - start_time) / 60
  print("Epoch Elapsed Time: ", elapsed_time, " mins")

elapsed_time = (time.time() - start_time) / 60
print("Total Training Time: ", elapsed_time, " mins")

Batch: 0000/0375 || Epoch: 0001/0020 || Cost: 2.386
Batch: 0150/0375 || Epoch: 0001/0020 || Cost: 1.471
Batch: 0300/0375 || Epoch: 0001/0020 || Cost: 1.316
Train Accuracy: 49.79
Valid Accuracy: 49.45
Epoch Elapsed Time:  0.944011918703715  mins
Batch: 0000/0375 || Epoch: 0002/0020 || Cost: 1.045
Batch: 0150/0375 || Epoch: 0002/0020 || Cost: 1.016
Batch: 0300/0375 || Epoch: 0002/0020 || Cost: 0.720
Train Accuracy: 62.37
Valid Accuracy: 58.95
Epoch Elapsed Time:  1.890422777334849  mins
Batch: 0000/0375 || Epoch: 0003/0020 || Cost: 0.809
Batch: 0150/0375 || Epoch: 0003/0020 || Cost: 0.685
Batch: 0300/0375 || Epoch: 0003/0020 || Cost: 0.971
Train Accuracy: 70.76
Valid Accuracy: 66.35
Epoch Elapsed Time:  2.8344410101572675  mins
Batch: 0000/0375 || Epoch: 0004/0020 || Cost: 0.619
Batch: 0150/0375 || Epoch: 0004/0020 || Cost: 0.778
Batch: 0300/0375 || Epoch: 0004/0020 || Cost: 0.461
Train Accuracy: 73.00
Valid Accuracy: 67.55
Epoch Elapsed Time:  3.772270361582438  mins
Batch: 0000/0375 ||

In [17]:
# Final evaluation on the held-out test split
model.eval()
with torch.no_grad():
  test_acc = compute_accuracy(model, test_loader, device)
print("Test Accuracy: %.2f" % (test_acc.item()))

Test Accuracy: 74.78
