In [1]:
import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import time
from typing import Type, Any, Callable, Union, List, Optional
from torch import Tensor 
if torch.cuda.is_available():
    print("Using GPUs")
    device = torch.device("cuda") 
else:
    device = torch.device("cpu")

Using GPUs


In [2]:
torch.manual_seed(43)
batch_size = 128

### for CIFAR 10
# stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
## for CIFAR 100
stats = ((0.507, 0.487, 0.441), (0.267, 0.256, 0.276))

transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(*stats),
    torchvision.transforms.RandomCrop(32, padding=4, padding_mode='constant'),
    torchvision.transforms.RandomHorizontalFlip(p=0.5)
])

train_set = torchvision.datasets.CIFAR100(root="data", train=True, download=True, transform=transform)
train_size = len(train_set)
test_set = torchvision.datasets.CIFAR100(root="data", train=False, download=True, transform=transform)
test_set, validation_set = torch.utils.data.random_split(test_set, [5000, 5000])
test_size = len(test_set)
validation_size = len(validation_set)


train_loader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size, num_workers=4, pin_memory=True)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size, num_workers=4, pin_memory=True)

data_loaders = {"train": train_loader, "test": test_loader, "validation": validation_loader}
dataset_sizes = {"train": train_size, "test": test_size, "validation": validation_size}
print(dataset_sizes)

Files already downloaded and verified
Files already downloaded and verified
{'train': 50000, 'test': 5000, 'validation': 5000}


In [28]:
# from https://pytorch.org/hub/pytorch_vision_resnet/
class BasicBlock(nn.Module):
    """Basic DSNet. Given input [in_channels, height, width], 
    - First pass through Conv2d(in_channels, in_channels) + BatchNorm + ReLU 
        -> Output dimensions: [in_channels, height, width] (1)
    - Then, add with the ("normalization and channel-wise weight")(input)
        -> Output dimensions: [in_channels, height width] (2)
    - Pass through another Conv2d(outchannels, outchannels) + BN + ReLU
        -> Output dimensions: [in_channels, height, width] (3)
    - Add again with ("normalized + channel-wise weight")(1) and ("normalized + channel-wise weight")(2)
        -> Output dimensions: [in_channels, height, width]
    Caveat: The normalization and channel-wise weight is not shared.
    Attributes:
        in_planes: # of Input channels
        n_models: Number of layers. Have to specify here as we need to connect all the layers
    """

    def __init__(self, inplanes, n_models, device=torch.device("cpu")):
        super().__init__()
        print("in block ", inplanes)
        self.layers = nn.ModuleList([])
        self.channel_wise_w_list = []  # Result is list of list of weights at each steps
        self.norm_layers = []
        for i in range(n_models):
            self.layers.append(nn.Sequential(
                nn.Conv2d(inplanes, inplanes, kernel_size=3, padding=1),
                nn.BatchNorm2d(inplanes),
                nn.ReLU(inplace=True),
                nn.Conv2d(inplanes, inplanes, kernel_size=3, padding=1),
                nn.BatchNorm2d(inplanes)
            ))

            self.norm_layers.append(
                [nn.GroupNorm(num_groups=4, num_channels=inplanes).to(device) for _ in range(i+1)]
            )


            # One variable for each channel for each time, [[w00], [w10, w11], [w20, w21, w22], ...]
            self.channel_wise_w_list.append(
                [torch.autograd.Variable(torch.randn(1, inplanes, 1, 1).to(device), requires_grad=True)
                 ##nn.Parameter(torch.)
                 for _ in range(i+1)]
            )

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: Tensor) -> Tensor:
        # Consisting of output of each layer.
        
        print("---- in forward ")
        print("len norm: ", len(self.norm_layers))
        print("len weight: ", len(self.channel_wise_w_list))
        print("len conv: ", len(self.layers))
        outputs = [x]
        for (layer, ch_ws, norm_layer) in zip(self.layers, self.channel_wise_w_list, self.norm_layers):
            output = layer(outputs[-1])

            assert len(outputs) == len(ch_ws), "Length not equal"
            dense_normalized_inputs = [norm(x) * ch_weight
                                       for output, ch_weight, norm in zip(outputs, ch_ws, norm_layer)]
            for dense_normalized_input in dense_normalized_inputs:
                output += dense_normalized_input

            output = self.relu(output)
            outputs.append(output)

        return outputs[-1]


class TransitionBlock(nn.Module):
    """A transition block to reduce channels of [input, w, h] to [outplanes, w//2, h//2]
    Attributes:
        inplanes: # of Input channels
        outplanes: # of Output channels
    """

    def __init__(self, inplanes, outplanes):
        super().__init__()

        self.conv1 = nn.Conv2d(inplanes, outplanes, stride=2, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(outplanes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(outplanes, outplanes, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(outplanes)

        self.block = nn.Sequential(
            self.conv1, self.bn1, self.relu, self.conv2, self.bn2)

        self.downsample = nn.Conv2d(
            inplanes, outplanes, kernel_size=1, stride=2)

    def forward(self, x: Tensor) -> Tensor:
        identity = x
        
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.block(x) + identity
        return self.relu(out)


class DSNet(nn.Module):
    """Defining the whole model. 
    In high level: 
        - Input -> [batch, 3, height, width]
        - Beginning Layer -> [batch, 3, height, width]
        - First Block: n*BasicBlock(16) -> [batch, 16, height, width]
        - Transition: TransitionBlock(16, 32) -> [batch, 32, height, width]
        - Second Block: n*BasicBlock(32) -> [batch, 32, height, width]
        - Transition: TransitionBlock(32, 64) -> [batch, 32, height, width]
        - Third Block: n*BasicBlock(64) -> [batch, 64, height, width]
        - FinalLayer: AdaptiveAvgPool2d + Linear(64, num_classes)
    Attributes:
        model_n: # of layers, based on CIFAR-ResNet 
        num_classes: Number of classes
        device: needed for GPU vs CPU.
    """

    def __init__(self, model_n, num_classes: int = 10, device=torch.device("cpu")):
        super().__init__()

        self.residual_layers = nn.ModuleList([])
        self.model_n = model_n
        self.device = device

        # begining layers
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)

        # ResNet blocks [16, 32, 64]
        # first block, 16 channels
        self.residual_layers.append(BasicBlock(
            16, self.model_n, device).to(device))
        self.residual_layers.append(TransitionBlock(16, 32).to(device))

        # second block, 32 channels
        self.residual_layers.append(BasicBlock(
            32, self.model_n, device).to(device))
        self.residual_layers.append(TransitionBlock(32, 64).to(device))

        # third block, 64 channels
        self.residual_layers.append(BasicBlock(
            64, self.model_n, device).to(device))
        self.residual_layers.append(TransitionBlock(64, 64).to(device))

        # output layers
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x: Tensor) -> Tensor:
        # begining layers
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        # DSNet blocks
        for i, layer in enumerate(self.residual_layers):
            x = layer(x)

        # output layers
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

In [29]:
#### Train Configurations, based on DSNet and ResNet paper
model_n = 3
milestones = [90, 135]
momentum = 0.9
weight_decay = 0.0005
gamma = 0.1
lr = 0.1
epochs = 100 ### should be 180

model = DSNet(model_n, num_classes=100, device=device)
model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)


in block  16
 i  0
len norm:  1
len weight:  1
 i  1
len norm:  2
len weight:  2
 i  2
len norm:  3
len weight:  3
in block  32
 i  0
len norm:  1
len weight:  1
 i  1
len norm:  2
len weight:  2
 i  2
len norm:  3
len weight:  3
in block  64
 i  0
len norm:  1
len weight:  1
 i  1
len norm:  2
len weight:  2
 i  2
len norm:  3
len weight:  3


In [30]:
### Train loop + validation/ also test at the end
print("Configuration: ", "model:DSNet(small)", " model_n:", model_n, " batch size:", batch_size, 
      " optimizer:SGD", " lr:", lr, " epochs:", epochs)

print("----------------------------- Train --------------------------------")
for epoch in range(epochs):
    start_time = time.time()
    print("Epoch {}/{}".format(epoch+1, epochs))
    print("-" * 30)
    
    
    epoch_loss = {"train": 0.0, "validation": 0.0}
    epoch_acc = {"train": 0.0, "validation": 0.0}
    
    running_loss = {"train": 0.0, "validation": 0.0}
    running_corrects = {"train": 0, "validation": 0}
    
    for phase in ["train", "validation"]:
        if phase == "train":
            model.train(True)
        else:
            model.train(False)
        
        for data in data_loaders[phase]:
            inputs, labels = data 
            
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad() # clear all gradients
            
            outputs = model(inputs) # batch_size x num_classes
            _, preds = torch.max(outputs.data, 1) # values, indices
            loss = loss_fn(outputs, labels)
            
            if phase == "train":
                loss.backward()  # compute gradients
                optimizer.step() # update weights/biases
               
            running_loss[phase] += loss.data.item() * inputs.size(0)
            running_corrects[phase] += torch.sum(preds == labels.data).item()
        
        epoch_loss[phase] = running_loss[phase] / dataset_sizes[phase]
        epoch_acc[phase] =  running_corrects[phase] / dataset_sizes[phase]

    # Visualize the loss and accuracy values.
    print({
        'time': np.round(time.time()-start_time, 5),
        'train_loss': np.round(epoch_loss["train"], 5),
        'train_acc': np.round(epoch_acc["train"], 5),
        'val_loss': np.round(epoch_loss["validation"], 5),
        'val_acc': np.round(epoch_acc["validation"], 5),
    })
    
    scheduler.step()

    
    
### evaluating the model with test set
print("----------------------------- Test --------------------------------")
with torch.no_grad():
    model.eval()
    running_loss = 0
    running_corrects = 0

    for data in test_loader:
        inputs, labels = data 

        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad() # clear all gradients

        outputs = model(inputs) # batch_size x num_classes
        _, preds = torch.max(outputs.data, 1) # values, indices
        loss = loss_fn(outputs, labels)

        running_loss += loss.data.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data).item()

    # Visualize the loss and accuracy values.
    print({
    'time': np.round(time.time()-start_time, 5),
    'test_loss': np.round(running_loss/ dataset_sizes['test'], 5),
    'test_acc': np.round(running_corrects/ dataset_sizes['test'], 5),
    })

Configuration:  model:DSNet(small)  model_n: 3  batch size: 128  optimizer:SGD  lr: 0.1  epochs: 100
----------------------------- Train --------------------------------
Epoch 1/100
------------------------------
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len weight:  3
len conv:  3
---- in forward 
len norm:  3
len

KeyboardInterrupt: 

In [22]:
from torchsummary import summary
summary(model, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]             448
       BatchNorm2d-2           [-1, 16, 32, 32]              32
              ReLU-3           [-1, 16, 32, 32]               0
            Conv2d-4           [-1, 16, 32, 32]           2,320
       BatchNorm2d-5           [-1, 16, 32, 32]              32
              ReLU-6           [-1, 16, 32, 32]               0
            Conv2d-7           [-1, 16, 32, 32]           2,320
       BatchNorm2d-8           [-1, 16, 32, 32]              32
              ReLU-9           [-1, 16, 32, 32]               0
           Conv2d-10           [-1, 16, 32, 32]           2,320
      BatchNorm2d-11           [-1, 16, 32, 32]              32
             ReLU-12           [-1, 16, 32, 32]               0
           Conv2d-13           [-1, 16, 32, 32]           2,320
      BatchNorm2d-14           [-1, 16,