Prepare the data:

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
from torch.nn import Parameter
from torch.nn.modules.module import Module
from torchsummary import summary

In [2]:
# Use the first CUDA GPU when PyTorch reports one, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
print(device)
# Trailing expression: the notebook echoes the availability flag as the cell output.
cuda_available

cuda:0


True

In [3]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# normalise every one of the three channels with mean 0.5 and std 0.5.
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# The two loaders differ only in the split they read and in whether they shuffle.
loader_options = dict(batch_size=64, num_workers=4)

trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, shuffle=True, **loader_options)

testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, shuffle=False, **loader_options)

# Human-readable class names, one per label index.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [4]:

class PruningModule(Module):
    """Base class that adds magnitude-based pruning helpers to a network.

    Every descendant sub-module that exposes a callable ``prune(threshold)``
    method (``MaskedLinear`` in this notebook) is treated as prunable, so the
    helpers no longer depend on layers being called ``fc1``/``fc2``/``fc3``.
    """

    def _prunable_modules(self):
        """Yield ``(name, module)`` for each descendant offering ``prune``."""
        for name, module in self.named_modules():
            if module is self:
                # named_modules() also yields the root; never treat it as a layer.
                continue
            if callable(getattr(module, 'prune', None)):
                yield name, module

    def prune_by_percentile(self, q=5.0, **kwargs):
        """Prune every prunable layer with one global magnitude threshold.

        Note:
            The threshold is the ``q``-th percentile of the absolute values of
            all non-zero parameters whose name contains neither ``bias`` nor
            ``mask``. That population includes parameters of layers that are
            not themselves prunable (e.g. convolution weights), even though
            only prunable layers are modified.

        Args:
            q (float): percentile in the range [0, 100].
            **kwargs: accepted for backward compatibility and ignored.

        Returns:
            None. When there is no non-zero parameter to derive a threshold
            from, the call is a no-op.
        """
        # Collect the surviving (non-zero) values of every candidate parameter.
        alive_parameters = []
        for name, p in self.named_parameters():
            # We do not prune bias terms, and masks are not weights.
            if 'bias' in name or 'mask' in name:
                continue
            tensor = p.detach().cpu().numpy()
            alive_parameters.append(tensor[np.nonzero(tensor)])

        # np.concatenate raises on an empty list; nothing to prune in that case.
        if not alive_parameters:
            return
        all_alives = np.concatenate(alive_parameters)
        if all_alives.size == 0:
            return

        percentile_value = np.percentile(np.abs(all_alives), q)
        print(f'Pruning with threshold : {percentile_value}')

        # Apply the same threshold to every prunable layer.
        for _, module in self._prunable_modules():
            module.prune(threshold=percentile_value)

    def prune_by_std(self, s=0.25):
        """Prune each prunable layer with a per-layer threshold of ``s * std``.

        ``s`` is the quality parameter / sensitivity value from Song Han's
        paper (Learning both Weights and Connections for Efficient Neural
        Networks): 'The pruning threshold is chosen as a quality parameter
        multiplied by the standard deviation of a layer's weights'.

        The original author reports that 0.25 empirically matches the paper's
        compression rate and number of parameters; the paper itself uses
        different sensitivity values for different layers.

        Args:
            s (float): multiplier applied to the standard deviation of the
                layer's full weight tensor (already-pruned zeros included).
        """
        for name, module in self._prunable_modules():
            weight = getattr(module, 'weight', None)
            if weight is None:
                # A prunable module without a weight tensor has nothing to measure.
                continue
            threshold = np.std(weight.detach().cpu().numpy()) * s
            print(f'Pruning with threshold : {threshold} for layer {name}')
            module.prune(threshold)


class MaskedLinear(Module):
    r"""Applies a masked linear transformation to the incoming data: :math:`y = (A * M)x + b`

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to False, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(N, *, in\_features)` where `*` means any number of
          additional dimensions
        - Output: :math:`(N, *, out\_features)` where all but the last dimension
          are the same shape as the input.

    Attributes:
        weight: the learnable weights of the module of shape
            (out_features x in_features)
        bias:   the learnable bias of the module of shape (out_features)
        mask: the unlearnable mask for the weight.
            It has the same shape as weight (out_features x in_features)

    """
    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Uninitialised storage; reset_parameters() fills it below.
        self.weight = Parameter(torch.empty(out_features, in_features))
        # The mask starts as all ones (nothing pruned). It is registered as a
        # non-trainable Parameter so it keeps its slot between weight and bias
        # in parameters() / state_dict().
        self.mask = Parameter(torch.ones(out_features, in_features), requires_grad=False)
        if bias:
            self.bias = Parameter(torch.empty(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight (and bias, if present) uniformly from ±1/sqrt(in_features)."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        """Return ``input @ (weight * mask).T + bias``."""
        # Multiplying by the mask here also zeroes the gradient of masked weights.
        return F.linear(input, self.weight * self.mask, self.bias)

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'in_features={self.in_features}, '
                f'out_features={self.out_features}, '
                f'bias={self.bias is not None})')

    def prune(self, threshold):
        """Zero every weight whose magnitude is strictly below ``threshold``.

        The mask entry of each such weight is cleared, entries cleared by an
        earlier call stay cleared, and the weight tensor is multiplied by the
        resulting mask. Everything happens in place on the tensors' current
        device, so dtype, device and Parameter identity are preserved.

        Args:
            threshold: magnitude cut-off; a Python or NumPy scalar.
        """
        # Accept NumPy scalars (np.std / np.percentile results) uniformly.
        threshold = float(threshold)
        with torch.no_grad():
            below = self.weight.abs() < threshold
            self.mask.masked_fill_(below, 0.0)
            self.weight.mul_(self.mask)


Define the network:

In [5]:
class Net(PruningModule):
    """Small convolutional classifier with a prunable fully connected head.

    Two convolution + max-pool stages feed three ``MaskedLinear`` layers; the
    last layer produces 10 raw scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor. The single pooling module is reused after each conv.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Classifier head built from maskable (prunable) linear layers.
        self.fc1 = MaskedLinear(16 * 5 * 5, 120)
        self.fc2 = MaskedLinear(120, 84)
        self.fc3 = MaskedLinear(84, 10)

    def forward(self, x):
        """Run the conv stages, flatten to 400 features per row, apply the head."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        flat = features.view(-1, 16 * 5 * 5)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Build the model on the selected device, then print a layer-by-layer summary
# for a 3x32x32 input.
net = Net().to(device)
summary(net, (3, 32, 32))
# Freeze the first layer
#for param in net.fc1.parameters():
#    param.requires_grad = False

# Initialize the first layer
#def weights_init(m):
#    if isinstance(m, nn.Linear):
#        m.weight.data.normal_(0, 0.01)
    
#net.apply(weights_init)

#net.prune_by_std()

#print(net)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 6, 28, 28]             456
         MaxPool2d-2            [-1, 6, 14, 14]               0
            Conv2d-3           [-1, 16, 10, 10]           2,416
         MaxPool2d-4             [-1, 16, 5, 5]               0
      MaskedLinear-5                  [-1, 120]          48,120
      MaskedLinear-6                   [-1, 84]          10,164
      MaskedLinear-7                   [-1, 10]             850
Total params: 62,006
Trainable params: 62,006
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.06
Params size (MB): 0.24
Estimated Total Size (MB): 0.31
----------------------------------------------------------------


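`params[0]`–`params[3]` are the weights and biases of the two convolution layers. `params[4]` is the weight matrix of the first fully connected layer (`fc1`), `params[5]` is its mask and `params[6]` is its bias; `fc2` and `fc3` follow in the same weight, mask, bias order, which gives 13 parameter tensors in total.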

In [6]:
# Materialise the parameter iterator so individual tensors can be indexed by position.
params = [*net.parameters()]
print(len(params))
print(params[4])

13
Parameter containing:
tensor([[-0.0214, -0.0039, -0.0270,  ..., -0.0166, -0.0090, -0.0218],
        [-0.0334, -0.0108, -0.0015,  ..., -0.0237,  0.0226, -0.0432],
        [ 0.0433, -0.0383,  0.0144,  ..., -0.0284, -0.0414,  0.0333],
        ...,
        [ 0.0197, -0.0304, -0.0278,  ..., -0.0384,  0.0454,  0.0064],
        [-0.0260, -0.0139, -0.0254,  ...,  0.0478,  0.0123,  0.0180],
        [ 0.0257, -0.0456, -0.0299,  ..., -0.0418,  0.0291,  0.0163]],
       device='cuda:0', requires_grad=True)


In [7]:
import torch.optim as optim

# CrossEntropyLoss has softmax built in, so the network's last layer emits raw scores.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)
# Snapshot of the optimizer taken before any update step has been performed.
initial_optimizer_state_dict = optimizer.state_dict()

In [8]:
def train(epochs, log_every=200):
    """Train the global ``net`` on ``trainloader`` for ``epochs`` passes.

    Uses the notebook-level ``net``, ``trainloader``, ``criterion``,
    ``optimizer`` and ``device``. Prints the mean loss of the last
    ``log_every`` mini-batches each time that many batches have been
    processed, and the training accuracy at the end of every epoch.

    Args:
        epochs (int): number of passes over the training set.
        log_every (int): number of mini-batches between loss printouts;
            a falsy value disables the loss printouts. Default: 200.
    """
    # Make sure the model is in training mode even if it was switched to
    # evaluation mode elsewhere.
    net.train()

    for epoch in range(epochs):  # loop over the dataset multiple times

        train_correct = 0
        train_total = 0
        running_loss = 0.0

        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print the mean loss every `log_every` mini-batches
            running_loss += loss.item()
            if log_every and (i + 1) % log_every == 0:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / log_every))
                running_loss = 0.0

            # training accuracy: the predicted class is the highest-scoring output
            predicted = outputs.detach().argmax(dim=1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

        if train_total:
            print('Train Accuracy: %.2f %%' % (100 * train_correct / train_total))
        else:
            # An empty loader would otherwise divide by zero.
            print('Train Accuracy: n/a (no training samples)')

    print('Finished Training')

In [9]:
def test():
    """Print the accuracy of the global ``net`` on ``testloader``.

    Uses the notebook-level ``net``, ``testloader`` and ``device``. The model
    is put in evaluation mode for the duration of the call and its previous
    training/evaluation mode is restored afterwards. Gradients are not tracked.
    """
    correct = 0
    total = 0

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for data in testloader:
                images, labels = data[0].to(device), data[1].to(device)
                outputs = net(images)
                # The predicted class is the index of the highest score.
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        net.train(was_training)

    if total == 0:
        # An empty loader would otherwise divide by zero.
        print('Accuracy of the network: n/a (no test samples)')
        return

    print('Accuracy of the network on the 10000 test images: %.2f %%' % (
        100 * correct / total))

In [10]:
# Baseline: train for ten epochs, then report accuracy on the test set.
train(epochs=10)
test()

[1,   200] loss: 2.303
[1,   400] loss: 2.301
[1,   600] loss: 2.300
Train Accuracy: 10.57 %
[2,   200] loss: 2.289
[2,   400] loss: 2.273
[2,   600] loss: 2.229
Train Accuracy: 19.07 %
[3,   200] loss: 2.026
[3,   400] loss: 1.976
[3,   600] loss: 1.929
Train Accuracy: 28.65 %
[4,   200] loss: 1.854
[4,   400] loss: 1.804
[4,   600] loss: 1.751
Train Accuracy: 34.87 %
[5,   200] loss: 1.670
[5,   400] loss: 1.642
[5,   600] loss: 1.620
Train Accuracy: 40.77 %
[6,   200] loss: 1.573
[6,   400] loss: 1.554
[6,   600] loss: 1.525
Train Accuracy: 43.98 %
[7,   200] loss: 1.511
[7,   400] loss: 1.480
[7,   600] loss: 1.475
Train Accuracy: 46.65 %
[8,   200] loss: 1.437
[8,   400] loss: 1.426
[8,   600] loss: 1.406
Train Accuracy: 48.47 %
[9,   200] loss: 1.393
[9,   400] loss: 1.378
[9,   600] loss: 1.373
Train Accuracy: 50.43 %
[10,   200] loss: 1.334
[10,   400] loss: 1.352
[10,   600] loss: 1.320
Train Accuracy: 52.23 %
Finished Training
Accuracy of the network on the 10000 test images:

In [12]:
# Prune with the default sensitivity (0.25), then re-measure test accuracy and
# print the model summary again.
net.prune_by_std(s=0.25)
test()
summary(net, (3, 32, 32))

Pruning with threshold : 0.007882364094257355 for layer fc1
Pruning with threshold : 0.014518674463033676 for layer fc2
Pruning with threshold : 0.02726072259247303 for layer fc3
Accuracy of the network on the 10000 test images: 52.56 %
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 6, 28, 28]             456
         MaxPool2d-2            [-1, 6, 14, 14]               0
            Conv2d-3           [-1, 16, 10, 10]           2,416
         MaxPool2d-4             [-1, 16, 5, 5]               0
      MaskedLinear-5                  [-1, 120]          48,120
      MaskedLinear-6                   [-1, 84]          10,164
      MaskedLinear-7                   [-1, 10]             850
Total params: 62,006
Trainable params: 62,006
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (M

In [None]:
# Restore the optimizer snapshot saved right after construction, then train
# the pruned network for ten more epochs and evaluate it.
optimizer.load_state_dict(state_dict=initial_optimizer_state_dict)
train(epochs=10)
#print(params[4])
test()