# Network compression using Singular Value Decomposition on weights
In this jupyter notebook, I have tried to compress weights of Deep Neural Network of 5-Layers for MNIST dataset problem. I have used Singular Value Decomposition on the weights of each layer by selecting top 10 and 20 features in Singular matrix.

With normal 5-Layer Deep Neural Network (Fully connected layers) the test accuracies on MNIST data was 98.5%

With top 20 features of Singular matrix values of trained weights, I was able to get 95% accuracy on test data for trained model.

Training the model again with top 20 features of Singular matrix of trained weights, I was able to get accuracy of around 98%. And the training time was smaller compared to original training, as the weights were reduced.

In [1]:
from __future__ import print_function
from time import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.nn import Parameter
import argparse

In [2]:
parser = argparse.ArgumentParser(description='PyTorch Example')
parser.add_argument('--disable-cuda', action='store_true',
                    help='Disable CUDA')
parser.add_argument('--interval',metavar='N',default=1000)
args = parser.parse_args(args=[])
args.cuda = not args.disable_cuda and torch.cuda.is_available()
#Is cuda is present?
print(args.cuda)
#Total number of GPU available
print(torch.cuda.device_count())


True
1


In [3]:
#Load MNiSt data
train_loader = torch.utils.data.DataLoader(datasets.MNIST('/opt/e533/MNIST',
                    train=True,
                    download=False,
                    transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))
                    ])),
                    batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('/opt/e533/MNIST',
        train=False,
        transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=128, shuffle=True)

In [4]:
#Baseline Model for training 

class Model(torch.nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.fc4 = nn.Linear(1024, 1024)
        self.fc5 = nn.Linear(1024, 1024)
        self.fc6 = nn.Linear(1024, 10)
        
    def forward(self,x):
        x=F.relu(self.fc1(x))
        x=F.relu(self.fc2(x))
        x=F.relu(self.fc3(x))
        x=F.relu(self.fc4(x))
        x=F.relu(self.fc5(x))
        x=self.fc6(x)
        return F.log_softmax(x,dim=1)
    
    


In [5]:
model = Model()

if torch.cuda.is_available():
    model=model.cuda()
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

In [6]:
def train(epoch):
    
    for batch_idx,(data,target) in enumerate(train_loader):
        data=data.view(-1,28*28)
#         print(data.shape)
        if torch.cuda.is_available():
            data,target = data.cuda(),target.cuda()
        data,target = Variable(data),Variable(target)
        # Clears the gradients of all optimized Variables
        
        optimizer.zero_grad()
        y_pred = model(data)
        loss = criterion(y_pred, target)
        loss.backward()
        optimizer.step()

        if batch_idx % args.interval  == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
        epoch, batch_idx * len(data), len(train_loader.dataset),
        100. * batch_idx / len(train_loader), loss.data[0]))

### Uncomment model load and comment the training for loop, to save time from training data


In [7]:
#Load already available model
# model.load_state_dict(torch.load("optimum_wts"))

#Uncomment and comment above code the followint code to get new weights 

for t in range(53):
    train(t) 



In [8]:
def baseline_test():
    
    model.eval()
    test_loss = 0
    correct = 0
    

    for i,(data,target) in enumerate(test_loader):
        data = data.view(-1,28*28)
        
        if torch.cuda.is_available():
            data,target = data.cuda(),target.cuda()
        data,target = Variable(data),Variable(target)
        
        y_pred = model(data)
        
        
        test_loss += criterion(y_pred, target).data[0]
        pred = y_pred.data.max(1)[1]
        correct += pred.eq(target.data).sum()
    
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.001f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * float(correct) / len(test_loader.dataset)))
   

In [9]:
baseline_test()


Test set: Average loss: 0.0009, Accuracy: 9853/10000 (98.5%)



### The following are the accuracies of different D
##### For D = 10 features the test accuracies vary from 67.9 % to 74.5 % 
##### For D = 20, the test accuracies vary from  94 % to 95 %
##### For D = 30,40 the test accuracies vary from 97 % to 97.6%
##### For D = 50, the test accuracies is 98.1%
#### For D=1024, the test accuracies is 98.5% (base line accuracies)

In [38]:
class Test_Model(torch.nn.Module):
    
    def __init__(self,model):
        super(Test_Model, self).__init__()
        self.D = 20
        
        #weight1 and bias
        params=model.state_dict()
        
        self.weight1 = Variable(params["fc1.weight"].t())
        self.bias1 = Variable(params["fc1.bias"])
        

        
        u,s,v = torch.svd(self.weight1)
        u,s,v = self.reshape_mat(u,s,v)
        
        self.weight1_hat,self.v1 = self.new_weights(u,s,v)
        self.v1=Parameter(self.v1,requires_grad=True)
        self.weight1_hat=Parameter(self.weight1_hat,requires_grad=True)
        
        #weight2
        self.weight2 = Variable(params["fc2.weight"].t())
        self.bias2 = Variable(params["fc2.bias"])
        
        #Network compression
        u,s,v = torch.svd(self.weight2)
        u,s,v = self.reshape_mat(u,s,v)
        
        self.weight2_hat,self.v2 = self.new_weights(u,s,v)
        self.v2=Parameter(self.v2,requires_grad=True)
        self.weight2_hat=Parameter(self.weight2_hat,requires_grad=True)
        
        #weight3
        self.weight3 = Variable(params["fc3.weight"].t())
        self.bias3 = Variable(params["fc3.bias"])
        
        #Network compression
        
        u,s,v = torch.svd(self.weight3)
        u,s,v = self.reshape_mat(u,s,v)
        
        self.weight3_hat,self.v3 = self.new_weights(u,s,v)
        self.v3=Parameter(self.v3,requires_grad=True)
        self.weight3_hat=Parameter(self.weight3_hat,requires_grad=True)
        
        #weight4
        self.weight4 = Variable(params["fc4.weight"].t())
        self.bias4 = Variable(params["fc4.bias"])
        
        #Network compression
        
        u,s,v = torch.svd(self.weight4)

        
        u,s,v = self.reshape_mat(u,s,v)
        
        self.weight4_hat ,self.v4 = self.new_weights(u,s,v)
        self.v4=Parameter(self.v4,requires_grad=True)
        self.weight4_hat=Parameter(self.weight4_hat,requires_grad=True)
        
        
        #weight5
        self.weight5 = Variable(params["fc5.weight"].t())
        self.bias5 = Variable(params["fc5.bias"])
        
        #Network compression
        
        u,s,v = torch.svd(self.weight5)
        u,s,v = self.reshape_mat(u,s,v)
        
        self.weight5_hat ,self.v5 = self.new_weights(u,s,v)
        self.v5=Parameter(self.v5,requires_grad=True)
        self.weight5_hat=Parameter(self.weight5_hat,requires_grad=True)
        
        #weight6
        self.weight6 = Variable(params["fc6.weight"].t())
        self.bias6 = Variable(params["fc6.bias"])
        
        
    def forward(self, x):
        
        
        hidden_1 = x.mm(self.weight1_hat).mm(self.v1)
        hidden_1 = hidden_1.add(self.bias1)
        hidden_1_relu = hidden_1.clamp(min=0)
        
        
        hidden_2 = hidden_1_relu.mm(self.weight2_hat).mm(self.v2)
        hidden_2 = hidden_2.add(self.bias2)
        hidden_2_relu = hidden_2.clamp(min=0)
        
        hidden_3 = hidden_2_relu.mm(self.weight3_hat).mm(self.v3)
        hidden_3 = hidden_3.add(self.bias3)
        hidden_3_relu = hidden_3.clamp(min=0)
        
        hidden_4 = hidden_3_relu.mm(self.weight4_hat).mm(self.v4)
        hidden_4 = hidden_4.add(self.bias4)
        hidden_4_relu = hidden_4.clamp(min=0)
        
        hidden_5 = hidden_4_relu.mm(self.weight5_hat).mm(self.v5)
        hidden_5 = hidden_5.add(self.bias5)
        hidden_5_relu = hidden_5.clamp(min=0)
        
        
        last_layer = hidden_5_relu.mm(self.weight6)
        last_layer = last_layer.add(self.bias6)

        return F.log_softmax(last_layer,dim=1)
    
    def reshape_mat(self,u,s,v):
        u = u[:,:self.D]
         
        s = s[:self.D]
        
        v = v[:,:self.D]
        
        return u,s,v
    
    def new_weights(self,u,s,v):
        
#         return torch.mm(u,torch.diag(s)).data,v.t().data
        return u.data,torch.mm(torch.diag(s),v.t()).data


In [39]:
model2 = Test_Model(model)
criterion2 = torch.nn.NLLLoss()


In [40]:
def find_rank_test():
    

    test_loss = 0
    correct = 0
    

    for i,(data,target) in enumerate(test_loader):
        data = data.view(-1,28*28)
        
        if torch.cuda.is_available():
            data,target = data.cuda(),target.cuda()
        data,target = Variable(data),Variable(target)
        
        y_pred = model2(data)
        
        
        test_loss += criterion2(y_pred, target).data[0]
        pred = y_pred.data.max(1)[1]
        correct += pred.eq(target.data).sum()
    
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.001f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * float(correct) / len(test_loader.dataset)))
   

In [41]:
find_rank_test()


Test set: Average loss: 0.0030, Accuracy: 9396/10000 (94.0%)



In [42]:
#Code to save best weights
# torch.save(model.state_dict(),"optimum_wts")

#### The svd model weights are given to the Model_Three network

In [43]:
class Model_Three(torch.nn.Module):
    
    def __init__(self,base_model,svd_model):
        super(Model_Three, self).__init__()
        
        
        #weight1 and bias
        params_one = base_model.state_dict()
        params_two = svd_model.state_dict()
        
        #Get U and S.V^T from the models
        self.weight1_hat,self.v1 = Parameter(params_two["weight1_hat"],requires_grad=True),Parameter(params_two["v1"],requires_grad=True)
        self.weight2_hat,self.v2 = Parameter(params_two["weight2_hat"],requires_grad=True),Parameter(params_two["v2"],requires_grad=True)
        self.weight3_hat,self.v3 = Parameter(params_two["weight3_hat"],requires_grad=True),Parameter(params_two["v3"],requires_grad=True)
        self.weight4_hat,self.v4 = Parameter(params_two["weight4_hat"],requires_grad=True),Parameter(params_two["v4"],requires_grad=True)
        self.weight5_hat,self.v5 = Parameter(params_two["weight5_hat"],requires_grad=True),Parameter(params_two["v5"],requires_grad=True)
        #Final layer weight update is not needed. So we will not attach it to this models parameters
        self.weight6 = params_one["fc6.weight"].t()
                                     
                                     
        self.bias1 = Parameter(params_one["fc1.bias"],requires_grad=True)
        self.bias2 = Parameter(params_one["fc2.bias"],requires_grad=True)
        self.bias3 = Parameter(params_one["fc3.bias"],requires_grad=True)
        self.bias4 = Parameter(params_one["fc4.bias"],requires_grad=True)
        self.bias5 = Parameter(params_one["fc5.bias"],requires_grad=True)
        self.bias6 = Parameter(params_one["fc6.bias"],requires_grad=True)
        
        
        
        
    def forward(self, x):
        
        
        hidden_1 = x.mm(self.weight1_hat).mm(self.v1)
        hidden_1 = hidden_1.add(self.bias1)
        hidden_1_relu = hidden_1.clamp(min=0)
        
        
        hidden_2 = hidden_1_relu.mm(self.weight2_hat).mm(self.v2)
        hidden_2 = hidden_2.add(self.bias2)
        hidden_2_relu = hidden_2.clamp(min=0)
        
        hidden_3 = hidden_2_relu.mm(self.weight3_hat).mm(self.v3)
        hidden_3 = hidden_3.add(self.bias3)
        hidden_3_relu = hidden_3.clamp(min=0)
        
        hidden_4 = hidden_3_relu.mm(self.weight4_hat).mm(self.v4)
        hidden_4 = hidden_4.add(self.bias4)
        hidden_4_relu = hidden_4.clamp(min=0)
        
        hidden_5 = hidden_4_relu.mm(self.weight5_hat).mm(self.v5)
        hidden_5 = hidden_5.add(self.bias5)
        hidden_5_relu = hidden_5.clamp(min=0)
        
        
        last_layer = hidden_5_relu.mm(Variable(self.weight6))
        last_layer = last_layer.add(self.bias6)

        return F.log_softmax(last_layer,dim=1)

In [44]:
model3 = Model_Three(model,model2)
if torch.cuda.is_available():
    model3=model3.cuda()
criterion3 = torch.nn.NLLLoss()

In [45]:
optimizer3 = torch.optim.Adam(model3.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08)

# optimizer3 = torch.optim.Adam(model3.parameters(), lr=0.0001)


In [46]:
#part 6.a) 
# We have defined the model required for 6.a as Test_Model let us learn U and v_hat using back propagation

def train_svd(epoch):
    for batch_idx,(data,target) in enumerate(train_loader):
        data=data.view(-1,28*28)

        if torch.cuda.is_available():
            data,target = data.cuda(),target.cuda()
        data,target = Variable(data),Variable(target)
        # Clears the gradients of all optimized Variables
        
        optimizer3.zero_grad()
        y_pred = model3(data)
        loss = criterion3(y_pred, target)
        loss.backward()
        optimizer3.step()

        if batch_idx % args.interval  == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
        epoch, batch_idx * len(data), len(train_loader.dataset),
        100. * batch_idx / len(train_loader), loss.data[0]))

In [47]:
for t in range(25):
    train_svd(t) 



In [48]:
def test_svd():
    model3.eval()
    test_loss = 0
    correct = 0
    

    for i,(data,target) in enumerate(test_loader):
        data = data.view(-1,28*28)
        
        if torch.cuda.is_available():
            data,target = data.cuda(),target.cuda()
        data,target = Variable(data),Variable(target)
        
        y_pred = model3(data)
        
        
        test_loss += criterion3(y_pred, target).data[0]
        pred = y_pred.data.max(1)[1]
        correct += pred.eq(target.data).sum()
    
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.001f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * float(correct) / len(test_loader.dataset)))
   

In [49]:
test_svd()


Test set: Average loss: 0.0012, Accuracy: 9797/10000 (98.0%)



#### The test accuracies after BP the svd, is 98% .