In [2]:
import copy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker

import torch
import torch.nn as nn
import torch.optim as optim
import time
from datetime import datetime

import torchvision
from torchvision import datasets, transforms, models
import torch.nn.functional as F

from sklearn.metrics import accuracy_score

# Defining the CP-Decomposition function

In [3]:
import tensorly as tl
from tensorly.decomposition import parafac
import numpy as np

approx = 0  # Approximation error (mean approximation if multiple decompositions)
approx_list = []  # List of approximations in case of several decompositions
dim_drop = 0  # Parameters reduction
original_param_list = []  # List of original parameters (in case of multiple layers)
new_param_list = []  # List of new parameters (in case of multiple layers)

def cp_decomposition_conv_layer(layer, rank):

    t = tl.tensor(layer.weight.data)
    a, b, c, d = np.shape(t)
    dim_t = a * b * c * d

    # Perform CP decomposition on the layer weight tensorly.
    dec = parafac(t, rank=rank, init='svd')

    recomp_tensor = tl.kruskal_to_tensor(dec)

    norm_t = np.linalg.norm(t)
    approx_list.append(np.linalg.norm(recomp_tensor - t) / norm_t)
    approx = np.mean(approx_list)

    original_param_list.append(dim_t)
    new_param_list.append(rank * (a + b + c + d))
    dim_drop = sum(original_param_list) / sum(new_param_list)

    for i in range(len(dec.factors)):
        dec.factors[i] = torch.tensor(dec.factors[i])

    last, first, vertical, horizontal = dec.factors

    pointwise_s_to_r_layer = torch.nn.Conv2d(in_channels=first.shape[0],
                                             out_channels=first.shape[1], kernel_size=1, stride=1, padding=0,
                                             dilation=layer.dilation, bias=False)

    depthwise_vertical_layer = torch.nn.Conv2d(in_channels=vertical.shape[1],
                                               out_channels=vertical.shape[1], kernel_size=(vertical.shape[0], 1),
                                               stride=1, padding=(layer.padding[0], 0), dilation=layer.dilation,
                                               groups=vertical.shape[1], bias=False)

    depthwise_horizontal_layer = \
        torch.nn.Conv2d(in_channels=horizontal.shape[1],
                        out_channels=horizontal.shape[1],
                        kernel_size=(1, horizontal.shape[0]), stride=layer.stride,
                        padding=(0, layer.padding[0]),
                        dilation=layer.dilation, groups=horizontal.shape[1], bias=False)

    pointwise_r_to_t_layer = torch.nn.Conv2d(in_channels=last.shape[1],
                                             out_channels=last.shape[0], kernel_size=1, stride=1,
                                             padding=0, dilation=layer.dilation, bias=True)

    pointwise_r_to_t_layer.bias.data = layer.bias.data

    depthwise_horizontal_layer.weight.data = \
        torch.transpose(horizontal, 1, 0).unsqueeze(1).unsqueeze(1)
    depthwise_vertical_layer.weight.data = \
        torch.transpose(vertical, 1, 0).unsqueeze(1).unsqueeze(-1)
    pointwise_s_to_r_layer.weight.data = \
        torch.transpose(first, 1, 0).unsqueeze(-1).unsqueeze(-1)
    pointwise_r_to_t_layer.weight.data = last.unsqueeze(-1).unsqueeze(-1)

    new_layers = [pointwise_s_to_r_layer, depthwise_vertical_layer,
                  depthwise_horizontal_layer, pointwise_r_to_t_layer]

    return nn.Sequential(*new_layers), approx, dim_drop

In [4]:
def _decompose(model, rank, layer = 3):
    model = model.cpu()

    # getting 2-nd convolutional layer
    layer2 = model.features[layer]

    #decomposing layers
    decomp_layers, aprx, ddrop = cp_decomposition_conv_layer(layer2, rank)

    #building new Sequential layer in the right order
    decomp_features = nn.Sequential(
        model.features[0],
        model.features[1],
        model.features[2],
        decomp_layers[0],
        decomp_layers[1],
        decomp_layers[2],
        decomp_layers[3],
        model.features[4],
        model.features[5],
        model.features[6],
        model.features[7],
        model.features[8],
        model.features[9],
        model.features[10],
        model.features[11],
        model.features[12]
    )

    #changing the model features Sequential with the decomposed one
    model.features = decomp_features
    
    return model, aprx, ddrop

# Getting cifar10 dataset and normalizing it

In [5]:
mean = np.array([0.485, 0.456, 0.406])
std  = np.array([0.229, 0.224, 0.225])

In [6]:
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean = mean, std = std)
])

In [7]:
test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean = mean, std = std)
])

In [8]:
data_dir = 'datasets/cifar10/'
batch_size = 8
workers = 1

In [9]:
trainset = datasets.CIFAR10(root=data_dir+'train',
                          train=True,
                          download=True,
                          transform=train_transform)
testset = datasets.CIFAR10(root=data_dir+'test',
                          train=False,
                          download=True,
                          transform=test_transform)

Files already downloaded and verified
Files already downloaded and verified


In [10]:
trainloader = torch.utils.data.DataLoader(trainset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         #num_workers=workers
                                         )
testloader = torch.utils.data.DataLoader(testset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         #num_workers=workers
                                        )
loaders = {
    'train':trainloader,
    'test' :testloader
}

In [11]:
dataset_sizes = {'train': len(trainloader) , 'test': len(testloader)}
dataset_sizes

{'train': 6250, 'test': 1250}

# Defining the Fine-tuning & Testing function

In [12]:
def _finetune(model, criterion, optimizer, device,  num_epochs=25):
    model.to(device)
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            total_ = 0
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for idx, (inputs, labels) in enumerate(loaders[phase]):
                
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                        if idx % 1900 == 0:
                            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                                epoch, idx * len(inputs), len(loaders['train'].dataset),
                               100. * idx / len(loaders['train']), loss.item()))

                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels.data)
                
                total_ += labels.size(0)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / total_

            print('{} Loss: {:.4f} Acc: {:.4f}%'.format(
                phase, epoch_loss, epoch_acc*100))

            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}%'.format(best_acc*100))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [13]:
    def _test(model, device, x_test = loaders['test']):
        model.to(device)
        start = time.time()
        model.eval()  # network in evaluation mode (for batchnorm and dropout layers)
        test_loss = 0
        correct = 0
        with torch.no_grad():  # deactivate the autograd engine to reduce memory usage and speed up
            for (data, target) in x_test:
                data = data.to(device)
                target = target.to(device)
                output = model.forward(data)  # prediction with the CharNet
                test_loss += F.cross_entropy(output, target).item()  # Add the negative log likelihood loss.
                pred = output.data.max(1, keepdim=True)[1]
                correct += pred.eq(target.data.view_as(pred)).sum()
        test_loss /= len(x_test.dataset)
        score = int(correct) / len(x_test.dataset)
        
        print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
            test_loss, correct, len(x_test.dataset),
            100. * score))
        
        #return test_loss, score, time.time() - start
        return score, time.time() - start

# Training ALEXNET

In [15]:
want_to_train = False
want_to_freeze_layers = False

if want_to_train:
    #loading pretrained model
    model = models.alexnet(pretrained=True)
    
    #freezing layers
    if want_to_freeze_layers:
        for param in model.parameters():
            param.requires_grad = False
            
    #changing its classifier output
    model.classifier[6] = nn.Linear(4096, 10)
    
    #defining training parameters
    criterion = nn.CrossEntropyLoss()
    lr = 0.00001
    num_epochs = 1
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    #training
    model = _finetune(model, criterion, optimizer, device, num_epochs=num_epochs)
    
    #saving the model
    model.to('cpu')
    torch.save(model.state_dict(), './models/alexnet/alexnet_' + datetime.now().strftime("%d-%m-%Y_%H:%M:%S") +'.pth')

# CP-Decomposing and fine-tunning second convolution of ALEXNET

In [36]:
adress = './models/best/alexnet/alexnet_89.pth'

# Loading pretrained & finetuned ALEXNET and setting the output layer 
model = models.alexnet()
model.classifier[6] = nn.Linear(4096, 10)
model.load_state_dict(torch.load(adress))

# parameters
ranks_to_decomp = [4,8,16,32,64,128,256,512]
layer_to_decomp = 3 #the 2nd conv layer of alexnet is 3
n_epochs = 10
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# recording computing time of the original model
tot_time=0
for iter in range(4):
    old_acc, old_computation_time = _test(model, "cpu")
    tot_time += old_computation_time
    
old_computation_time = tot_time / 4

approximation_list, dim_drop_list, acc_list, computation_list, acc_ft_list = [], [], [], [], []


for rank in ranks_to_decomp:
    # Loading pretrained & finetuned ALEXNET and setting the output layer 
    model = models.alexnet()
    model.classifier[6] = nn.Linear(4096, 10)
    model.load_state_dict(torch.load(adress))

    model, approximation, dim_drop = _decompose(model, rank, layer_to_decomp)
    
    # recording computing time of the decomposed model
    tot_time = 0
    for iter in range(4):
        new_acc, new_computation_time = _test(model, "cpu")
        tot_time += new_computation_time
    new_computation_time = tot_time / 4
    
    #calculating acc drop and speed up
    accuracy_drop = (old_acc - new_acc) / old_acc
    speed_up = (old_computation_time - new_computation_time) / old_computation_time

    approximation_list.append(100 * approximation)
    dim_drop_list.append(dim_drop)
    acc_list.append(100 * accuracy_drop)
    computation_list.append(100 * speed_up)

    print("Rank: {}, Approx error : {:.2f} %,  Acc drop : {:.2f} %, "
                  "Speed-up : {:.2f} %, Param red : {:.3f}\n".format(rank, 100 * approximation,
                                                                          100 * accuracy_drop, 100 * speed_up,
                                                                          dim_drop))

    # fine-tunning model
    optimizer = optim.Adam(model.parameters(), lr=0.00001)
    model = _finetune(model, criterion, optimizer, device, num_epochs=n_epochs)

    ft_acc, ft_computation_time = _test(model, device)
    accuracy_drop = (old_acc - ft_acc) / old_acc
    acc_ft_list.append(100 * accuracy_drop)


Test set: Avg. loss: 0.0393, Accuracy: 8888/10000 (89%)

Test set: Avg. loss: 0.0393, Accuracy: 8888/10000 (89%)

Test set: Avg. loss: 0.0393, Accuracy: 8888/10000 (89%)

Test set: Avg. loss: 0.0393, Accuracy: 8888/10000 (89%)
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 4, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (4): Conv2d(4, 4, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0), groups=4, bias=False)
    (5): Conv2d(4, 4, kernel_size=(1, 5), stride=(1, 1), padding=(0, 2), groups=4, bias=False)
    (6): Conv2d(4, 192, kernel_size=(1, 1), stride=(1, 1))
    (7): ReLU(inplace=True)
    (8): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU(inplace=True)
    (11): Co

RuntimeError: CUDA out of memory. Tried to allocate 144.00 MiB (GPU 0; 10.91 GiB total capacity; 244.88 MiB already allocated; 59.88 MiB free; 250.00 MiB reserved in total by PyTorch)

# Testing the model

In [18]:
_test(model, device)


Test set: Avg. loss: 0.0393, Accuracy: 8888/10000 (89%)


(0.8888, 14.521871328353882)

# Plotting results

In [15]:
x = ranks_to_decomp

formatter = ticker.ScalarFormatter()
formatter.set_scientific(False)

x_labels = [4, 16, 64,  256]
err_ticks = [100, 80, 60 ,40, 20]
err_ticks1 = [100, 80, 60, 40, 30, 20 ,0]


fig, ax = plt.subplots(ncols=4, nrows=1, figsize=(16, 4))
ax = ax.flatten()

plt.suptitle('AlexNet', fontsize=18)

ax[0].plot(x, approximation_list, marker='o', mfc='none',  c='#3838D0')
ax[0].set_xlabel('Approximation Error (%)')
ax[0].set_xscale('log')
ax[0].xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))
ax[0].set_xticks(x_labels)
ax[0].set_yticks(err_ticks)


ax[1].plot(x, acc_list, marker='o', mfc='none', c='red')
ax[1].plot(x, acc_ft_list, 'r--', marker='o', mfc='none')
ax[1].set_xlabel('Accuracy Drop (%)')
ax[1].set_xscale('log')
ax[1].xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))
ax[1].set_xticks(x_labels)
ax[1].set_yscale('linear')
ax[1].set_yticks([100, 50, 10, 1])
ax[1].yaxis.set_major_formatter(formatter)
ax[1].set_ylim([0,100])

ax[2].plot(x, computation_list, marker='o', mfc='none', c='green')
ax[2].set_xlabel('Speed-up (%)')
ax[2].set_xscale('log')
ax[2].xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))
ax[2].set_xticks(x_labels)
ax[2].set_yscale('linear')
ax[2].yaxis.set_major_formatter(formatter)
ax[2].set_ylim([0,40])


ax[3].plot(x, dim_drop_list, marker='o', mfc='none', c='#2C89D0')
ax[3].set_xlabel('Parameters reduction (x)')
ax[3].set_xscale('log')
ax[3].xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))
ax[3].set_xticks(x_labels)
ax[3].set_ylim([0,300])

plt.show()

NameError: name 'ranks_to_decomp' is not defined

In [16]:
adress = './models/best/alexnet/alexnet_89.pth'

# Loading pretrained & finetuned ALEXNET and setting the output layer 
model = models.alexnet()
model.classifier[6] = nn.Linear(4096, 10)
model.load_state_dict(torch.load(adress))

<All keys matched successfully>