In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import numpy as np
import math
import time
from datetime import datetime
import os
from torch.utils.data import Dataset, TensorDataset

In [2]:
# Experiment-level histories: each training run appends its own list of
# per-step training losses, per-epoch test losses and per-epoch test accuracies.
train_losses, test_results_exp, test_accs = [], [], []

In [3]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU so that
# the notebook still runs on machines without a CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 128       # mini-batch size used by both data loaders
workers = 8            # number of DataLoader worker processes
lr = 0.05              # initial learning rate (read by adjust_learning_rate and the optimizer)
momentum = 0.9         # SGD momentum
weight_decay = 5e-4    # L2 penalty passed to SGD
print_freq = 20        # NOTE(review): not referenced by the cells visible here

In [4]:
def adjust_learning_rate(optimizer, epoch, base_lr=None, decay=0.5, step=30):
    """Apply a step schedule to every parameter group of ``optimizer``.

    The learning rate is set to ``base_lr * decay ** (epoch // step)``; with
    the defaults it is halved every 30 epochs.

    Args:
        optimizer: object exposing ``param_groups`` (a list of dicts with an
            ``'lr'`` entry), e.g. a ``torch.optim`` optimizer.
        epoch: zero-based epoch index.
        base_lr: initial learning rate. When ``None`` the module-level ``lr``
            is used, which matches the original behaviour.
        decay: multiplicative factor applied once per ``step`` epochs.
        step: number of epochs between two decays.
    """
    if base_lr is None:
        # Backward-compatible default: read the notebook-level configuration.
        base_lr = lr
    new_lr = base_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr

In [5]:
# Channel-wise normalisation shared by the training and validation pipelines.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training images: random horizontal flip and a 32x32 random crop with 4 pixels
# of padding, then tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    normalize,
])
train_set = datasets.CIFAR10(root='data', train=True,
                             transform=train_transform, download=True)
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=workers,
    pin_memory=True,
)

# Validation images: no augmentation, only tensor conversion and normalisation.
val_transform = transforms.Compose([
    transforms.ToTensor(),
    normalize,
])
val_set = datasets.CIFAR10(root='data', train=False, transform=val_transform)
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=batch_size,
    shuffle=False,
    num_workers=workers,
    pin_memory=True,
)

Files already downloaded and verified


In [6]:
class Seq_CNN(nn.Module):
    """Convolution that processes channel groups one after another with shared kernels.

    The input channels are split into ``group`` chunks of ``group_in``
    channels. Every chunk is convolved with the same kernel ``param_u``; from
    the second chunk on, the previous chunk's output is convolved with
    ``param_v`` and merged with the current one through ``ode_compute``. The
    per-chunk outputs are concatenated along the channel axis and truncated to
    ``out_channels``.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size: side length of the square kernels.
        group: number of sequential channel chunks.
        stride: stride of the ``param_u`` convolution.
        padding: padding of the ``param_u`` convolution.
        bias: when true, learnable biases are added to both convolutions.
        depth: when true, both convolutions are grouped (one input channel per
            filter) instead of dense.
    """

    def __init__(self, in_channels, out_channels, kernel_size, group, stride=1, padding=0, bias=True,depth = False):
        super(Seq_CNN, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        # Channels per chunk, rounded up so that `group` chunks cover all channels.
        self.group_in = int(math.ceil(in_channels / group))
        self.group_out = int(math.ceil(out_channels / group))
        self.kernel_size = kernel_size
        self.group = group
        self.stride = stride
        self.padding = padding
        self.depth = depth

        if self.depth:
            # Grouped kernels: each filter sees a single input channel.
            base_u = torch.empty(self.group_out, 1, kernel_size, kernel_size)
            base_v = torch.empty(self.group_out, 1, kernel_size, kernel_size)
        else:
            base_u = torch.empty(self.group_out, self.group_in, kernel_size, kernel_size)
            base_v = torch.empty(self.group_out, self.group_out, kernel_size, kernel_size)
        nn.init.kaiming_uniform_(base_u, a=math.sqrt(5))
        nn.init.kaiming_uniform_(base_v, a=math.sqrt(5))

        self.param_u = nn.Parameter(base_u)
        self.param_v = nn.Parameter(base_v)

        # Learnable mixing coefficients used by ode_compute.
        self.alpha = nn.Parameter(torch.tensor(0.5))
        self.beta = nn.Parameter(torch.tensor(0.5))

        if bias:
            self.bias_u = nn.Parameter(torch.empty(self.group_out))
            self.bias_v = nn.Parameter(torch.empty(self.group_out))
        else:
            self.register_parameter('bias_u', None)
            self.register_parameter('bias_v', None)
        # Must run for both branches: previously it was only reached when
        # bias=False (where it is a no-op), so bias=True left the biases as
        # uninitialised memory.
        self.reset_parameters()

    def ode_compute(self, x, h):
        """Return ``alpha * relu(x + h) - beta * h``."""
        return self.alpha*F.relu(x+h)-self.beta*h

    def reset_parameters(self) -> None:
        """Initialise the biases uniformly in ``[-1/sqrt(fan_in), 1/sqrt(fan_in)]``.

        ``fan_in`` is taken from ``param_u``. Does nothing when the module was
        built without biases.
        """
        if self.bias_u is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.param_u)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias_u, -bound, bound)
            nn.init.uniform_(self.bias_v, -bound, bound)

    def forward(self,x,h=None):
        """Apply the sequential grouped convolution to ``x``.

        ``h`` is accepted but not used by this implementation.
        """
        # Pad the channel axis up to group_in * group by repeating the input.
        # NOTE(review): this only yields enough channels while
        # group_in * group <= 2 * in_channels — confirm for small inputs.
        x_dup = torch.cat([x, x], dim=1)[:, :self.group_in * self.group]
        chunks = torch.split(x_dup, self.group_in, dim=1)

        groups_u = self.group_in if self.depth else 1
        groups_v = self.group_out if self.depth else 1
        # The recurrent convolution uses stride 1 and (kernel_size - 1) // 2 padding.
        recur_pad = (self.kernel_size - 1) // 2

        outs = []
        for xp in chunks:
            o_c = F.conv2d(xp, self.param_u, self.bias_u, self.stride, self.padding, groups=groups_u)
            if not outs:
                # First chunk has no predecessor to combine with.
                outs.append(o_c)
                continue
            o_l = F.conv2d(outs[-1], self.param_v, self.bias_v, 1, recur_pad, groups=groups_v)
            outs.append(self.ode_compute(o_c, o_l) + o_l)
        # Drop the surplus channels created by rounding group_out up.
        return torch.cat(outs, dim=1)[:, :self.out_channels]

In [8]:
class BaseBlock(nn.Module):
    """Bottleneck block: 1x1 expansion, 3x3 depthwise conv, 1x1 projection.

    Both 1x1 convolutions are ``Seq_CNN`` layers; the depthwise convolution is
    a regular grouped ``nn.Conv2d``.
    """

    # Class-level width multiplier; read in __init__ to scale the channel counts.
    alpha = 1

    def __init__(self, input_channel, output_channel, t = 6, downsample = False):
        """
        Args:
            input_channel: channels entering the block, before scaling by ``alpha``.
            output_channel: channels leaving the block, before scaling by ``alpha``.
            t: expansion factor; the hidden width is ``t`` times the scaled input width.
            downsample: when true the depthwise convolution uses stride 2.
        """
        super(BaseBlock, self).__init__()
        self.downsample = downsample
        self.stride = 2 if downsample else 1
        # A residual connection is used only when spatial size and the
        # (unscaled) channel counts are both preserved.
        self.shortcut = (not downsample) and (input_channel == output_channel)

        # Apply the width multiplier.
        scaled_in = int(self.alpha * input_channel)
        scaled_out = int(self.alpha * output_channel)
        hidden = t * scaled_in

        # 1x1 point-wise expansion.
        self.conv1 = Seq_CNN(scaled_in, hidden, kernel_size=1, group=3,
                             stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden)

        # 3x3 depth-wise convolution.
        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=self.stride,
                               padding=1, groups=hidden, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)

        # 1x1 point-wise projection; a plain convolution is used only when the
        # scaled input width is zero.
        if scaled_in > 0:
            self.conv3 = Seq_CNN(hidden, scaled_out, kernel_size=1, group=4,
                                 stride=1, padding=0, bias=False)
        else:
            self.conv3 = nn.Conv2d(hidden, scaled_out, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(scaled_out)

    def forward(self, inputs):
        """Run the main path and add ``inputs`` when the shortcut is enabled."""
        out = F.relu6(self.bn1(self.conv1(inputs)), inplace=True)
        out = F.relu6(self.bn2(self.conv2(out)), inplace=True)
        out = self.bn3(self.conv3(out))

        if self.shortcut:
            return out + inputs
        return out

In [14]:
class MobileNetV2(nn.Module):
    """MobileNetV2-style classifier assembled from ``BaseBlock`` bottlenecks.

    Args:
        output_size: number of classes produced by the final linear layer.
        alpha: width multiplier applied to the stem and, through
            ``BaseBlock.alpha``, to every bottleneck.
    """

    def __init__(self, output_size, alpha = 1):
        super(MobileNetV2, self).__init__()
        self.output_size = output_size

        # Stem: 3x3 convolution with stride 1.
        stem_width = int(32 * alpha)
        self.conv0 = nn.Conv2d(3, stem_width, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn0 = nn.BatchNorm2d(stem_width)

        # The width multiplier is shared with the blocks via a class attribute,
        # so it affects every BaseBlock created afterwards.
        BaseBlock.alpha = alpha

        # (input channels, output channels, expansion factor, downsample)
        block_specs = [
            (32, 16, 1, False),
            (16, 24, 6, False),
            (24, 24, 6, False),
            (24, 32, 6, False),
            (32, 32, 6, False),
            (32, 32, 6, False),
            (32, 64, 6, True),
            (64, 64, 6, False),
            (64, 64, 6, False),
            (64, 64, 6, False),
            (64, 96, 6, False),
            (96, 96, 6, False),
            (96, 96, 6, False),
            (96, 160, 6, True),
            (160, 160, 6, False),
            (160, 160, 6, False),
            (160, 320, 6, False),
        ]
        self.bottlenecks = nn.Sequential(*[
            BaseBlock(c_in, c_out, t=expand, downsample=down)
            for c_in, c_out, expand, down in block_specs
        ])

        # Head: 1x1 convolution to 1280 channels followed by the classifier.
        self.conv1 = nn.Conv2d(int(320 * alpha), 1280, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(1280)
        self.fc = nn.Linear(1280, output_size)

        self.weights_init()

    def weights_init(self):
        """Re-initialise every ``nn.Conv2d`` and ``nn.BatchNorm2d`` submodule.

        Convolution weights are drawn from N(0, sqrt(2 / (kh * kw * out_channels)));
        batch-norm weights are set to 1 and biases to 0. Other module types
        keep their own initialisation.
        """
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Conv2d):
                kh, kw = module.kernel_size
                fan = kh * kw * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))

    def forward(self, inputs):
        """Return the class logits for a batch of images."""
        # Stem.
        out = F.relu6(self.bn0(self.conv0(inputs)), inplace=True)

        # Bottleneck stack (two of the blocks halve the spatial size).
        out = self.bottlenecks(out)

        # Final 1x1 convolution.
        out = F.relu6(self.bn1(self.conv1(out)), inplace=True)

        # Global average pooling followed by the linear classifier.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.shape[0], -1)
        return self.fc(out)

In [15]:
def train_eval_rnn(model,device, train_loader,test_loader, optimizer,criterion, epoch, eval_mode='on'):
    """Train ``model`` for one epoch, then evaluate it on ``test_loader``.

    Args:
        model: network to train; it is switched to train mode, then eval mode.
        device: device the batches are moved to.
        train_loader: loader yielding ``(data, target)`` training batches; must
            expose ``.dataset``.
        test_loader: loader yielding ``(data, target)`` evaluation batches; must
            expose ``.dataset``.
        optimizer: optimizer stepped once per training batch.
        criterion: loss function. The evaluation average assumes it returns the
            mean loss over the batch (the ``nn.CrossEntropyLoss()`` default).
        epoch: epoch index, used for logging only.
        eval_mode: unused; kept for call compatibility.

    Returns:
        ``(train_error_logs, test_loss_log, test_acc_log)``: the per-batch
        training losses, a one-element list with the per-sample average test
        loss, and a one-element list with the test accuracy in ``[0, 1]``.

    Side effects:
        Updates the module-level ``best`` (highest number of correct test
        predictions so far) and, on improvement, saves the whole model to
        ``mobilenetv2.mdl`` with ``torch.save``.
    """
    global best
    model.train()
    train_error_logs = []
    t1 = datetime.now()
    test_loss_log = []
    test_acc_log = []
    train_corrects = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output_train = model(data)
        loss = criterion(output_train, target)
        loss.backward()
        optimizer.step()
        pred_train = output_train.argmax(dim=1, keepdim=True)  # index of the highest score
        train_corrects += pred_train.eq(target.view_as(pred_train)).sum().item()
        loss_value = loss.item()
        train_error_logs.append(loss_value)

        # Progress line every 20 batches.
        if batch_idx % 20 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tTime: {:.2f}'.format(
                epoch, batch_idx * len(target), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss_value,
                (datetime.now()-t1).total_seconds()))

    print('Train Epoch: {} Accuracy: {}/{} ({:.2f}%)\n'.format(
                epoch, train_corrects, len(train_loader.dataset),
                100. * train_corrects / len(train_loader.dataset)))

    model.eval()
    test_loss = 0
    correct_test = 0
    with torch.no_grad():
        for data_test, target_test in test_loader:
            data_test, target_test = data_test.to(device), target_test.to(device)
            output_test = model(data_test)
            # criterion returns the batch mean, so weight it by the batch size
            # before accumulating; dividing the sum of batch means by the
            # number of samples would under-report the loss.
            test_loss += criterion(output_test, target_test).item() * target_test.size(0)
            pred_test = output_test.argmax(dim=1, keepdim=True)  # index of the highest score
            correct_test += pred_test.eq(target_test.view_as(pred_test)).sum().item()
    n_test = len(test_loader.dataset)
    test_loss /= n_test
    test_acc_log.append(correct_test / n_test)
    test_loss_log.append(test_loss)
    print('Test set: Average loss: {:.8f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct_test, n_test,
            100. * correct_test / n_test))
    if correct_test > best:
        best = correct_test
        # NOTE(review): this pickles the whole module object; saving
        # model.state_dict() is the more portable option, but it would change
        # the checkpoint format.
        torch.save(model,"mobilenetv2.mdl")
        print(best,"saved")
    return train_error_logs,test_loss_log,test_acc_log

In [16]:
# Build the 10-class network and move it to the configured device.
model = MobileNetV2(10, alpha = 1).to(device)
# Relative parameter saving against a reference count of 2,237,770
# (presumably the parameter count of the baseline network — confirm).
total_params = sum(p.numel() for p in model.parameters())
print(1 - total_params / 2237770)

0.4716740326306993


In [17]:
# Define the loss function (criterion) and the optimizer.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)
# Highest number of correct test predictions so far; updated by train_eval_rnn.
best = 0

In [18]:
# Histories for this run. They are appended to the experiment-level lists
# before training starts, so those lists alias them and see every epoch's
# results as soon as they are added.
train_time = []
train_loss, tests, test_acc = [], [], []
train_losses.append(train_loss)
test_results_exp.append(tests)
test_accs.append(test_acc)
ratio = 0
for epoch in range(600):
    adjust_learning_rate(optimizer, epoch)
    t1 = datetime.now()
    epoch_logs = train_eval_rnn(model, device, train_loader, val_loader,
                                optimizer, criterion, epoch, 'on')
    train_error, test_error, test_acc_this = epoch_logs
    train_loss.extend(train_error)
    tests.extend(test_error)
    test_acc.extend(test_acc_this)
    # Wall-clock seconds spent on this epoch (training plus evaluation).
    train_time.append((datetime.now() - t1).total_seconds())
    print((datetime.now() - t1).total_seconds())



KeyboardInterrupt: 

In [31]:
    
def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

def conv_3x3_bn(inp, oup, stride):
    """Return a bias-free 3x3 convolution (padding 1), BatchNorm2d and in-place ReLU6.

    :param inp: number of input channels
    :param oup: number of output channels
    :param stride: stride of the convolution
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, act)


def conv_1x1_bn(inp, oup):
    """Return a bias-free 1x1 convolution, BatchNorm2d and in-place ReLU6.

    :param inp: number of input channels
    :param oup: number of output channels
    """
    conv = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, act)

def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class InvertedResidual(nn.Module):
    """Inverted residual block: optional 1x1 expansion, 3x3 depthwise, 1x1 projection.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the depthwise convolution; must be 1 or 2.
        expand_ratio: hidden width is ``round(inp * expand_ratio)``; when it is
            1 the expansion stage is omitted and plain convolutions are used.
        celltype: ``"cnn"`` selects ``nn.Conv2d`` for the 1x1 convolutions, any
            other value selects ``Seq_CNN`` (only when ``expand_ratio != 1``).
    """

    def __init__(self, inp, oup, stride, expand_ratio,celltype):
        super(InvertedResidual, self).__init__()
        assert stride in (1, 2)

        hidden_dim = round(inp * expand_ratio)
        # The residual connection requires unchanged resolution and width.
        self.identity = stride == 1 and inp == oup

        def pointwise(c_in, c_out, group):
            """Build the 1x1 convolution selected by ``celltype``."""
            if celltype == "cnn":
                return nn.Conv2d(c_in, c_out, 1, 1, 0, bias=False)
            return Seq_CNN(c_in, c_out, kernel_size=1, group=group,
                           stride=1, padding=0, bias=False)

        stages = []
        if expand_ratio != 1:
            # pw: expand to the hidden width
            stages.append(pointwise(inp, hidden_dim, 3))
            stages.append(nn.BatchNorm2d(hidden_dim))
            stages.append(nn.ReLU6(inplace=True))

        # dw: depthwise 3x3 convolution
        stages.append(nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1,
                                groups=hidden_dim, bias=False))
        stages.append(nn.BatchNorm2d(hidden_dim))
        stages.append(nn.ReLU6(inplace=True))

        # pw-linear: project to the output width without an activation
        if expand_ratio == 1:
            stages.append(nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False))
        else:
            stages.append(pointwise(hidden_dim, oup, 4))
        stages.append(nn.BatchNorm2d(oup))

        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the block, adding ``x`` back when the residual path is active."""
        if not self.identity:
            return self.conv(x)
        return x + self.conv(x)


class MobileNetV2(nn.Module):
    """MobileNetV2 built from ``InvertedResidual`` blocks.

    The first five inverted-residual blocks use plain convolutions
    (``celltype="cnn"``); the remaining ones are built with ``celltype="seq"``.

    Args:
        num_classes: size of the classifier output.
        width_mult: multiplier applied to every channel count.
    """

    def __init__(self, num_classes=1000, width_mult=1.):
        super(MobileNetV2, self).__init__()
        # One row per stage of inverted residual blocks:
        # t = expansion ratio, c = output channels, n = repeats, s = stride of the first repeat
        self.cfgs = [
            # t, c, n, s
            [1,  16, 1, 1],
            [6,  24, 2, 2],
            [6,  32, 3, 2],
            [6,  64, 4, 2],
            [6,  96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # Channel counts are rounded to multiples of 8 (4 when width_mult == 0.1).
        divisor = 4 if width_mult == 0.1 else 8

        # First layer: 3x3 convolution with stride 2.
        input_channel = _make_divisible(32 * width_mult, divisor)
        layers = [conv_3x3_bn(3, input_channel, 2)]

        # Inverted residual blocks.
        block = InvertedResidual
        total = 0
        for t, c, n, s in self.cfgs:
            output_channel = _make_divisible(c * width_mult, divisor)
            for i in range(n):
                block_stride = s if i == 0 else 1
                celltype = "cnn" if total <= 4 else "seq"
                layers.append(block(input_channel, output_channel, block_stride, t, celltype))
                input_channel = output_channel
                total += 1
        print(total)
        self.features = nn.Sequential(*layers)

        # Head: the 1x1 convolution is widened only for width_mult > 1.
        if width_mult > 1.0:
            output_channel = _make_divisible(1280 * width_mult, divisor)
        else:
            output_channel = 1280
        self.conv = conv_1x1_bn(input_channel, output_channel)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Linear(output_channel, num_classes)

        self._initialize_weights()

    def forward(self, x):
        """Return the class logits for a batch of images."""
        feats = self.conv(self.features(x))
        pooled = self.avgpool(feats)
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)

    def _initialize_weights(self):
        """Re-initialise all Conv2d, BatchNorm2d and Linear submodules.

        Conv2d weights ~ N(0, sqrt(2 / (kh * kw * out_channels))) with zero
        bias; BatchNorm2d weight 1 and bias 0; Linear weights ~ N(0, 0.01)
        with zero bias. Other module types are left untouched.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kh, kw = module.kernel_size
                fan = kh * kw * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear):
                module.weight.data.normal_(0, 0.01)
                module.bias.data.zero_()

In [32]:
# Build the network with default arguments; this rebinds `model` and leaves
# the new instance on the CPU.
model = MobileNetV2()
# Print the relative parameter saving against a reference count of 3,504,872
# (presumably the plain-convolution baseline — confirm) and the absolute count.
total_params = sum(p.numel() for p in model.parameters())
print(1 - total_params / 3504872, total_params)

17
0.2955291947894245 2469080


In [19]:
# Display the module tree of the object currently bound to `model`.
model

MobileNetV2(
  (features): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
        (3): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
       