DiracNets
=========

In this notebook we provide DiracNet-18-0.75 (12.8M parameters) model definitions with pretrained weights.
The model was trained using the functional API of PyTorch on the ILSVRC2012 training set.

Top-1 and top-5 errors on the ILSVRC2012 validation set:
**32.29, 12.16**

We saved the weights in hdf5 format, so that they can be loaded in other frameworks
without PyTorch.

In [182]:
import hickle as hkl
import torch
import torch.nn.functional as F
from torch.autograd import Variable

# Batch normalization and dirac parameterization folded into convolutional filters

### load weights

In [183]:
# Load the exported checkpoint: a dict mapping parameter names to numpy arrays.
params = hkl.load('./diracnet-18-0.75-br-export.hkl')

# convert numpy arrays to torch Variables
# (sorted() copies the items into a list, so rebinding params[k] inside the
# loop is safe)
for k, v in sorted(params.items()):
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print('%s %s' % (k, v.shape))
    params[k] = Variable(torch.from_numpy(v), requires_grad=True)

print('\nTotal parameters: %d' % sum(v.numel() for v in params.values()))

conv.bias (48,)
conv.weight (48, 3, 7, 7)
fc.bias (1000,)
fc.weight (1000, 384)
group0.block0.conv.bias (48,)
group0.block0.conv.weight (48, 96, 3, 3)
group0.block1.conv.bias (48,)
group0.block1.conv.weight (48, 96, 3, 3)
group0.block2.conv.bias (48,)
group0.block2.conv.weight (48, 96, 3, 3)
group0.block3.conv.bias (48,)
group0.block3.conv.weight (48, 96, 3, 3)
group1.block0.conv.bias (96,)
group1.block0.conv.weight (96, 96, 3, 3)
group1.block1.conv.bias (96,)
group1.block1.conv.weight (96, 192, 3, 3)
group1.block2.conv.bias (96,)
group1.block2.conv.weight (96, 192, 3, 3)
group1.block3.conv.bias (96,)
group1.block3.conv.weight (96, 192, 3, 3)
group2.block0.conv.bias (192,)
group2.block0.conv.weight (192, 192, 3, 3)
group2.block1.conv.bias (192,)
group2.block1.conv.weight (192, 384, 3, 3)
group2.block2.conv.bias (192,)
group2.block2.conv.weight (192, 384, 3, 3)
group2.block3.conv.bias (192,)
group2.block3.conv.weight (192, 384, 3, 3)
group3.block0.conv.bias (384,)
group3.block0.conv.wei

# Functional definition

In [184]:
def define_diracnet(depth):
    """Build the forward function of the exported (folded) DiracNet.

    Args:
        depth: key into the supported configurations (18 or 34). Any other
            value raises ``KeyError``.

    Returns:
        A function ``f(inputs, params)`` that runs the network. ``params`` is
        a dict holding ``'conv.weight'``/``'conv.bias'``,
        ``'group<g>.block<i>.conv.weight'``/``'.bias'`` for every block and
        ``'fc.weight'``/``'fc.bias'``. Layer widths are taken from the shapes
        of the tensors in ``params``.
    """
    definitions = {18: [2,2,2,2], 34: [3,4,6,5]}
    blocks = definitions[depth]

    def ncrelu(x):
        # Concatenate the positive and the negative part of x along the
        # channel dimension; the channel count doubles.
        return torch.cat([x.clamp(min=0), x.clamp(max=0)], dim=1)

    def group(o, params, base, count):
        # Apply `count` successive (NCReLU -> 3x3 conv) blocks named
        # '<base>.block<i>.conv'.
        for i in range(count):
            o = F.conv2d(ncrelu(o), padding=1,
                         weight=params['%s.block%d.conv.weight' % (base, i)],
                         bias=params['%s.block%d.conv.bias' % (base, i)])
        return o

    def f(inputs, params):
        o = F.conv2d(inputs, params['conv.weight'], params['conv.bias'], padding=3, stride=2)
        o = F.max_pool2d(o, 3, 2, 1)
        # Each entry of `blocks` is doubled to get the number of conv blocks
        # in the corresponding group.
        o = group(o, params, 'group0', blocks[0] * 2)
        o = F.max_pool2d(o, 2)
        o = group(o, params, 'group1', blocks[1] * 2)
        o = F.max_pool2d(o, 2)
        o = group(o, params, 'group2', blocks[2] * 2)
        o = F.max_pool2d(o, 2)
        o = group(o, params, 'group3', blocks[3] * 2)
        o = F.relu(o)
        # Global average pooling over BOTH spatial dimensions. Using only the
        # width as a square kernel breaks on non-square inputs; for square
        # feature maps the result is unchanged.
        o = F.avg_pool2d(o, (o.size(2), o.size(3)))
        o = F.linear(o.view(o.size(0), -1), params['fc.weight'], params['fc.bias'])
        return o

    return f

In [185]:
# Smoke-test the functional definition on one random 224x224 RGB image.
inputs = torch.randn(1,3,224,224)
y = define_diracnet(18)(Variable(inputs), params)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(y)

Variable containing:
 0.3973  1.1641 -0.9560  ...   0.1217 -0.5293  1.5345
[torch.FloatTensor of size 1x1000]



# Module definition

In [186]:
from torch import nn

# ugh modules are annoying

class NCReLU(nn.Module):
    """Module form of the ``ncrelu`` helper used by the functional definition.

    The output is the input clamped from below at zero, concatenated along
    dimension 1 with the input clamped from above at zero, so dimension 1
    doubles in size.
    """

    def forward(self, x):
        positive_part = torch.clamp(x, min=0)
        negative_part = torch.clamp(x, max=0)
        return torch.cat((positive_part, negative_part), 1)

    def __repr__(self):
        # Fixed short name shown when the enclosing model is printed.
        return 'NCReLU()'


class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        batch_size = x.size(0)
        return x.view(batch_size, -1)

    def __repr__(self):
        # Fixed short name shown when the enclosing model is printed.
        return 'Flatten()'

    
# Flat Sequential whose module names mirror the keys of the exported weights
# ('conv', 'group<g>.block<i>.conv', 'fc'), so the checkpoint can be loaded
# with load_state_dict without renaming anything.
# NOTE(review): the dotted module names are accepted by the PyTorch version
# this notebook was run with; newer releases may reject '.' in module names -
# confirm against the installed version.
model = nn.Sequential()
model.add_module('conv', nn.Conv2d(3, 48, kernel_size=7, stride=2, padding=3))
model.add_module('max_pool0', nn.MaxPool2d(3, 2, 1))

# Per group: (in_channels of the first conv, in_channels of the remaining
# convs, out_channels). Every conv is preceded by an NCReLU, which doubles
# the channel count of whatever feeds it.
group_channels = [(96, 96, 48), (96, 192, 96), (192, 384, 192), (384, 768, 384)]
for g, (first_in, later_in, width) in enumerate(group_channels):
    if g > 0:
        # Groups after the first are separated by a 2x2 max pooling.
        model.add_module('max_pool%d' % g, nn.MaxPool2d(2))
    for i in range(4):
        prefix = 'group%d.block%d' % (g, i)
        in_channels = first_in if i == 0 else later_in
        model.add_module(prefix + '.ncrelu', NCReLU())
        model.add_module(prefix + '.conv', nn.Conv2d(in_channels, width, kernel_size=3, padding=1))

model.add_module('relu', nn.ReLU())
# Fixed 7x7 average pooling (the functional definition pools over the actual
# feature-map size instead).
model.add_module('avg_pool', nn.AvgPool2d(7))
model.add_module('view', Flatten())
model.add_module('fc', nn.Linear(in_features=384, out_features=1000))

In [187]:
# Copy the exported weights into the module-based model: the state dict maps
# each parameter name to the tensor wrapped by the corresponding Variable.
state = {}
for name, variable in params.items():
    state[name] = variable.data
model.load_state_dict(state)

Check against functional model:

In [188]:
# Feed the same input through both definitions and report the summed absolute
# difference of their outputs.
x = Variable(inputs)
functional_out = define_diracnet(18)(x, params)
module_out = model(x)
(module_out - functional_out).abs().sum().data[0]

1.9073486328125e-06

In [189]:
# Show the layer-by-layer structure of the module-based model.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(model)

Sequential (
  (conv): Conv2d(3, 48, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (max_pool0): MaxPool2d (size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
  (group0.block0.ncrelu): NCReLU()
  (group0.block0.conv): Conv2d(96, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (group0.block1.ncrelu): NCReLU()
  (group0.block1.conv): Conv2d(96, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (group0.block2.ncrelu): NCReLU()
  (group0.block2.conv): Conv2d(96, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (group0.block3.ncrelu): NCReLU()
  (group0.block3.conv): Conv2d(96, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (max_pool1): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
  (group1.block0.ncrelu): NCReLU()
  (group1.block0.conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (group1.block1.ncrelu): NCReLU()
  (group1.block1.conv): Conv2d(192, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (

# Original definition

In [190]:
def define_train_diracnet(depth):
    """Build the forward function of the original (unfolded) DiracNet.

    Unlike the exported definition, the convolution weights are re-assembled
    on every call from the raw filters plus per-layer ``gamma``/``beta``
    values, and each convolution is followed by batch normalization.

    Args:
        depth: key into the supported configurations (18 or 34). Any other
            value raises ``KeyError``.

    Returns:
        A function ``f(inputs, params, stats, mode)`` where

        * ``params`` holds ``'conv'``, ``'bn.weight'``/``'bn.bias'``,
          ``'fc.weight'``/``'fc.bias'`` and, for every block base name
          ``'group<g>.block<i>'``: ``'<base>.conv'``, ``'<base>.conv.gamma'``,
          ``'<base>.conv.beta'``, ``'<base>.bn.weight'``, ``'<base>.bn.bias'``;
        * ``stats`` holds ``'<bn>.running_mean'``/``'<bn>.running_var'`` for
          every batch-norm layer and one ``'eye<d0>_<d1>_<d2>_<d3>'`` tensor
          per distinct conv weight shape;
        * ``mode`` is forwarded as ``training`` to ``F.batch_norm``.
    """
    definitions = {18: [2,2,2,2], 34: [3,4,6,5]}
    blocks = definitions[depth]

    def ncrelu(x):
        # Concatenate the positive and the negative part of x along the
        # channel dimension; the channel count doubles.
        return torch.cat([x.clamp(min=0), x.clamp(max=0)], dim=1)

    def batch_norm(x, params, stats, base, mode):
        return F.batch_norm(x, weight=params[base + '.weight'],
                            bias=params[base + '.bias'],
                            running_mean=stats[base + '.running_mean'],
                            running_var=stats[base + '.running_var'],
                            training=mode)

    def block(o, params, stats, base, mode):
        name = '%s.conv' % base
        w = params[name]
        gamma = params[name + '.gamma'].expand_as(w)
        beta = params[name + '.beta'].view(-1,1,1,1).expand_as(w)
        # The tensor added to the filters is looked up by the weight's shape,
        # e.g. 'eye48_96_3_3'.
        # NOTE(review): presumably the Dirac (identity) kernel - confirm
        # against the code that wrote the checkpoint.
        eye = Variable(stats['eye' + '_'.join(map(str, w.size()))])
        # Effective filter: each output filter is flattened and normalized
        # (F.normalize defaults), scaled by its beta, plus gamma * eye.
        w = beta * F.normalize(w.view(w.size(0), -1)).view_as(w) + gamma * eye
        o = F.conv2d(ncrelu(o), w, stride=1, padding=1)
        o = batch_norm(o, params, stats, '%s.bn' % base, mode)
        return o

    def group(o, params, stats, base, mode, count):
        # Apply `count` successive blocks named '<base>.block<i>'.
        for i in range(count):
            o = block(o, params, stats, '%s.block%d' % (base, i), mode)
        return o

    def f(inputs, params, stats, mode):
        o = F.conv2d(inputs, params['conv'], padding=3, stride=2)
        o = batch_norm(o, params, stats, 'bn', mode)
        o = F.max_pool2d(o, 3, 2, 1)
        # Each entry of `blocks` is doubled to get the number of conv blocks
        # in the corresponding group.
        o = group(o, params, stats, 'group0', mode, blocks[0] * 2)
        o = F.max_pool2d(o, 2)
        o = group(o, params, stats, 'group1', mode, blocks[1] * 2)
        o = F.max_pool2d(o, 2)
        o = group(o, params, stats, 'group2', mode, blocks[2] * 2)
        o = F.max_pool2d(o, 2)
        o = group(o, params, stats, 'group3', mode, blocks[3] * 2)
        o = F.relu(o)
        # Global average pooling over BOTH spatial dimensions. Using only the
        # width as a square kernel breaks on non-square inputs; for square
        # feature maps the result is unchanged.
        o = F.avg_pool2d(o, (o.size(2), o.size(3)))
        o = o.view(o.size(0), -1)
        o = F.linear(o, params['fc.weight'], params['fc.bias'])
        return o

    return f

In [191]:
# Load the original (unfolded) checkpoint: it holds the trainable parameters
# under 'params' and further tensors (batch-norm running statistics, 'eye'
# tensors) under 'stats'.
data = hkl.load('./diracnet-18-0.75-br.hkl')
params = data['params']
stats = data['stats']

# convert numpy arrays to torch Variables
# (sorted() copies the items into a list, so rebinding entries inside the
# loops is safe)
for k, v in sorted(params.items()):
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print('%s %s' % (k, v.shape))
    params[k] = Variable(torch.from_numpy(v), requires_grad=True)
# stats stay plain tensors
for k, v in sorted(stats.items()):
    stats[k] = torch.from_numpy(v)

print('\nTotal parameters: %d' % sum(v.numel() for v in params.values()))

bn.bias (48,)
bn.weight (48,)
conv (48, 3, 7, 7)
fc.bias (1000,)
fc.weight (1000, 384)
group0.block0.bn.bias (48,)
group0.block0.bn.weight (48,)
group0.block0.conv (48, 96, 3, 3)
group0.block0.conv.beta (48,)
group0.block0.conv.gamma (1,)
group0.block1.bn.bias (48,)
group0.block1.bn.weight (48,)
group0.block1.conv (48, 96, 3, 3)
group0.block1.conv.beta (48,)
group0.block1.conv.gamma (1,)
group0.block2.bn.bias (48,)
group0.block2.bn.weight (48,)
group0.block2.conv (48, 96, 3, 3)
group0.block2.conv.beta (48,)
group0.block2.conv.gamma (1,)
group0.block3.bn.bias (48,)
group0.block3.bn.weight (48,)
group0.block3.conv (48, 96, 3, 3)
group0.block3.conv.beta (48,)
group0.block3.conv.gamma (1,)
group1.block0.bn.bias (96,)
group1.block0.bn.weight (96,)
group1.block0.conv (96, 96, 3, 3)
group1.block0.conv.beta (96,)
group1.block0.conv.gamma (1,)
group1.block1.bn.bias (96,)
group1.block1.bn.weight (96,)
group1.block1.conv (96, 192, 3, 3)
group1.block1.conv.beta (96,)
group1.block1.conv.gamma (1,)


In [192]:
# Smoke-test the original definition on a fresh random 224x224 RGB image;
# mode=False is forwarded as training=False to batch normalization.
inputs = torch.randn(1,3,224,224)
y = define_train_diracnet(18)(Variable(inputs), params, stats, False)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(y)

Variable containing:
 0.2164  1.5824 -0.9205  ...   0.5037 -0.3045  1.9326
[torch.FloatTensor of size 1x1000]



In [193]:
# Feed the same input through the module-based model (folded weights) and the
# original definition, and report the summed absolute difference of their
# outputs.
x = Variable(inputs)
train_out = define_train_diracnet(18)(x, params, stats, False)
module_out = model(x)
(module_out - train_out).abs().sum().data[0]

0.0019147731363773346