In [None]:
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
# Inject a CSS rule that sets the width of the `.container` element to 82%.
display(HTML("<style>.container { width:82% !important; }</style>"))

from tensorflow import keras

import larq
import larq_zoo

from larq.models import summary

In [None]:
# Build the BinaryResNetE18 model provided by larq_zoo and pass it to
# larq's `summary` helper (imported from `larq.models`).
model = larq_zoo.BinaryResNetE18()
summary(model)

In [None]:
# Write a diagram of the model graph to 'tmp.png', with shapes shown on each layer.
keras.utils.plot_model(model, 'tmp.png', show_shapes=True)

# General notes
Layers with float32 weights will be approximated assuming int8 quantization. Biases and output shifts/scales are approximated according to standard XS3 practices.

In [None]:
def conv2d_shallowin_cnt(output_height, output_width, C_out, K_h):
    """Estimate the instruction count of a conv2d using the shallow-input strategy.

    The cost is assembled from fixed per-step instruction counts
    (NOTE(review): the origin of the 17 + 2, 5, 2 and 8 terms is not shown
    here -- presumably the inner loops of the XS3 kernel; confirm):

    * every kernel row costs (17 + 2) + 5 + 2,
    * every group of 16 output channels adds 5 + 8 on top of its K_h rows,
    * every output pixel adds a further 5.

    Args:
        output_height: Height of the output feature map.
        output_width: Width of the output feature map.
        C_out: Number of output channels; only whole groups of 16 are
            counted (floor division).
        K_h: Number of kernel rows processed per output-channel group.

    Returns:
        Estimated instruction count for the whole layer.
    """
    row_cost = (17 + 2) + 5 + 2
    channel_group_cost = K_h * row_cost + 5 + 8
    pixel_cost = (C_out // 16) * channel_group_cost + 5
    return output_height * output_width * pixel_cost

def conv2d_deepin_cnt(output_height, output_width, C_in, C_out, K_h, K_w):
    """Estimate the instruction count of a conv2d using the deep-input strategy.

    Same cost structure as the shallow-input estimate, except that the cost
    of one kernel row scales with the kernel width and with the number of
    whole 32-channel input groups.
    (NOTE(review): the fixed 17 + 2, 5, 2 and 8 terms are taken as given;
    their derivation is not shown here.)

    Args:
        output_height: Height of the output feature map.
        output_width: Width of the output feature map.
        C_in: Number of input channels; counted in whole groups of 32.
        C_out: Number of output channels; counted in whole groups of 16.
        K_h: Kernel height.
        K_w: Kernel width.

    Returns:
        Estimated instruction count for the whole layer.
    """
    row_cost = K_w * (C_in // 32) * (17 + 2) + 5 + 2
    channel_group_cost = K_h * row_cost + 5 + 8
    pixel_cost = (C_out // 16) * channel_group_cost + 5
    return output_height * output_width * pixel_cost

def max_pool_3x3_cnt(output_height, output_width, depth, price_per_comparison=44):
    """Estimate the instruction count of a 3x3 max pooling layer.

    Args:
        output_height: Height of the output feature map.
        output_width: Width of the output feature map.
        depth: Number of channels; counted in whole groups of 32.
        price_per_comparison: Instructions per vector comparison. The
            default comes from the results of a first benchmarking run.

    Returns:
        Estimated instruction count for the whole layer.
    """
    # One output pixel is reduced from 9 input vectors, which takes
    # 4 + 2 + 1 + 1 comparisons.
    comparisons_per_vector = 4 + 2 + 1 + 1
    output_vectors = output_height * output_width * (depth // 32)
    return output_vectors * comparisons_per_vector * price_per_comparison

def add_cnt(height, width, depth, price_per_addition=9):
    """Estimate the instruction count of an element-wise tensor addition.

    Args:
        height: Height of the feature map.
        width: Width of the feature map.
        depth: Number of channels; counted in whole groups of 16.
        price_per_addition: Instructions per 16-channel vector add.

    Returns:
        Estimated instruction count for the whole layer.
    """
    # The default price of 9 follows this asm prototype for one vector add
    # (vC is assumed to be initialized with a vector of ones):
    #   VLCLRDR
    #   VSETC    (switch to 8 bit mode)
    #   VLMACC a
    #   VLMACC b
    #   VSETC    (switch to 16 bit mode)
    #   VLMUL    (scale)
    #   VDEPTH8
    #   VSTRPV   (lower half)
    #   loop
    vector_count = height * width * (depth // 16)
    return vector_count * price_per_addition

def binarize_cnt(height, width, depth, price_per_vector=5):
    """Estimate the instruction count of binarizing a tensor.

    Args:
        height: Height of the feature map.
        width: Width of the feature map.
        depth: Number of channels; counted in whole groups of 32.
        price_per_vector: Instructions per 32-channel vector.

    Returns:
        Estimated instruction count for the whole layer.
    """
    # The default price of 5 follows this asm prototype for one vector:
    #   VLDR a
    #   VLSUB zero_point_vector
    #   VDEPTH1
    #   VSTRPV
    #   loop
    vector_count = height * width * (depth // 32)
    return vector_count * price_per_vector

def ave_pool_2x2_cnt(output_height, output_width, depth, price_per_output_pixel=11):
    """Estimate the instruction count of a 2x2 average pooling layer.

    Args:
        output_height: Height of the output feature map.
        output_width: Width of the output feature map.
        depth: Number of channels; counted in whole groups of 16.
        price_per_output_pixel: Instructions per 16-channel output vector.

    Returns:
        Estimated instruction count for the whole layer.
    """
    # The default price of 11 follows this asm prototype for averaging
    # 4 elements (vC is assumed to be initialized with a vector of ones):
    #   VLCLRDR
    #   VSETC    (switch to 8 bit mode)
    #   VLMACC a
    #   VLMACC b
    #   VLMACC c
    #   VLMACC d
    #   VSETC    (switch to 16 bit mode)
    #   VLMUL/VLASHR (scale/divide)
    #   VDEPTH8
    #   VSTRPV   (lower half)
    #   loop
    output_vectors = output_height * output_width * (depth // 16)
    return output_vectors * price_per_output_pixel

def global_ave_pool_cnt(size, depth):
    """Estimate the instruction count of a global average pooling layer.

    Args:
        size: Number of spatial positions averaged per channel.
        depth: Number of channels; counted in whole groups of 16.

    Returns:
        Estimated instruction count for the whole layer.
    """
    # Prototype for average pooling a single input channel group
    # (vC is assumed to be initialized with a vector of ones):
    #   VLCLRDR
    #   VSETC    (switch to 8 bit mode)
    #   loop `size` times
    #       VLMACC
    #   VSETC    (switch to 16 bit mode)
    #   VLMUL/VLASHR (scale/divide)
    #   VDEPTH8
    #   VSTRPV   (lower half)
    #   loop (next channel group)
    # The accumulation loop is charged 2 instructions per iteration and the
    # remaining steps 7 instructions per channel group.
    cost_per_channel_group = size * 2 + 7
    return (depth // 16) * cost_per_channel_group

def mem_load_cnt(size_in_bytes):
    """Express the time to load `size_in_bytes` from DDR as an instruction count.

    Args:
        size_in_bytes: Number of bytes to load.

    Returns:
        The load time in instruction slots, truncated to an int.
    """
    # DDR access is about 800MB/s; dividing by 800e6 and multiplying by 5
    # turns this into bytes per instruction.
    # NOTE(review): the factor 5 is presumably the number of clock cycles per
    # instruction -- confirm.
    bytes_per_instruction = 800 * 2**20 / 800e6 * 5
    instruction_slots = size_in_bytes / bytes_per_instruction
    return int(instruction_slots)

def fc_cnt(N_in, N_out):
    """Estimate the instruction count of a fully connected layer.

    Assumes the standard fully connected strategy: each group of 16 outputs
    costs (17 + 2) per group of 32 inputs plus 5 + 8, and the layer as a
    whole adds a further 5.

    Args:
        N_in: Number of inputs; counted in whole groups of 32.
        N_out: Number of outputs; counted in whole groups of 16.

    Returns:
        Estimated instruction count for the whole layer.
    """
    output_group_cost = (N_in // 32) * (17 + 2) + 5 + 8
    return (N_out // 16) * output_group_cost + 5

In [None]:
def print_mem(mem):
    """Print a memory footprint breakdown in KB.

    Args:
        mem: Sequence of sizes in bytes, read by position:
            [0] input, [1] output, [2] intermediate,
            [3] weights, [4] bias/shift/scale of the layer itself,
            and optionally [5] weights, [6] bias/shift/scale of the next
            layer. If more than 5 entries are given, entry [6] must be
            present as well.

    The TOTAL line sums every entry of `mem`.
    """
    # Header previously read "fooptrint"; corrected spelling.
    print("Data memory footprint:")
    print(f"  Input:            {mem[0] / 1024: >6.1f} KB")
    print(f"  Output:           {mem[1] / 1024: >6.1f} KB")
    print(f"  Intermediate:     {mem[2] / 1024: >6.1f} KB")
    print("Layer parameters:")
    print(f"  Weights:          {mem[3] / 1024: >6.1f} KB")
    print(f"  Bias/shift/scale  {mem[4] / 1024: >6.1f} KB")
    if len(mem) > 5:
        # Parameters of the following layer that are preloaded alongside this one.
        print("Next layer's parameters:")
        print(f"  Weights:          {mem[5] / 1024: >6.1f} KB")
        print(f"  Bias/shift/scale  {mem[6] / 1024: >6.1f} KB")
    print(f"TOTAL:              {sum(mem) / 1024: >6.1f} KB")

### Stem
This is the sequence of operators before the first residual connection.
Input is 224x224x3 and output is 56x56x64.
The convolution, the activation, and the two batchnorms can be fused (second batchnorm normally commutes with maxpool).

NOTE: conv2d is 7x7 with 2x2 strides, without biases.
Since input is shallow, kernel tensor is zero padded for fast access (normal XS3 practice).

NOTE: maxpool2d is 3x3 with 2x2 strides.

NOTE: The intermediate output of the conv2d does not fit in memory, so a special implementation will be needed.
(The pooling needs to happen in parallel with the conv2d.)

In [None]:
# Stem memory footprint in bytes, in the positional order read by print_mem.
stem_mem = [
    224**2 * 3 * 4/3,   # input
    56**2 * 64,         # output
    3 * 3 * 64,         # intermediate
    9408 * 8/7 * 4/3,   # weights
    64 * 2 * 4,         # bias/shift/scale
    36864 * 4/3 / 8,    # next layer's weights
    64 * 2 * 4,         # next layer's bias/shift/scale
]
print_mem(stem_mem)

In [None]:
# Instruction counts of the stem: [0] 7x7 conv2d, [1] 3x3 maxpool.
stem_cnt = [
    conv2d_shallowin_cnt(output_height=112, output_width=112, C_out=64, K_h=7),
    max_pool_3x3_cnt(output_height=56, output_width=56, depth=64),
]
print(
    "Instruction count estimates:",
    f"  7x7 conv2d:  {stem_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 maxpool: {stem_cnt[1] * 1e-6: >1.3f} M",
    sep="\n",
)

### 1st residual block

There will be 4 of these blocks, each identical except the last one, which needs to preload more weights.
This changes the memory footprint for the fourth layer, but not the computational cost.
This larger memory footprint is shown below.

NOTE: the binary conv2d is 3x3 with 1x1 strides, without biases.
Since a single row of the kernel weights is only $3*64 < 256$ bits wide, we apply some zero padding in the width direction.

In [None]:
# Memory footprint in bytes of a 1st-stage residual block, in the positional
# order read by print_mem.
res_block_1_mem = [
    56**2 * 64,         # input
    56**2 * 64,         # output
    56**2 * 64 / 8,     # intermediate (1/8 of the input size)
    36864 * 4/3 / 8,    # weights
    64 * 2 * 4,         # bias/shift/scale
    8192,               # next layer's weights
    128 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(res_block_1_mem)

In [None]:
# Instruction counts of a 1st-stage residual block:
# [0] binarization, [1] 3x3 conv2d, [2] addition.
res_block_1_cnt = [
    binarize_cnt(height=56, width=56, depth=64),
    conv2d_shallowin_cnt(output_height=56, output_width=56, C_out=64, K_h=3),
    add_cnt(height=56, width=56, depth=64),
]
print(
    "Instruction count estimates:",
    f"  binarization:  {res_block_1_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 conv2d:    {res_block_1_cnt[1] * 1e-6: >1.3f} M",
    f"  addition:      {res_block_1_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

### 1st downsampling block, pooling branch

This can be executed first, and its result offloaded to flash temporarily, then loaded back when doing the addition with the result of the conv branch.

NOTE: the only intermediate is the output of the average pooling.

NOTE: the conv2d kernel is 1x1, and deepin/deepout strategy can be used.

In [None]:
# Memory footprint in bytes of the 1st downsampling block's pooling branch,
# in the positional order read by print_mem.
ds_block_1_pool_mem = [
    56**2 * 64,         # input
    28**2 * 128,        # output
    28**2 * 64,         # intermediate
    8192,               # weights
    128 * 2 * 4,        # bias/shift/scale
    73728 * 4/3 / 8,    # next layer's weights
    128 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(ds_block_1_pool_mem)

In [None]:
# Instruction counts of the 1st downsampling block's pooling branch:
# [0] average pooling, [1] 1x1 conv2d.
ds_block_1_pool_cnt = [
    ave_pool_2x2_cnt(output_height=28, output_width=28, depth=64),
    conv2d_deepin_cnt(output_height=28, output_width=28, C_in=64, C_out=128, K_h=1, K_w=1),
]
print(
    "Instruction count estimates:",
    f"  Ave pooling:   {ds_block_1_pool_cnt[0] * 1e-6: >1.3f} M",
    f"  1x1 conv2d:    {ds_block_1_pool_cnt[1] * 1e-6: >1.3f} M",
    sep="\n",
)

### 1st downsampling block, binarized conv2d branch

NOTE: the binary conv2d is 3x3 with 2x2 strides, without biases.
Since a single row of the kernel weights is only $3*64 < 256$ bits wide, we apply some zero padding in the width direction.

NOTE: the only intermediate is the binarized input (1 bit per element).

In [None]:
# Memory footprint in bytes of the 1st downsampling block's binarized conv2d
# branch, in the positional order read by print_mem.
ds_block_1_conv_mem = [
    56**2 * 64,         # input
    28**2 * 128,        # output
    56**2 * 64 / 8,     # intermediate (1/8 of the input size)
    73728 * 4/3 / 8,    # weights
    128 * 2 * 4,        # bias/shift/scale
    147456 * 4/3 / 8,   # next layer's weights
    128 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(ds_block_1_conv_mem)

In [None]:
# Instruction counts of the 1st downsampling block's binarized conv2d branch:
# [0] binarization, [1] 3x3 conv2d, [2] addition.
ds_block_1_conv_cnt = [
    binarize_cnt(height=56, width=56, depth=64),
    conv2d_shallowin_cnt(output_height=28, output_width=28, C_out=128, K_h=3),
    add_cnt(height=28, width=28, depth=128),
]
print(
    "Instruction count estimates:",
    f"  binarization:  {ds_block_1_conv_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 conv2d:    {ds_block_1_conv_cnt[1] * 1e-6: >1.3f} M",
    f"  addition:      {ds_block_1_conv_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

### 2nd residual block

There will be 3 of these blocks, each identical except the last one, which needs to preload more weights.
This changes the memory footprint for the third layer, but not the computational cost.
This larger memory footprint is shown below.

NOTE: the binary conv2d is 3x3 with 1x1 strides, without biases.
Since a single row of the kernel weights is only $3*128$ bits wide, we apply 128 bits of zero padding in the width direction and compute two horizontally adjacent pixels at a time.
The 3x3 kernel will therefore be modeled as a 6 x 2 shallowin convolution.

In [None]:
# Memory footprint in bytes of a 2nd-stage residual block, in the positional
# order read by print_mem.
res_block_2_mem = [
    28**2 * 128,        # input
    28**2 * 128,        # output
    28**2 * 128 / 8,    # intermediate (1/8 of the input size)
    147456 * 4/3 / 8,   # weights
    128 * 2 * 4,        # bias/shift/scale
    32768,              # next layer's weights
    256 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(res_block_2_mem)

In [None]:
# Instruction counts of a 2nd-stage residual block:
# [0] binarization, [1] 3x3 conv2d (modeled with K_h=6), [2] addition.
res_block_2_cnt = [
    binarize_cnt(height=28, width=28, depth=128),
    conv2d_shallowin_cnt(output_height=28, output_width=28, C_out=128, K_h=6),
    add_cnt(height=28, width=28, depth=128),
]
print(
    "Instruction count estimates:",
    f"  binarization:  {res_block_2_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 conv2d:    {res_block_2_cnt[1] * 1e-6: >1.3f} M",
    f"  addition:      {res_block_2_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

### 2nd downsampling block, pooling branch

This can be executed first, and its result offloaded to flash temporarily, then loaded back when doing the addition with the result of the conv branch.

NOTE: the only intermediate is the output of the average pooling.

NOTE: the conv2d kernel is 1x1, and deepin/deepout strategy can be used.

In [None]:
# Memory footprint in bytes of the 2nd downsampling block's pooling branch,
# in the positional order read by print_mem.
ds_block_2_pool_mem = [
    28**2 * 128,        # input
    14**2 * 256,        # output
    14**2 * 128,        # intermediate
    32768,              # weights
    256 * 2 * 4,        # bias/shift/scale
    294912 * 4/3 / 8,   # next layer's weights
    256 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(ds_block_2_pool_mem)

In [None]:
# Instruction counts of the 2nd downsampling block's pooling branch:
# [0] average pooling, [1] 1x1 conv2d.
ds_block_2_pool_cnt = [
    ave_pool_2x2_cnt(output_height=14, output_width=14, depth=128),
    conv2d_deepin_cnt(output_height=14, output_width=14, C_in=128, C_out=256, K_h=1, K_w=1),
]
print(
    "Instruction count estimates:",
    f"  Ave pooling:   {ds_block_2_pool_cnt[0] * 1e-6: >1.3f} M",
    f"  1x1 conv2d:    {ds_block_2_pool_cnt[1] * 1e-6: >1.3f} M",
    sep="\n",
)

### 2nd downsampling block, binarized conv2d branch

NOTE: the binary conv2d is 3x3 with 2x2 strides, without biases.
Since a single row of the kernel weights is only $3*128$ bits wide, we apply 128 bits of zero padding in the width direction and compute two horizontally adjacent pixels at a time.
The 3x3 kernel will therefore be modeled as a 6 x 2 shallowin convolution.

NOTE: the only intermediate is the binarized input (1 bit per element).

In [None]:
# Memory footprint in bytes of the 2nd downsampling block's binarized conv2d
# branch, in the positional order read by print_mem.
ds_block_2_conv_mem = [
    28**2 * 128,        # input
    14**2 * 256,        # output
    28**2 * 128 / 8,    # intermediate (1/8 of the input size)
    294912 * 4/3 / 8,   # weights
    256 * 2 * 4,        # bias/shift/scale
    589824 / 8,         # next layer's weights
    256 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(ds_block_2_conv_mem)

In [None]:
# Instruction counts of the 2nd downsampling block's binarized conv2d branch:
# [0] binarization, [1] 3x3 conv2d (modeled with K_h=6), [2] addition.
ds_block_2_conv_cnt = [
    binarize_cnt(height=28, width=28, depth=128),
    conv2d_shallowin_cnt(output_height=14, output_width=14, C_out=256, K_h=6),
    add_cnt(height=14, width=14, depth=256),
]
print(
    "Instruction count estimates:",
    f"  binarization:  {ds_block_2_conv_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 conv2d:    {ds_block_2_conv_cnt[1] * 1e-6: >1.3f} M",
    f"  addition:      {ds_block_2_conv_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

### 3rd residual block

There will be 3 of these blocks, each identical except the last one, which needs to preload more weights.
This changes the memory footprint for the third layer, but not the computational cost.
This larger memory footprint is shown below.

NOTE: the binary conv2d is 3x3 with 1x1 strides, without biases. Since the number of input channels is $1 * 256$, the instruction count is modeled using deepin_deepout strategy with $1 * 32$ inputs.

In [None]:
# Memory footprint in bytes of a 3rd-stage residual block, in the positional
# order read by print_mem.
res_block_3_mem = [
    14**2 * 256,        # input
    14**2 * 256,        # output
    14**2 * 256 / 8,    # intermediate (1/8 of the input size)
    589824 / 8,         # weights
    256 * 2 * 4,        # bias/shift/scale
    131072,             # next layer's weights
    512 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(res_block_3_mem)

In [None]:
# Instruction counts of a 3rd-stage residual block:
# [0] binarization, [1] 3x3 conv2d (deep-input model, C_in=32), [2] addition.
res_block_3_cnt = [
    binarize_cnt(height=14, width=14, depth=256),
    conv2d_deepin_cnt(output_height=14, output_width=14, C_in=32, C_out=256, K_h=3, K_w=3),
    add_cnt(height=14, width=14, depth=256),
]
print(
    "Instruction count estimates:",
    f"  binarization:  {res_block_3_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 conv2d:    {res_block_3_cnt[1] * 1e-6: >1.3f} M",
    f"  addition:      {res_block_3_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

### 3rd downsampling block, pooling branch

This can be executed first, and its result offloaded to flash temporarily, then loaded back when doing the addition with the result of the conv branch.

NOTE: the only intermediate is the output of the average pooling.

NOTE: the conv2d kernel is 1x1, and deepin/deepout strategy can be used.

In [None]:
# Memory footprint in bytes of the 3rd downsampling block's pooling branch,
# in the positional order read by print_mem.
ds_block_3_pool_mem = [
    14**2 * 256,        # input
    7**2 * 512,         # output
    7**2 * 256,         # intermediate
    131072,             # weights
    512 * 2 * 4,        # bias/shift/scale
    1179648 / 8,        # next layer's weights
    512 * 2 * 4,        # next layer's bias/shift/scale
]
print_mem(ds_block_3_pool_mem)

In [None]:
# Instruction counts of the 3rd downsampling block's pooling branch:
# [0] average pooling, [1] 1x1 conv2d.
ds_block_3_pool_cnt = [
    ave_pool_2x2_cnt(output_height=7, output_width=7, depth=256),
    conv2d_deepin_cnt(output_height=7, output_width=7, C_in=256, C_out=512, K_h=1, K_w=1),
]
print(
    "Instruction count estimates:",
    f"  Ave pooling:   {ds_block_3_pool_cnt[0] * 1e-6: >1.3f} M",
    f"  1x1 conv2d:    {ds_block_3_pool_cnt[1] * 1e-6: >1.3f} M",
    sep="\n",
)

### 3rd downsampling block, binarized conv2d branch

NOTE: the binary conv2d is 3x3 with 2x2 strides, without biases.
Since the number of input channels is $1 ∗ 256$, the instruction count is modeled using deepin_deepout strategy with $1∗32$ inputs.

NOTE: the only intermediate is the binarized input (1 bit per element).

NOTE: too many weights in the next layer, so no pre-loading.

In [None]:
# Memory footprint in bytes of the 3rd downsampling block's binarized conv2d
# branch, in the positional order read by print_mem. No next-layer entries
# are listed for this block.
ds_block_3_conv_mem = [
    14**2 * 256,        # input
    7**2 * 512,         # output
    14**2 * 256 / 8,    # intermediate (1/8 of the input size)
    1179648 / 8,        # weights
    512 * 2 * 4,        # bias/shift/scale
]
print_mem(ds_block_3_conv_mem)

In [None]:
# Instruction counts of the 3rd downsampling block's binarized conv2d branch:
# [0] binarization, [1] 3x3 conv2d (deep-input model, C_in=32), [2] addition.
ds_block_3_conv_cnt = [
    binarize_cnt(height=14, width=14, depth=256),
    conv2d_deepin_cnt(output_height=7, output_width=7, C_in=32, C_out=512, K_h=3, K_w=3),
    add_cnt(height=7, width=7, depth=512),
]
print(
    "Instruction count estimates:",
    f"  binarization:  {ds_block_3_conv_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 conv2d:    {ds_block_3_conv_cnt[1] * 1e-6: >1.3f} M",
    f"  addition:      {ds_block_3_conv_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

### 4th residual block

There will be 3 of these blocks, each identical.
The last one will fuse the nonlinear activation that follows the last Add layer.

NOTE: the binary conv2d is 3x3 with 1x1 strides, without biases.
Since the number of input channels is $1 * 512$, the instruction count is modeled using deepin_deepout strategy with $2 * 32$ inputs.

NOTE: too many weights in the next layer, so no pre-loading.
The weight loading is estimated separately, since it cannot be parallelized.

In [None]:
# Memory footprint in bytes of a 4th-stage residual block, in the positional
# order read by print_mem. No next-layer entries are listed for this block.
res_block_4_mem = [
    7**2 * 512,         # input
    7**2 * 512,         # output
    # Intermediate: 1/8 of this block's own 7x7x512 input, matching the
    # pattern of the other residual blocks. The previous value
    # (14**2 * 256 / 8) was the size for the preceding downsampling block.
    7**2 * 512 / 8,
    2359296 / 8,        # weights
    512 * 2 * 4,        # bias/shift/scale
]
print_mem(res_block_4_mem)

In [None]:
# Instruction counts of a 4th-stage residual block:
# [0] binarization, [1] 3x3 conv2d (deep-input model, C_in=64), [2] addition.
res_block_4_cnt = [
    binarize_cnt(height=7, width=7, depth=512),
    conv2d_deepin_cnt(output_height=7, output_width=7, C_in=64, C_out=512, K_h=3, K_w=3),
    add_cnt(height=7, width=7, depth=512),
]
# Cost of loading this block's weights, kept in a separate list from the
# compute counts.
res_block_4_load = [mem_load_cnt(2359296 // 8)]
print(
    "Instruction count estimates:",
    f"  weight load:   {res_block_4_load[0] * 1e-6: >1.3f} M",
    f"  binarization:  {res_block_4_cnt[0] * 1e-6: >1.3f} M",
    f"  3x3 conv2d:    {res_block_4_cnt[1] * 1e-6: >1.3f} M",
    f"  addition:      {res_block_4_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

### Final layers

We split the final fully connected layer into two parts, executing sequentially.
No preloading of weights, so loading penalty is added.

NOTE: The fully connected layer is approximated by two 512x512 layers.
This will probably be faster, even with 12 zero padding channels.

NOTE: The only intermediate is the output of the global average pooling

In [None]:
# Memory footprint in bytes of one half of the final layers, in the
# positional order read by print_mem.
# NOTE(review): entries [1] and [2] are printed as "Output" and
# "Intermediate" respectively -- confirm that 512 / 1000 are in the intended
# order.
final_mem = [
    7**2 * 512,         # input
    512,                # output
    1000,               # intermediate
    500*512,            # weights
    500 * 2 * 4,        # bias/shift/scale
]
print_mem(final_mem)

In [None]:
# Instruction counts of the final layers:
# [0] global average pooling, [1] first FC half, [2] second FC half.
final_cnt = [
    global_ave_pool_cnt(7*7, 512),
    fc_cnt(512, 512),
    fc_cnt(512, 512),
]
# Weight-load cost of each FC half, kept in a separate list.
final_load = [
    mem_load_cnt(512*512),
    mem_load_cnt(512*512),
]
print(
    "Instruction count estimates:",
    f"  global ave pooling: {final_cnt[0] * 1e-6: >1.3f} M",
    f"  weight load:        {final_load[0] * 1e-6: >1.3f} M",
    f"  FC first half:      {final_cnt[1] * 1e-6: >1.3f} M",
    f"  weight load:        {final_load[1] * 1e-6: >1.3f} M",
    f"  FC second half:     {final_cnt[2] * 1e-6: >1.3f} M",
    sep="\n",
)

# Speed estimates

In [None]:
# Compute (non-prefetch) instruction counts, one list per executed block.
# The positions matter: a later cell indexes cnt_list by position
# (cnt_list[0], cnt_list[1:6], cnt_list[7:11]).
#
# final_load is deliberately NOT listed here: it is a weight-load cost and is
# already part of load_list below. Listing it in both places counted it twice
# and inflated the "non-prefetch" total.
cnt_list = [
    stem_cnt,
    res_block_1_cnt,
    res_block_1_cnt,
    res_block_1_cnt,
    res_block_1_cnt,
    ds_block_1_conv_cnt,
    ds_block_1_pool_cnt,
    res_block_2_cnt,
    res_block_2_cnt,
    res_block_2_cnt,
    ds_block_2_conv_cnt,
    ds_block_2_pool_cnt,
    res_block_3_cnt,
    res_block_3_cnt,
    res_block_3_cnt,
    ds_block_3_conv_cnt,
    ds_block_3_pool_cnt,
    res_block_4_cnt,
    res_block_4_cnt,
    res_block_4_cnt,
    final_cnt
]
# Flatten the per-block lists and add everything up.
cnt_sum = sum(d for c in cnt_list for d in c)

# Weight-load costs that are accounted for separately from the compute counts:
# one load per 4th-stage residual block plus the loads of the final layers.
load_list = [res_block_4_load, res_block_4_load, res_block_4_load, final_load]
load_sum = sum(d for c in load_list for d in c)
cnt_list

In [None]:
# Parameters of the speed estimate.
NUM_CORES = 4
IBUFFER_FACTOR = 5/4
CLOCK_600 = 600e6
CLOCK_800 = 800e6
NOMINAL_MACS = 1.81e9

# Compute instructions are divided by NUM_CORES while the load instructions
# are not; the sum is scaled by 5 and by IBUFFER_FACTOR to obtain cycles.
total_cycles = 5 * (cnt_sum / NUM_CORES + load_sum) * IBUFFER_FACTOR
total_time_800 = total_cycles / CLOCK_800
print(
    f"Total non-prefetch instructions: {cnt_sum * 1e-6: 3.2f}M",
    f"Total prefetch instructions:     {load_sum * 1e-6: 3.2f}M",
    f"Total clock cycles:              {total_cycles * 1e-6: 3.2f}M",
    f"Total time of execution @{CLOCK_800 * 1e-6:3.0f}MHz: {total_time_800 * 1e3: 3.2f}ms",
    f"Total MAC/s:                     {NOMINAL_MACS / total_time_800 * 1e-9: 3.2f}GMAC/s",
    f"Nominal binary MAC utilization:  {(NOMINAL_MACS / total_time_800) / (CLOCK_800*256): 3.2%}",
    sep="\n",
)

In [None]:
# Share of the total taken by the first entry of the first list in cnt_list.
print(f"Proportion of input layer conv instructions: {cnt_list[0][0] / cnt_sum:.2%}")
# Sums entry [1] of the lists at positions 1-5 and 7-10 of cnt_list and
# reports that sum as a share of the total. This relies on the ordering of
# cnt_list.
print(f"Proportion of binary instructions with suboptimal input channels: "
      f"{sum(c[1] for c in (cnt_list[1:6] + cnt_list[7:11])) / cnt_sum:.2%}")

# Energy estimates

In [None]:
# Power figure used for the efficiency estimate.
# NOTE(review): presumably watts (i.e. 100 mW) -- the source of this value is not shown here.
WATTAGE = 100e-3

# Counts 2 operations per MAC, scales the rate by 1e-9 and divides by WATTAGE.
print(f"Energy efficiency of the chip:  {2 * NOMINAL_MACS / total_time_800 * 1e-9 / WATTAGE: 3.2f}GOp/s/W")

# Memory estimates

In [None]:
# Memory footprint lists of every executed block; repeated blocks appear once
# per repetition, and final_mem appears twice.
mem_list = (
    [stem_mem]
    + [res_block_1_mem] * 4
    + [ds_block_1_pool_mem, ds_block_1_conv_mem]
    + [res_block_2_mem] * 3
    + [ds_block_2_pool_mem, ds_block_2_conv_mem]
    + [res_block_3_mem] * 3
    + [ds_block_3_pool_mem, ds_block_3_conv_mem]
    + [res_block_4_mem] * 3
    + [final_mem] * 2
)

# Only entries [3] and [4] of each list (weights and bias/shift/scale of the
# layer itself) are summed.
mem_sum = sum(nbytes for layer_mem in mem_list for nbytes in layer_mem[3:5])
print(f"Total memory for weights and parameters: {mem_sum / 2**20 :.2f}MB")