In [1]:
import ofa.model_zoo as ofa
import numpy as np
import torch
import os

# Local path of the pretrained OFA MobileNetV3 supernet checkpoint; the loader
# defined below reads it and triggers a download when the file is missing.
# The name mirrors the search space used there: depth {2,3,4}, expand ratio
# {3,4,6}, kernel size {3,5,7}, width multiplier 1.0.
OFA_PATH = 'ofa_nets/ofa_mbv3_d234_e346_k357_w1.0'

Define a primitive to load the OFA net:

In [2]:
def download_weights():
    """Download the pretrained OFA supernet checkpoint and store it at ``OFA_PATH``.

    The file is first written to ``OFA_PATH + ".part"`` and only moved into
    place once the transfer has finished, so an interrupted download is never
    mistaken for a valid checkpoint by a later ``os.path.isfile(OFA_PATH)``.

    Raises:
        urllib.error.URLError: if the checkpoint cannot be fetched.
        OSError: if the target directory or file cannot be written.
    """
    # Function-scope import: the notebook's import cell does not provide a
    # download helper, and the standard library is enough for a plain HTTPS file.
    import urllib.request

    url_base = "https://raw.githubusercontent.com/han-cai/files/master/ofa/ofa_nets/"
    # `url_base` is only the remote directory; the file itself must be named.
    # NOTE(review): assumes the remote file name equals the local one (the net
    # id), which is how ofa.model_zoo builds its URL -- confirm.
    url = url_base + os.path.basename(OFA_PATH)

    # Make sure the local target directory exists before writing into it.
    target_dir = os.path.dirname(OFA_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    print("Downloading", url, "->", OFA_PATH)
    partial_path = OFA_PATH + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, OFA_PATH)

def ofa_mobilenet( weights_path: str | None = None ):
    """Build the OFA MobileNetV3 supernet and load pretrained weights into it.

    Args:
        weights_path: Checkpoint file to load. ``None`` selects the default
            ``OFA_PATH``, which is downloaded first when it is not on disk.

    Returns:
        The ``ofa.OFAMobileNetV3`` instance after ``load_state_dict`` has been
        called with the checkpoint's ``"state_dict"`` entry.

    Raises:
        FileNotFoundError: if a custom ``weights_path`` does not exist (only
            the default checkpoint can be downloaded automatically).
    """
    # Honour the caller's path; previously the argument was ignored and
    # OFA_PATH was always loaded.
    checkpoint_path = OFA_PATH if weights_path is None else weights_path

    if not os.path.isfile(checkpoint_path):
        if checkpoint_path != OFA_PATH:
            # download_weights() only knows how to fetch the default file, so
            # loading something else silently would be misleading.
            raise FileNotFoundError(f"OFA weights not found at: {checkpoint_path}")
        download_weights()

    network = ofa.OFAMobileNetV3(
        dropout_rate=0,
        width_mult=1.0,
        ks_list=[3, 5, 7],
        expand_ratio_list=[3, 4, 6],
        depth_list=[2, 3, 4],
    )

    # NOTE: torch.load unpickles the file -- only load checkpoints from a
    # trusted source.
    init_weights = torch.load(checkpoint_path, map_location="cpu")["state_dict"]
    network.load_state_dict(init_weights)
    return network

## 1. Basic Class exploration:

Load the OFA network and try to understand the different sampling strategies. Some checks:
- **First Layer doesn't change** when changing archs.
- **First Block doesn't change** when changing archs.
- The rest of the **blocks (max: 20)** change depending on the parameters.

In [3]:
# Build the supernet once; the sub-networks in the following cells are all
# sampled from this single instance.
ofa_network = ofa_mobilenet(weights_path=OFA_PATH)

Check how the models change ( compare minimal vs maximal network ):

In [4]:
# Largest configuration of the supernet.
ofa_network.set_max_net()
max_network = ofa_network.get_active_subnet(preserve_weight=True)

# Smallest configuration: the minimum kernel size, expand ratio and depth
# among the lists the supernet was built with.
ofa_network.set_active_subnet(ks=3, e=3, d=2)
min_network = ofa_network.get_active_subnet(preserve_weight=True)

# Report both block counts together, largest first.
print("Number of Blocks (MAX):", len(max_network.blocks))
print("Number of Blocks (MIN):", len(min_network.blocks))

Number of Blocks (MAX): 21
Number of Blocks (MIN): 11


Compare the blocks:

In [5]:
# Config of block 1 in the minimal sub-network (displayed as the cell output).
min_network.config['blocks'][1]

{'name': 'ResidualBlock',
 'conv': {'name': 'MBConvLayer',
  'in_channels': 16,
  'out_channels': 24,
  'kernel_size': 3,
  'stride': 2,
  'expand_ratio': 3,
  'mid_channels': 48,
  'act_func': 'relu',
  'use_se': False,
  'groups': None},
 'shortcut': None}

In [6]:
# Config of block 1 in the maximal sub-network, for comparison with the minimal one.
max_network.config['blocks'][1]

{'name': 'ResidualBlock',
 'conv': {'name': 'MBConvLayer',
  'in_channels': 16,
  'out_channels': 24,
  'kernel_size': 7,
  'stride': 2,
  'expand_ratio': 6,
  'mid_channels': 96,
  'act_func': 'relu',
  'use_se': False,
  'groups': None},
 'shortcut': None}

Check **how an "expansion" inside** the OFA could look.
> Here we used a "Wider" transformation from one to the other.

In [7]:
# Baseline: kernel size 3, expand ratio 3 for every listed entry, depth 1 for every listed entry.
ofa_network.set_active_subnet(ks=3, e=[3, 3, 3, 3, 3], d=[1, 1, 1, 1, 1])
base_network = ofa_network.get_active_subnet(preserve_weight=True)

# Same setting, except the first expand ratio goes from 3 to 4 (the "wider" transformation).
# NOTE(review): whether each entry of `e` addresses a stage or a single block is
# not visible here -- confirm against OFAMobileNetV3.set_active_subnet.
ofa_network.set_active_subnet(ks=3, e=[4, 3, 3, 3, 3], d=[1, 1, 1, 1, 1])
expanded_network = ofa_network.get_active_subnet(preserve_weight=True)

They differ (dimension-wise) only in **block 1**:

In [8]:
# Display block 1 of both networks side by side (as a tuple) to compare channel sizes.
base_network.blocks[1], expanded_network.blocks[1]

(ResidualBlock(
   (conv): MBConvLayer(
     (inverted_bottleneck): Sequential(
       (conv): Conv2d(16, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
       (bn): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       (act): ReLU(inplace=True)
     )
     (depth_conv): Sequential(
       (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
       (bn): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       (act): ReLU(inplace=True)
     )
     (point_linear): Sequential(
       (conv): Conv2d(48, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
       (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     )
   )
 ),
 ResidualBlock(
   (conv): MBConvLayer(
     (inverted_bottleneck): Sequential(
       (conv): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
       (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True,

Checking the weights on this block:

In [9]:
# Keep a handle on block 1 of each network; this is the pair inspected below.
base_block, expanded_block = base_network.blocks[1], expanded_network.blocks[1]

print(base_block.conv.inverted_bottleneck)

Sequential(
  (conv): Conv2d(16, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): ReLU(inplace=True)
)


Starting with the first layer, we check the **shapes of these weights and whether their values match**.

In [33]:
base_layer = base_block
expanded_layer = expanded_block

# Walk both blocks' parameters in lockstep and check that the base tensor is
# contained in the leading entries of the expanded tensor.
for (bname, bparam), (ename, eparam) in zip( base_layer.named_parameters(), expanded_layer.named_parameters() ):
    print( "Base Layer:", bname, "| Size:", bparam.shape)

    # Crop the expanded tensor to the base tensor's extent along every axis.
    # This covers any differing axis (or several at once), not just axis 0 or 1.
    shared_part = eparam[ tuple( slice(0, size) for size in bparam.shape ) ]

    print( " - Expanded:", ename, "| Size:", eparam.shape)
    print( " - Match?:", (shared_part == bparam).all() )
    print()

Base Layer: conv.inverted_bottleneck.conv.weight | Size: torch.Size([48, 16, 1, 1])
 - Expanded: conv.inverted_bottleneck.conv.weight | Size: torch.Size([64, 16, 1, 1])
 - Match?: tensor(True)

Base Layer: conv.inverted_bottleneck.bn.weight | Size: torch.Size([48])
 - Expanded: conv.inverted_bottleneck.bn.weight | Size: torch.Size([64])
 - Match?: tensor(True)

Base Layer: conv.inverted_bottleneck.bn.bias | Size: torch.Size([48])
 - Expanded: conv.inverted_bottleneck.bn.bias | Size: torch.Size([64])
 - Match?: tensor(True)

Base Layer: conv.depth_conv.conv.weight | Size: torch.Size([48, 1, 3, 3])
 - Expanded: conv.depth_conv.conv.weight | Size: torch.Size([64, 1, 3, 3])
 - Match?: tensor(True)

Base Layer: conv.depth_conv.bn.weight | Size: torch.Size([48])
 - Expanded: conv.depth_conv.bn.weight | Size: torch.Size([64])
 - Match?: tensor(True)

Base Layer: conv.depth_conv.bn.bias | Size: torch.Size([48])
 - Expanded: conv.depth_conv.bn.bias | Size: torch.Size([64])
 - Match?: tensor(Tru

## 2. Understanding the model layers:

Interesting to understand all the "types" of blocks we have in the network.

Basically, there are a few blocks that are present with the same architecture in all the models:

1. **First Conv:**

```
ConvLayer(
  (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): Hswish()
)
```

2. **Final Expand Layer:**

```
ConvLayer(
  (conv): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): Hswish() <CHECK>
)
```

3. **Feature Mix Layer**:

```
ConvLayer(
  (conv): Conv2d(960, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (act): Hswish()
)
```

4. **Classifier:**

```
(classifier): LinearLayer(
    (linear): Linear(in_features=1280, out_features=1000, bias=True)
  )
```

Then, the intermediate layers (`blocks`) are basically built from `ResidualBlock`s, with or without a `shortcut`. Their shape is similar to:

```
ResidualBlock(
    (conv): Sequential(...)
    (shortcut): <optional>
)
```

Here are the three main blocks for `conv`:

1. **InvertedBottleNeck**: this one is optional and always goes at the beginning.
```
(inverted_bottleneck): Sequential(
          (conv): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): ReLU(inplace=True)
        )
```

2. **DepthConv**: always present in a similar form; it may end with an optional `SqueezeExcitation` (`se`) layer.
```
(depth_conv): Sequential(
          (conv): Conv2d(144, 144, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), groups=144, bias=False)
          (bn): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): ReLU(inplace=True)
          (se): SE(channel=144, reduction=4) <OPTIONAL>
        )
```

3. **PointLinear**: Always included with the same architecture.

```
(point_linear): Sequential(
          (conv): Conv2d(96, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
```

In [54]:
# Print every direct sub-module of the maximal network's final expand layer.
for name, child_mod in max_network.final_expand_layer.named_children():
    print(child_mod)

Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
Hswish()


In [69]:
# Display the squeeze-and-excitation module attached to the depth conv of block 5.
max_network.blocks[5].conv.depth_conv.se

SE(channel=144, reduction=4)