In [2]:
import timm
import tome
from torchinfo import summary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the ViT-B/16 (224px) classifier from timm with its pretrained weights.
model_name = "vit_base_patch16_224"
model = timm.create_model(model_name=model_name, pretrained=True)

In [3]:
# Benchmark configuration used by the benchmark calls that follow.
input_size = model.default_cfg["input_size"]  # per-sample input shape read from the model's timm config
batch_size = 256  # Lower this if you don't have that much memory
runs = 50  # number of benchmark iterations (the progress bar counts up to this)
device = "cuda:0"  # first CUDA device

In [4]:
# Measure the throughput (im/s) of the unmodified model. The result is kept as
# the reference value for the speed-up ratios reported later.
baseline_throughput = tome.utils.benchmark(
    model,
    input_size=input_size,
    batch_size=batch_size,
    runs=runs,
    device=device,
    verbose=True,  # show the progress bar and print the measured throughput
)

Benchmarking: 100%|██████████| 50/50 [01:05<00:00,  1.31s/it]


Throughput: 178.13 im/s


In [5]:
# Layer-by-layer summary of the unpatched model.
# The per-sample shape comes from the configured `input_size` (the same value
# the benchmark used) instead of a hard-coded (3, 224, 224), so the summary
# stays correct if `model_name` is changed to a model with another resolution.
# verbose=0 keeps torchinfo from printing; the returned object is rendered as
# the cell output.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                   Output Shape              Param #
VisionTransformer                        [256, 1000]               152,064
├─PatchEmbed: 1-1                        [256, 196, 768]           --
│    └─Conv2d: 2-1                       [256, 768, 14, 14]        590,592
│    └─Identity: 2-2                     [256, 196, 768]           --
├─Dropout: 1-2                           [256, 197, 768]           --
├─Sequential: 1-3                        [256, 197, 768]           --
│    └─Block: 2-3                        [256, 197, 768]           --
│    │    └─LayerNorm: 3-1               [256, 197, 768]           1,536
│    │    └─Attention: 3-2               [256, 197, 768]           2,362,368
│    │    └─Identity: 3-3                [256, 197, 768]           --
│    │    └─LayerNorm: 3-4               [256, 197, 768]           1,536
│    │    └─Mlp: 3-5                     [256, 197, 768]           4,722,432
│    │    └─Identity: 3-6                [256, 197, 768

In [6]:
print("\n Apply ToMe \n")

# Patch the timm model in place; the summary afterwards reports
# ToMeVisionTransformer / ToMeBlock / ToMeAttention modules.
tome.patch.timm(model)

# Constant schedule: r = 16 (the first block goes from 197 to 181 tokens).
model.r = 16

# Same benchmark settings as the baseline so the two numbers are comparable.
tome_throughput = tome.utils.benchmark(
    model,
    input_size=input_size,
    batch_size=batch_size,
    runs=runs,
    device=device,
    verbose=True,
)

# Speed-up relative to the unpatched model measured earlier.
print(f"Throughput improvement: {tome_throughput / baseline_throughput:.2f}x")


 Apply ToMe 



Benchmarking: 100%|██████████| 50/50 [00:39<00:00,  1.27it/s]


Throughput: 320.44 im/s
Throughput improvement: 1.80x


In [7]:
# Layer-by-layer summary of the ToMe-patched model (constant r).
# The per-sample shape comes from the configured `input_size` (the same value
# the benchmark used) instead of a hard-coded (3, 224, 224), so the summary
# stays correct if `model_name` is changed to a model with another resolution.
# verbose=0 keeps torchinfo from printing; the returned object is rendered as
# the cell output.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                   Output Shape              Param #
ToMeVisionTransformer                    [256, 1000]               152,064
├─PatchEmbed: 1-1                        [256, 196, 768]           --
│    └─Conv2d: 2-1                       [256, 768, 14, 14]        590,592
│    └─Identity: 2-2                     [256, 196, 768]           --
├─Dropout: 1-2                           [256, 197, 768]           --
├─Sequential: 1-3                        [256, 11, 768]            --
│    └─ToMeBlock: 2-3                    [256, 181, 768]           --
│    │    └─LayerNorm: 3-1               [256, 197, 768]           1,536
│    │    └─ToMeAttention: 3-2           [256, 197, 768]           2,362,368
│    │    └─Identity: 3-3                [256, 197, 768]           --
│    │    └─LayerNorm: 3-4               [256, 181, 768]           1,536
│    │    └─Mlp: 3-5                     [256, 181, 768]           4,722,432
│    │    └─Identity: 3-6                [256, 181, 768

In [8]:
print("\n Apply ToMe with decreasing schedule \n")

# Tuple form of r: base value 16 with a -1.0 second element, which this cell
# uses to request the "decreasing" schedule (the first block then goes from
# 197 to 165 tokens). NOTE(review): exact tuple semantics are defined by the
# tome library - confirm against its documentation.
model.r = (16, -1.0)

# Same benchmark settings as the baseline so the two numbers are comparable.
tome_decr_throughput = tome.utils.benchmark(
    model,
    input_size=input_size,
    batch_size=batch_size,
    runs=runs,
    device=device,
    verbose=True,
)

# Speed-up relative to the unpatched model measured earlier.
print(f"Throughput improvement: {tome_decr_throughput / baseline_throughput:.2f}x")


 Apply ToMe with decreasing schedule 



Benchmarking: 100%|██████████| 50/50 [00:26<00:00,  1.87it/s]


Throughput: 469.72 im/s
Throughput improvement: 2.64x


In [9]:
# Layer-by-layer summary of the ToMe-patched model (decreasing schedule).
# The per-sample shape comes from the configured `input_size` (the same value
# the benchmark used) instead of a hard-coded (3, 224, 224), so the summary
# stays correct if `model_name` is changed to a model with another resolution.
# verbose=0 keeps torchinfo from printing; the returned object is rendered as
# the cell output.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                   Output Shape              Param #
ToMeVisionTransformer                    [256, 1000]               152,064
├─PatchEmbed: 1-1                        [256, 196, 768]           --
│    └─Conv2d: 2-1                       [256, 768, 14, 14]        590,592
│    └─Identity: 2-2                     [256, 196, 768]           --
├─Dropout: 1-2                           [256, 197, 768]           --
├─Sequential: 1-3                        [256, 10, 768]            --
│    └─ToMeBlock: 2-3                    [256, 165, 768]           --
│    │    └─LayerNorm: 3-1               [256, 197, 768]           1,536
│    │    └─ToMeAttention: 3-2           [256, 197, 768]           2,362,368
│    │    └─Identity: 3-3                [256, 197, 768]           --
│    │    └─LayerNorm: 3-4               [256, 165, 768]           1,536
│    │    └─Mlp: 3-5                     [256, 165, 768]           4,722,432
│    │    └─Identity: 3-6                [256, 165, 768

In [10]:
# MobileNetV3-Large (0.75 width, TF port) with pretrained weights, benchmarked
# as a CNN reference point. NOTE: this rebinds `model` and
# `baseline_throughput`, replacing the ViT values from the cells above.
model_name = "tf_mobilenetv3_large_075"
model = timm.create_model(model_name=model_name, pretrained=True)

# Benchmark settings, restated so this cell can run without the config cell.
input_size = model.default_cfg["input_size"]
batch_size = 256  # Lower this if you don't have that much memory
runs = 50
device = "cuda:0"

print(model_name)
baseline_throughput = tome.utils.benchmark(
    model,
    input_size=input_size,
    batch_size=batch_size,
    runs=runs,
    device=device,
    verbose=True,
)

tf_mobilenetv3_large_075


Benchmarking: 100%|██████████| 50/50 [00:06<00:00,  7.35it/s]


Throughput: 1869.02 im/s


In [11]:
# Layer-by-layer summary of the MobileNetV3 model.
# The per-sample shape comes from the configured `input_size` (the same value
# the benchmark used) instead of a hard-coded (3, 224, 224), so the summary
# stays correct if `model_name` is changed to a model with another resolution.
# verbose=0 keeps torchinfo from printing; the returned object is rendered as
# the cell output.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                        Output Shape              Param #
MobileNetV3                                   [256, 1000]               --
├─Conv2dSame: 1-1                             [256, 16, 112, 112]       432
├─BatchNorm2d: 1-2                            [256, 16, 112, 112]       32
├─Hardswish: 1-3                              [256, 16, 112, 112]       --
├─Sequential: 1-4                             [256, 720, 7, 7]          --
│    └─Sequential: 2-1                        [256, 16, 112, 112]       --
│    │    └─DepthwiseSeparableConv: 3-1       [256, 16, 112, 112]       464
│    └─Sequential: 2-2                        [256, 24, 56, 56]         --
│    │    └─InvertedResidual: 3-2             [256, 24, 56, 56]         3,440
│    │    └─InvertedResidual: 3-3             [256, 24, 56, 56]         4,440
│    └─Sequential: 2-3                        [256, 32, 28, 28]         --
│    │    └─InvertedResidual: 3-4             [256, 32, 28, 28]         9,736
│    │   

In [4]:
# MobileNetV4 hybrid-medium (0.75 width), benchmarked as a second CNN/hybrid
# reference point. Unlike the other models in this notebook it is created with
# pretrained=False, i.e. no pretrained checkpoint is loaded.
# NOTE: this rebinds `model` and `baseline_throughput`.
model_name = "mobilenetv4_hybrid_medium_075"
model = timm.create_model(model_name=model_name, pretrained=False)

# Benchmark settings, restated so this cell can run without the config cell.
input_size = model.default_cfg["input_size"]
batch_size = 256  # Lower this if you don't have that much memory
runs = 50
device = "cuda:0"

print(model_name)
baseline_throughput = tome.utils.benchmark(
    model,
    input_size=input_size,
    batch_size=batch_size,
    runs=runs,
    device=device,
    verbose=True,
)

mobilenetv4_hybrid_medium_075


  o = F.scaled_dot_product_attention(
Benchmarking: 100%|██████████| 50/50 [00:07<00:00,  6.34it/s]


Throughput: 1590.61 im/s


In [5]:
# Layer-by-layer summary of the MobileNetV4 hybrid model.
# The per-sample shape comes from the configured `input_size` (the same value
# the benchmark used) instead of a hard-coded (3, 224, 224), so the summary
# stays correct if `model_name` is changed to a model with another resolution.
# verbose=0 keeps torchinfo from printing; the returned object is rendered as
# the cell output.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                             Output Shape              Param #
MobileNetV3                                        [256, 1000]               --
├─Conv2d: 1-1                                      [256, 32, 112, 112]       864
├─BatchNormAct2d: 1-2                              [256, 32, 112, 112]       64
│    └─Identity: 2-1                               [256, 32, 112, 112]       --
│    └─ReLU: 2-2                                   [256, 32, 112, 112]       --
├─Sequential: 1-3                                  [256, 720, 7, 7]          --
│    └─Sequential: 2-3                             [256, 40, 56, 56]         --
│    │    └─EdgeResidual: 3-1                      [256, 40, 56, 56]         42,320
│    └─Sequential: 2-4                             [256, 64, 28, 28]         --
│    │    └─UniversalInvertedResidual: 3-2         [256, 64, 28, 28]         21,912
│    │    └─UniversalInvertedResidual: 3-3         [256, 64, 28, 28]         18,944
│    └─Sequential: 2-5

In [16]:
# Print every model name registered in the installed timm version, one per line.
for registered_name in timm.list_models():
    print(registered_name)

adv_inception_v3
bat_resnext26ts
botnet26t_256
botnet50ts_256
cait_m36_384
cait_m48_448
cait_s24_224
cait_s24_384
cait_s36_384
cait_xs24_384
cait_xxs24_224
cait_xxs24_384
cait_xxs36_224
cait_xxs36_384
coat_lite_mini
coat_lite_small
coat_lite_tiny
coat_mini
coat_tiny
convit_base
convit_small
convit_tiny
cspdarknet53
cspdarknet53_iabn
cspresnet50
cspresnet50d
cspresnet50w
cspresnext50
cspresnext50_iabn
darknet53
deit_base_distilled_patch16_224
deit_base_distilled_patch16_384
deit_base_patch16_224
deit_base_patch16_384
deit_small_distilled_patch16_224
deit_small_patch16_224
deit_tiny_distilled_patch16_224
deit_tiny_patch16_224
densenet121
densenet121d
densenet161
densenet169
densenet201
densenet264
densenet264d_iabn
densenetblur121d
dla34
dla46_c
dla46x_c
dla60
dla60_res2net
dla60_res2next
dla60x
dla60x_c
dla102
dla102x
dla102x2
dla169
dm_nfnet_f0
dm_nfnet_f1
dm_nfnet_f2
dm_nfnet_f3
dm_nfnet_f4
dm_nfnet_f5
dm_nfnet_f6
dpn68
dpn68b
dpn92
dpn98
dpn107
dpn131
eca_botnext26ts_256
eca_efficien