In [1]:
import timm
import tome
from torchinfo import summary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the pretrained ViT checkpoint that the benchmark cells below operate on.
model_name = "vit_base_patch16_224"
model = timm.create_model(model_name=model_name, pretrained=True)

In [3]:
# Settings shared by the benchmark cells that follow.
input_size = model.default_cfg["input_size"]  # input shape taken from the model's own config
batch_size = 256  # Lower this if you don't have that much memory
runs = 50  # forwarded to tome.utils.benchmark as `runs`
device = "cuda:0"  # the benchmarks are run on the first CUDA device

In [4]:
# Reference measurement: throughput of the model before any ToMe patching.
# Later cells divide their own throughput by this value.
baseline_throughput = tome.utils.benchmark(
    model, device=device, runs=runs, batch_size=batch_size, input_size=input_size, verbose=True
)

  x = F.scaled_dot_product_attention(
Benchmarking: 100%|██████████| 50/50 [01:13<00:00,  1.47s/it]


Throughput: 149.04 im/s


In [5]:
# Use the model's configured `input_size` (set with the benchmark settings) rather
# than a hard-coded 3x224x224, so the summary matches what was benchmarked.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                   Output Shape              Param #
VisionTransformer                        [256, 1000]               152,064
├─PatchEmbed: 1-1                        [256, 196, 768]           --
│    └─Conv2d: 2-1                       [256, 768, 14, 14]        590,592
│    └─Identity: 2-2                     [256, 196, 768]           --
├─Dropout: 1-2                           [256, 197, 768]           --
├─Identity: 1-3                          [256, 197, 768]           --
├─Identity: 1-4                          [256, 197, 768]           --
├─Sequential: 1-5                        [256, 197, 768]           --
│    └─Block: 2-3                        [256, 197, 768]           --
│    │    └─LayerNorm: 3-1               [256, 197, 768]           1,536
│    │    └─Attention: 3-2               [256, 197, 768]           2,362,368
│    │    └─Identity: 3-3                [256, 197, 768]           --
│    │    └─Identity: 3-4                [256, 197, 768]         

In [6]:
print("\n Apply ToMe \n")

# Patch the timm model in place: the return value is not needed, `model` itself
# becomes the ToMe variant. Re-running the baseline cell after this point would
# therefore no longer measure the unpatched model.
tome.patch.timm(model)

# Constant schedule, r=16 (presumably 16 tokens merged per block -- the summary
# of the patched model shows the first block going from 197 to 181 tokens).
model.r = 16

tome_throughput = tome.utils.benchmark(
    model, device=device, runs=runs, batch_size=batch_size, input_size=input_size, verbose=True
)
print(f"Throughput improvement: {tome_throughput / baseline_throughput:.2f}x")


 Apply ToMe 



Benchmarking: 100%|██████████| 50/50 [00:51<00:00,  1.04s/it]


Throughput: 247.85 im/s
Throughput improvement: 1.66x


In [7]:
# Use the model's configured `input_size` (set with the benchmark settings) rather
# than a hard-coded 3x224x224, so the summary matches what was benchmarked.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                   Output Shape              Param #
ToMeVisionTransformer                    [256, 1000]               152,064
├─PatchEmbed: 1-1                        [256, 196, 768]           --
│    └─Conv2d: 2-1                       [256, 768, 14, 14]        590,592
│    └─Identity: 2-2                     [256, 196, 768]           --
├─Dropout: 1-2                           [256, 197, 768]           --
├─Identity: 1-3                          [256, 197, 768]           --
├─Identity: 1-4                          [256, 197, 768]           --
├─Sequential: 1-5                        [256, 11, 768]            --
│    └─ToMeBlock: 2-3                    [256, 181, 768]           --
│    │    └─LayerNorm: 3-1               [256, 197, 768]           1,536
│    │    └─ToMeAttention: 3-2           [256, 197, 768]           2,362,368
│    │    └─Identity: 3-3                [256, 197, 768]           --
│    │    └─LayerNorm: 3-4               [256, 181, 768]         

In [8]:
print("\n Apply ToMe with decreasing schedule \n")

# Requires `model` to have been patched by tome.patch.timm in an earlier cell.
# The tuple form selects a schedule instead of a constant r; (16, -1.0) is the
# decreasing one (NOTE(review): exact tuple semantics come from ToMe -- confirm
# against its documentation).
model.r = (16, -1.0)

tome_decr_throughput = tome.utils.benchmark(
    model, device=device, runs=runs, batch_size=batch_size, input_size=input_size, verbose=True
)
print(f"Throughput improvement: {tome_decr_throughput / baseline_throughput:.2f}x")


 Apply ToMe with decreasing schedule 



Benchmarking: 100%|██████████| 50/50 [00:35<00:00,  1.43it/s]


Throughput: 364.03 im/s
Throughput improvement: 2.44x


In [9]:
# Use the model's configured `input_size` (set with the benchmark settings) rather
# than a hard-coded 3x224x224, so the summary matches what was benchmarked.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                   Output Shape              Param #
ToMeVisionTransformer                    [256, 1000]               152,064
├─PatchEmbed: 1-1                        [256, 196, 768]           --
│    └─Conv2d: 2-1                       [256, 768, 14, 14]        590,592
│    └─Identity: 2-2                     [256, 196, 768]           --
├─Dropout: 1-2                           [256, 197, 768]           --
├─Identity: 1-3                          [256, 197, 768]           --
├─Identity: 1-4                          [256, 197, 768]           --
├─Sequential: 1-5                        [256, 10, 768]            --
│    └─ToMeBlock: 2-3                    [256, 165, 768]           --
│    │    └─LayerNorm: 3-1               [256, 197, 768]           1,536
│    │    └─ToMeAttention: 3-2           [256, 197, 768]           2,362,368
│    │    └─Identity: 3-3                [256, 197, 768]           --
│    │    └─LayerNorm: 3-4               [256, 165, 768]         

In [10]:
# Benchmark a pretrained MobileNetV3 checkpoint for comparison.
# NOTE: this cell replaces `model` and `baseline_throughput`, so the ViT/ToMe
# cells above must not be re-run after it without reloading the ViT first.
model_name = "tf_mobilenetv3_large_075"
model = timm.create_model(model_name=model_name, pretrained=True)

# Same benchmark settings as before, restated so the cell is self-contained.
device = "cuda:0"
runs = 50
batch_size = 256  # Lower this if you don't have that much memory
input_size = model.default_cfg["input_size"]

print(model_name)
baseline_throughput = tome.utils.benchmark(
    model, device=device, runs=runs, batch_size=batch_size, input_size=input_size, verbose=True
)

tf_mobilenetv3_large_075


Benchmarking: 100%|██████████| 50/50 [00:08<00:00,  5.90it/s]


Throughput: 1418.41 im/s


In [11]:
# Use the model's configured `input_size` (set in the cell that loads the model)
# rather than a hard-coded 3x224x224, so the summary matches what was benchmarked.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                        Output Shape              Param #
MobileNetV3                                   [256, 1000]               --
├─Conv2dSame: 1-1                             [256, 16, 112, 112]       432
├─BatchNormAct2d: 1-2                         [256, 16, 112, 112]       32
│    └─Identity: 2-1                          [256, 16, 112, 112]       --
│    └─Hardswish: 2-2                         [256, 16, 112, 112]       --
├─Sequential: 1-3                             [256, 720, 7, 7]          --
│    └─Sequential: 2-3                        [256, 16, 112, 112]       --
│    │    └─DepthwiseSeparableConv: 3-1       [256, 16, 112, 112]       464
│    └─Sequential: 2-4                        [256, 24, 56, 56]         --
│    │    └─InvertedResidual: 3-2             [256, 24, 56, 56]         3,440
│    │    └─InvertedResidual: 3-3             [256, 24, 56, 56]         4,440
│    └─Sequential: 2-5                        [256, 32, 28, 28]         --
│    │    └─

In [12]:
# Benchmark the MobileNetV4 hybrid variant. pretrained=False: no pretrained
# weights are loaded for this measurement.
# NOTE: like the previous model cell, this replaces `model` and
# `baseline_throughput`.
model_name = "mobilenetv4_hybrid_medium_075"
model = timm.create_model(model_name=model_name, pretrained=False)

# Same benchmark settings as before, restated so the cell is self-contained.
device = "cuda:0"
runs = 50
batch_size = 256  # Lower this if you don't have that much memory
input_size = model.default_cfg["input_size"]

print(model_name)
baseline_throughput = tome.utils.benchmark(
    model, device=device, runs=runs, batch_size=batch_size, input_size=input_size, verbose=True
)

mobilenetv4_hybrid_medium_075


Benchmarking: 100%|██████████| 50/50 [00:10<00:00,  4.91it/s]


Throughput: 1206.70 im/s


In [13]:
# Use the model's configured `input_size` (set in the cell that loads the model)
# rather than a hard-coded 3x224x224, so the summary matches what was benchmarked.
summary(model, input_size=(batch_size, *input_size), verbose=0)

Layer (type:depth-idx)                             Output Shape              Param #
MobileNetV3                                        [256, 1000]               --
├─Conv2d: 1-1                                      [256, 32, 112, 112]       864
├─BatchNormAct2d: 1-2                              [256, 32, 112, 112]       64
│    └─Identity: 2-1                               [256, 32, 112, 112]       --
│    └─ReLU: 2-2                                   [256, 32, 112, 112]       --
├─Sequential: 1-3                                  [256, 720, 7, 7]          --
│    └─Sequential: 2-3                             [256, 40, 56, 56]         --
│    │    └─EdgeResidual: 3-1                      [256, 40, 56, 56]         42,320
│    └─Sequential: 2-4                             [256, 64, 28, 28]         --
│    │    └─UniversalInvertedResidual: 3-2         [256, 64, 28, 28]         21,912
│    │    └─UniversalInvertedResidual: 3-3         [256, 64, 28, 28]         18,944
│    └─Sequential: 2-5

In [14]:
# Print every model name that timm.list_models() returns, one per line.
for registered_name in timm.list_models():
    print(registered_name)

bat_resnext26ts
beit_base_patch16_224
beit_base_patch16_384
beit_large_patch16_224
beit_large_patch16_384
beit_large_patch16_512
beitv2_base_patch16_224
beitv2_large_patch16_224
botnet26t_256
botnet50ts_256
caformer_b36
caformer_m36
caformer_s18
caformer_s36
cait_m36_384
cait_m48_448
cait_s24_224
cait_s24_384
cait_s36_384
cait_xs24_384
cait_xxs24_224
cait_xxs24_384
cait_xxs36_224
cait_xxs36_384
coat_lite_medium
coat_lite_medium_384
coat_lite_mini
coat_lite_small
coat_lite_tiny
coat_mini
coat_small
coat_tiny
coatnet_0_224
coatnet_0_rw_224
coatnet_1_224
coatnet_1_rw_224
coatnet_2_224
coatnet_2_rw_224
coatnet_3_224
coatnet_3_rw_224
coatnet_4_224
coatnet_5_224
coatnet_bn_0_rw_224
coatnet_nano_cc_224
coatnet_nano_rw_224
coatnet_pico_rw_224
coatnet_rmlp_0_rw_224
coatnet_rmlp_1_rw2_224
coatnet_rmlp_1_rw_224
coatnet_rmlp_2_rw_224
coatnet_rmlp_2_rw_384
coatnet_rmlp_3_rw_224
coatnet_rmlp_nano_rw_224
coatnext_nano_rw_224
convformer_b36
convformer_m36
convformer_s18
convformer_s36
convit_base
conv

In [21]:
import timm
from torchinfo import summary
import torch

from dataclasses import dataclass


@dataclass(eq=False)
class ModelConfig:
    """Plain container for the settings of one experiment.

    The constructor takes the same seven arguments, in the same order, as the
    previous hand-written ``__init__`` and stores each one on the instance
    under the same name. ``eq=False`` keeps the identity-based equality and
    hashing of an ordinary class; the dataclass machinery only contributes the
    generated ``__init__`` and a readable ``__repr__``.

    Attributes:
        exp_name: Label of the experiment.
        model_type: Model family label (e.g. ``"vit"``).
        model_name: Model identifier; the notebook passes it to
            ``timm.create_model``.
        runs: Number of runs.
        epochs: Number of epochs.
        batch_size: Batch size.
        learning_rate: Learning rate.
    """

    exp_name: str
    model_type: str
    model_name: str
    runs: int
    epochs: int
    batch_size: int
    learning_rate: float
# Settings for the AugReg ViT experiment.
cfg = ModelConfig(
    exp_name="augreg",
    model_type="vit",
    model_name="vit_small_patch32_224.augreg_in21k_ft_in1k",
    runs=50,
    epochs=300,
    batch_size=64,
    learning_rate=1e-3,
)

# Load the pretrained checkpoint with num_classes=10.
model = timm.create_model(model_name=cfg.model_name, pretrained=True, num_classes=10)

# Optional sanity check with a single sample:
#   model(torch.randn(1, 3, 224, 224)).shape
summary(model, input_size=(1, 3, 224, 224))


Layer (type:depth-idx)                   Output Shape              Param #
VisionTransformer                        [1, 10]                   19,584
├─PatchEmbed: 1-1                        [1, 49, 384]              --
│    └─Conv2d: 2-1                       [1, 384, 7, 7]            1,180,032
│    └─Identity: 2-2                     [1, 49, 384]              --
├─Dropout: 1-2                           [1, 50, 384]              --
├─Identity: 1-3                          [1, 50, 384]              --
├─Identity: 1-4                          [1, 50, 384]              --
├─Sequential: 1-5                        [1, 50, 384]              --
│    └─Block: 2-3                        [1, 50, 384]              --
│    │    └─LayerNorm: 3-1               [1, 50, 384]              768
│    │    └─Attention: 3-2               [1, 50, 384]              591,360
│    │    └─Identity: 3-3                [1, 50, 384]              --
│    │    └─Identity: 3-4                [1, 50, 384]              -

In [24]:
# Settings for the distilled DeiT-tiny experiment.
# NOTE(review): exp_name is still "augreg" although this checkpoint is tagged
# "fb_in1k" -- looks like a copy-paste leftover; confirm before using it as a label.
cfg = ModelConfig(
    exp_name="augreg",
    model_type="vit",
    model_name="deit_tiny_distilled_patch16_224.fb_in1k",
    runs=50,
    epochs=300,
    batch_size=64,
    learning_rate=1e-3,
)

# Load the pretrained checkpoint with num_classes=10.
model = timm.create_model(model_name=cfg.model_name, pretrained=True, num_classes=10)

# Optional sanity check with a single sample:
#   model(torch.randn(1, 3, 224, 224)).shape
summary(model, input_size=(1, 3, 224, 224))


Layer (type:depth-idx)                   Output Shape              Param #
VisionTransformerDistilled               [1, 10]                   38,400
├─PatchEmbed: 1-1                        [1, 196, 192]             --
│    └─Conv2d: 2-1                       [1, 192, 14, 14]          147,648
│    └─Identity: 2-2                     [1, 196, 192]             --
├─Dropout: 1-2                           [1, 198, 192]             --
├─Identity: 1-3                          [1, 198, 192]             --
├─Identity: 1-4                          [1, 198, 192]             --
├─Sequential: 1-5                        [1, 198, 192]             --
│    └─Block: 2-3                        [1, 198, 192]             --
│    │    └─LayerNorm: 3-1               [1, 198, 192]             384
│    │    └─Attention: 3-2               [1, 198, 192]             148,224
│    │    └─Identity: 3-3                [1, 198, 192]             --
│    │    └─Identity: 3-4                [1, 198, 192]             --
