In [1]:
import torch
import torchvision.models as models
import torchprofile
import time
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
from torchsummary import summary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Load ShuffleNetV2 (x1.0) with pretrained weights from the pinned
# torchvision hub tag.
model = torch.hub.load('pytorch/vision:v0.10.0', 'shufflenet_v2_x1_0', pretrained=True)

# Swap the classifier for a 2-class head. The input width is read from the
# existing head rather than hard-coded, so it stays correct if the backbone
# variant is changed.
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=2)
model.to(device)

# torchsummary pushes random data through the network to record shapes.
# In training mode that forward pass would update the BatchNorm running
# statistics of the pretrained weights, so run it in eval mode and restore
# the previous mode afterwards.
was_training = model.training
model.eval()
try:
    summary(model, (3, 256, 256), batch_size=64)
finally:
    model.train(was_training)

Using cache found in /home/taha/.cache/torch/hub/pytorch_vision_v0.10.0


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [64, 24, 128, 128]             648
       BatchNorm2d-2         [64, 24, 128, 128]              48
              ReLU-3         [64, 24, 128, 128]               0
         MaxPool2d-4           [64, 24, 64, 64]               0
            Conv2d-5           [64, 24, 32, 32]             216
       BatchNorm2d-6           [64, 24, 32, 32]              48
            Conv2d-7           [64, 58, 32, 32]           1,392
       BatchNorm2d-8           [64, 58, 32, 32]             116
              ReLU-9           [64, 58, 32, 32]               0
           Conv2d-10           [64, 58, 64, 64]           1,392
      BatchNorm2d-11           [64, 58, 64, 64]             116
             ReLU-12           [64, 58, 64, 64]               0
           Conv2d-13           [64, 58, 32, 32]             522
      BatchNorm2d-14           [64, 58,

In [4]:
# One random 3x256x256 sample (batch of 1), drawn on the CPU and then moved
# to the selected device; used as the input for profiling and timing.
input_tensor = torch.randn((1, 3, 256, 256))
input_tensor = input_tensor.to(device)

MACs (and from them FLOPs) are counted with torchprofile: https://github.com/zhijian-liu/torchprofile

In [5]:
# Benchmark a single-sample forward pass: count MACs/FLOPs with torchprofile,
# then time repeated inferences and derive the achieved throughput.
#
# Produces: macs, flops, start_time, end_time, elapsed_time (seconds per
# inference) and tops (1e12 operations per second).

NUM_WARMUP_RUNS = 10   # untimed passes so lazy initialisation is excluded
NUM_TIMED_RUNS = 100   # passes averaged for the latency figure

# CUDA kernels are launched asynchronously, so the host clock must only be
# read after the device has drained its queue.
use_cuda_sync = torch.device(device).type == "cuda"

# Run in eval mode: in training mode every forward pass on this random input
# would update the BatchNorm running statistics of the pretrained model.
# The previous mode is restored afterwards so later cells are unaffected.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        # Warm up the GPU
        for _ in range(NUM_WARMUP_RUNS):
            _ = model(input_tensor)

        # Measure FLOPs
        macs = torchprofile.profile_macs(model, args=(input_tensor,))
        flops = 2 * macs  # Convert MACs to FLOPs (one multiply + one add)

        # Measure inference time
        if use_cuda_sync:
            torch.cuda.synchronize()  # finish warm-up/profiling work first
        start_time = time.perf_counter()  # monotonic, high-resolution clock
        for _ in range(NUM_TIMED_RUNS):
            _ = model(input_tensor)
        if use_cuda_sync:
            torch.cuda.synchronize()  # wait for all timed kernels to complete
        end_time = time.perf_counter()
finally:
    model.train(was_training)

# Calculate elapsed time per inference and throughput
elapsed_time = (end_time - start_time) / NUM_TIMED_RUNS  # Average seconds per run
tops = (flops / elapsed_time) / 1e12  # Floating-point operations per second, in units of 1e12

In [6]:
# Report the benchmark figures on one line: FLOPs in units of 1e9, latency in
# milliseconds, and throughput as computed in `tops`.
print(
    f"FLOPs: {flops / 1e9:.2f} GFLOPs",
    f"Inference Time: {elapsed_time * 1000:.2f} ms",
    f"Throughput: {tops:.2f} TOPs",
    sep=", ",
)

FLOPs: 0.38 GFLOPs, Inference Time: 10.93 ms, Throughput: 0.03 TOPs
