In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import optim

In [2]:
import sys
sys.path.insert(1, "../")

In [3]:
from train import *
from data_preprocessing import *
from Models.yolov8cls_path import *

In [4]:
if torch.cuda.is_available():
    device=torch.device('cuda')
else:
    device=torch.device('cpu') 

In [5]:
data_path = '../../dummy_datasets/'
norms_path = os.path.join(data_path, 'norms.json')

In [6]:
means = get_means(path=norms_path, train_loader=None)
stds = get_stds(path=norms_path, train_loader=None)

Means are: [0.4405549168586731, 0.4407285749912262, 0.4381718039512634]
stds are: [0.25142669677734375, 0.25270089507102966, 0.25131651759147644]


In [7]:
transformations = transforms.Compose([transforms.ToTensor(),
                                      transforms.Resize((224, 224)),
                                      transforms.Normalize(mean=means, std=stds)])

In [8]:
train_dataset = ImageNetSubset(path=data_path, train=True, transform=transformations, half=False, show=False)
val_dataset = ImageNetSubset(path=data_path, train=False, transform=transformations, half=False, show=False)

YOLOv8n</br>
Mine: 3.290265 GFLOPs ; On the website: 4.3 GFLOPs</br>
YOLOv8s</br>
Mine: 12.449996 GFLOPs ; On the website: 13.5 GFLOPs</br>
YOLOv8m</br>
Mine: 41.640755 GFLOPs ; On the website: 42.7 GFLOPs

In [9]:
model = Model(num_classes=1000, 
              residual_connection=True, 
              CSP=True, 
              add_hidden=True,
              classifyV8=True,
              bottleneck=1.0, 
              variant='s', 
              device=device, 
              dtype=torch.float32)

In [10]:
img = torch.rand(1,3,640,640)
_ = transforms.Compose([transforms.Normalize(mean=[0.5, 0.5, 0.5],
                        std=[0.5, 0.5, 0.5])])
img = _(img)

In [11]:
from torch.profiler import profile, ProfilerActivity

In [12]:
with profile(activities=[ProfilerActivity.CPU],
                                    schedule=torch.profiler.schedule(wait=1, warmup=2, active=10, repeat=1),
                                    on_trace_ready=torch.profiler.tensorboard_trace_handler('../log/darknet19/inference'),
                                    record_shapes=True,
                                    profile_memory=True,
                                    with_flops=True,
                                    with_modules=True,
                                    with_stack=True) as prof:
    for i in range(13):
        out = model(img)
        prof.step()
print(prof.key_averages().table(row_limit=-1))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  Total MFLOPs  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                   ProfilerStep*         0.74%      10.425ms       100.00%        1.414s     141.414ms     210.21 Mb      -1.90 Gb            10            --  
                      Conv block         5.81%      82.200ms        93.41%        1.321s       5.080ms       1.93 Gb           0 b           260            --  
                    aten::conv2d         0.16%       2.284ms        61.98%     876.429ms       3.371ms     660.16 Mb           0 b           260    124452.864  
               aten::convolution  

In [13]:
def de_parallel(model):
    """De-parallelize a model: returns single-GPU model if model is of type DP or DDP."""
    return model.module if is_parallel(model) else model

def is_parallel(model):
    """Returns True if model is of type DP or DDP."""
    return isinstance(model, (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel))

In [28]:
def get_flops(model, imgsz=640):
    """Return a YOLO model's FLOPs."""
    if not thop:
        print('here')
        return 0.0  # if not installed return 0.0 GFLOPs

    model = de_parallel(model)
    p = next(model.parameters())
    if not isinstance(imgsz, list):
        imgsz = [imgsz, imgsz]  # expand if int/float
    try:
        # Use stride size for input tensor
        stride = max(int(model.stride.max()), 32) if hasattr(model, "stride") else 32  # max stride
        im = torch.empty((1, p.shape[1], stride, stride), device=p.device).half()  # input image in BCHW format
        flops = thop.profile(deepcopy(model), inputs=[im], verbose=False)[0] / 1e9 * 2  # stride GFLOPs
        return flops * imgsz[0] / stride * imgsz[1] / stride  # imgsz GFLOPs
    except Exception:
        # Use actual image size for input tensor (i.e. required for RTDETR models)
        im = torch.empty((1, p.shape[1], *imgsz), device=p.device)  # input image in BCHW format
        return thop.profile(deepcopy(model), inputs=[im], verbose=False)[0] / 1e9 * 2  # imgsz GFLOPs


def get_flops_with_torch_profiler(model, imgsz=640):
    """Compute model FLOPs (thop package alternative, but 2-10x slower unfortunately)."""
    if not TORCH_2_0:  # torch profiler implemented in torch>=2.0
        return 0.0
    model = de_parallel(model)
    p = next(model.parameters())
    if not isinstance(imgsz, list):
        imgsz = [imgsz, imgsz]  # expand if int/float
    try:
        # Use stride size for input tensor
        stride = (max(int(model.stride.max()), 32) if hasattr(model, "stride") else 32) * 2  # max stride
        im = torch.empty((1, p.shape[1], stride, stride), device=p.device)  # input image in BCHW format
        with torch.profiler.profile(with_flops=True) as prof:
            model(im)
        flops = sum(x.flops for x in prof.key_averages()) / 1e9
        flops = flops * imgsz[0] / stride * imgsz[1] / stride  # 640x640 GFLOPs
    except Exception:
        # Use actual image size for input tensor (i.e. required for RTDETR models)
        im = torch.empty((1, p.shape[1], *imgsz), device=p.device)  # input image in BCHW format
        with torch.profiler.profile(with_flops=True) as prof:
            model(im)
        flops = sum(x.flops for x in prof.key_averages()) / 1e9
    return flops

In [29]:
TORCH_2_0 = True

In [30]:
import thop

In [31]:
from copy import deepcopy

In [32]:
get_flops_with_torch_profiler(model, imgsz=640)

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.HalfTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [33]:
get_flops(model, imgsz=640)

13.609779199999998

In [116]:
not thop

False

In [34]:
model = torch.load('../Models/yolov8s-cls.pt')['model'].model

In [35]:
get_flops(model, imgsz=640)

13.609779199999998

In [46]:
[x.key for x in prof.key_averages()]

['ProfilerStep*',
 'Conv block',
 'aten::conv2d',
 'aten::convolution',
 'aten::_convolution',
 'aten::mkldnn_convolution',
 'aten::empty',
 'aten::as_strided_',
 'aten::resize_',
 'aten::add_',
 'aten::batch_norm',
 'aten::_batch_norm_impl_index',
 'aten::native_batch_norm',
 'aten::empty_like',
 'aten::silu',
 'C2f block',
 'aten::chunk',
 'aten::split',
 'aten::narrow',
 'aten::slice',
 'aten::as_strided',
 'Bottleneck block',
 'aten::add',
 'aten::cat',
 'ClassifyV8',
 'Adaptive Average Pooling',
 'aten::adaptive_avg_pool2d',
 'aten::mean',
 'aten::sum',
 'aten::fill_',
 'aten::div_',
 'aten::to',
 'aten::_to_copy',
 'aten::empty_strided',
 'aten::copy_',
 'Flatten',
 'aten::flatten',
 'aten::view',
 'Linear layer',
 'aten::linear',
 'aten::t',
 'aten::transpose',
 'aten::addmm',
 'aten::expand',
 'aten::resolve_conj',
 'Log Softmax',
 'aten::log_softmax',
 'aten::_log_softmax']

In [40]:
dir(prof.key_averages()[0])[25:]

['__str__',
 '__subclasshook__',
 '__weakref__',
 'add',
 'count',
 'cpu_children',
 'cpu_memory_usage',
 'cpu_parent',
 'cpu_time',
 'cpu_time_str',
 'cpu_time_total',
 'cpu_time_total_str',
 'cuda_time',
 'device_memory_usage',
 'device_time',
 'device_time_str',
 'device_time_total',
 'device_time_total_str',
 'device_type',
 'flops',
 'input_shapes',
 'is_async',
 'is_legacy',
 'is_remote',
 'key',
 'node_id',
 'scope',
 'self_cpu_memory_usage',
 'self_cpu_time_total',
 'self_cpu_time_total_str',
 'self_device_memory_usage',
 'self_device_time_total',
 'self_device_time_total_str',
 'stack',
 'use_device']

In [150]:
[x.cpu_time for x in prof.key_averages()]

[29757.65,
 973.3776923076921,
 577.5357692307693,
 569.7396153846147,
 553.7823076923078,
 528.9579166666668,
 1.6539615384614361,
 2.42124999999955,
 1.0988461538458445,
 6.551923076923146,
 177.62576923076867,
 172.66423076923076,
 161.1023076923071,
 7.619999999999774,
 47.15846153846192,
 4917.837500000002,
 46.46500000000119,
 40.69499999999889,
 10.576363636363894,
 6.216363636363287,
 1.0820833333331317,
 1817.6650000000004,
 67.36499999999994,
 130.67000000000394,
 698.0399999999997,
 690.7900000000016,
 5.3799999999992,
 2411.4100000000035,
 195.11999999999972,
 108.34000000000196,
 102.61000000000058,
 37.14000000000124,
 3.8900000000059665,
 45.59999999999964,
 25.99999999999818,
 21.109999999996944,
 3.8899999999950525,
 5.455000000001382,
 70.36999999999352,
 16.81999999999898,
 490.44000000000597,
 404.99999999999307,
 18.250000000006914,
 8.239999999998327,
 373.4900000000045,
 6.270000000001164,
 0.4250000000009095,
 97.12999999999738,
 21.98000000000102,
 16.150000000

In [13]:
from Models.darknet19 import *

In [14]:
model = Darknet19(num_classes=1000)

In [24]:
get_flops(model, imgsz=224)

5.618206368

In [151]:
prof.key_averages().table()

'--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  \n                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  Total MFLOPs  \n--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  \n                   ProfilerStep*         2.94%       8.735ms       100.00%     297.577ms      29.758ms       9.50 Mb    -255.63 Mb            10            --  \n                      Conv block        14.37%      42.772ms        85.05%     253.078ms     973.378us     243.03 Mb           0 b           260            --  \n                    aten::conv2d         0.68%       2.027ms        50.46%     150.159ms     577.536us      80.87 Mb           0 b           260     15245.476  \n               aten::convol

In [25]:
import time

In [26]:
time.time()

1732103673.845408

In [28]:
n=10

In [30]:
tf, tb, t = 0, 0, [0, 0, 0]  # dt forward, backward

In [52]:
m = Darknet19(num_classes=10)

In [53]:
x = img

In [64]:
tf, tb, t = 0, 0, [0, 0, 0]  # dt forward, backward
for _ in range(n):
    t[0] = time.time()
    y = m(x)
    t[1] = time.time()
    tf += (t[1] - t[0]) * 1000 / n  # ms per op forward

In [65]:
tf

405.64892292022705