In [1]:
import os
import torch
import torch.nn as nn
from torchvision import transforms
from torch.autograd.profiler import record_function

In [2]:
import sys

# Make modules that live three directories above the working directory
# importable (presumably where `data_preprocessing` and `Models` are found).
# Inserting at index 1 leaves the existing first search-path entry untouched.
project_root = "../../../"
sys.path.insert(1, project_root)

In [3]:
from data_preprocessing import get_means, get_stds, ImageNetSubset
from Models.yolov8cls_path import Model

In [4]:
# Choose the device used for the model and the input tensors below.
#
# The previous version probed for CUDA and then unconditionally overwrote the
# result with the CPU device, which left the probe as dead code. The override
# is now an explicit switch: leave FORCE_CPU = True to keep profiling on the
# CPU (the behaviour of the original cell), or set it to False to use CUDA
# whenever it is available.
FORCE_CPU = True

use_cuda = torch.cuda.is_available() and not FORCE_CPU
device = torch.device('cuda' if use_cuda else 'cpu')

In [5]:
# Dataset root, given relative to the notebook's working directory.
data_path = '../../../../dummy_datasets/'

# JSON file inside the dataset root; it is handed to get_means / get_stds below
# (presumably holding the stored per-channel normalisation statistics).
norms_filename = 'norms.json'
norms_path = os.path.join(data_path, norms_filename)

In [6]:
# Fetch the normalisation statistics through the project helpers. Both calls
# receive the same arguments: the norms file path and no train loader
# (presumably the values are then read from the file rather than recomputed
# -- the helpers are not visible here, TODO confirm).
norm_source = dict(path=norms_path, train_loader=None)
means = get_means(**norm_source)
stds = get_stds(**norm_source)

Means are: [0.4405549168586731, 0.4407285749912262, 0.4381718039512634]
stds are: [0.25142669677734375, 0.25270089507102966, 0.25131651759147644]


# Inference at 640x640

In [7]:
# Preprocessing pipeline for the 640x640 experiments: convert to a tensor,
# resize to the target resolution, then normalise with the dataset statistics.
input_size = (640, 640)
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Resize(input_size),
    transforms.Normalize(mean=means, std=stds),
]
transformations = transforms.Compose(preprocessing_steps)

In [8]:
# Build the training split of the dummy dataset with the 640x640 pipeline.
# `half` and `show` are project-specific switches, both disabled here.
train_dataset = ImageNetSubset(
    path=data_path,
    train=True,
    transform=transformations,
    half=False,
    show=False,
)

In [9]:
from torch.profiler import profile, ProfilerActivity

In [10]:
# Instantiate the classifier to be profiled. The options are collected in one
# mapping so the configuration can be read at a glance; the architecture flags
# are defined by the project's Model class (not visible in this notebook).
model_config = dict(
    num_classes=1000,
    residual_connection=True,
    CSP=True,
    add_hidden=True,
    classifyV8=False,
    bottleneck=1.0,
    variant='s',
    device=device,
    dtype=torch.float32,
)
model = Model(**model_config)

In [11]:
# Put the model into evaluation mode before profiling (assuming Model follows
# the usual nn.Module eval() contract). The bare `model.training` expression is
# the cell's displayed value and confirms the flag is now False.
model.eval()
model.training

False

In [12]:
# Display the device of the model's first parameter as a sanity check.
next(model.parameters()).device

device(type='cpu')

In [26]:
# Load the sample inside this cell so the device check also works on a fresh
# kernel: without the assignment `img` is not defined yet when the notebook is
# run top to bottom. Element 0 of dataset item 6 is taken (presumably the image
# tensor), given a leading dimension of size 1 and moved to `device`.
img = train_dataset[6][0].unsqueeze(0).to(device)

# Display the device the input lives on; it should match the model's device.
img.device

device(type='cpu')

In [25]:
# Profile repeated single-image inference at 640x640 and write a TensorBoard
# trace, then print the aggregated operator table.

# Element 0 of dataset item 6 (presumably the image tensor) with a leading
# dimension of size 1 added, placed on the same device as the model.
img = train_dataset[6][0].unsqueeze(0).to(device)

# Profiler schedule phases. The loop length is derived from them so the number
# of iterations can never drift away from what the schedule expects.
wait_steps, warmup_steps, active_steps = 1, 1, 10
total_steps = wait_steps + warmup_steps + active_steps

# Only request CUDA activity when a CUDA runtime exists, so the cell also runs
# cleanly on CPU-only machines.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities,
             schedule=torch.profiler.schedule(wait=wait_steps,
                                              warmup=warmup_steps,
                                              active=active_steps,
                                              repeat=1),
             on_trace_ready=torch.profiler.tensorboard_trace_handler('../../log/YOLOv8cls-version-4/inference_at_640'),
             record_shapes=True,
             profile_memory=True,
             with_flops=True,
             with_modules=True,
             with_stack=False) as prof:
    for _ in range(total_steps):
        # Label the forward pass so it shows up as 'inference' in the trace;
        # gradients are disabled because only inference cost is of interest.
        with record_function('inference'), torch.no_grad():
            model(img)
        # CUDA kernels run asynchronously: wait for them before closing the
        # step so that each profiler step covers the whole forward pass.
        if device.type == 'cuda':
            torch.cuda.synchronize(device=device)
        # Advance the profiler schedule (wait -> warmup -> active).
        prof.step()
print(prof.key_averages(group_by_input_shape=True).table(row_limit=-1))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls                                                                      Input Shapes  Total MFLOPs  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                   ProfilerStep*         0.08%     937.200us       100.00%        1.207s     120.714ms

In [28]:
# Profile a single active inference step at 640x640 (after one wait and one
# warm-up step), write a TensorBoard trace and print the operator table.

# Element 0 of dataset item 6 (presumably the image tensor) with a leading
# dimension of size 1 added, placed on the same device as the model.
img = train_dataset[6][0].unsqueeze(0).to(device)

# Profiler schedule phases. The loop length is derived from them so the number
# of iterations can never drift away from what the schedule expects.
wait_steps, warmup_steps, active_steps = 1, 1, 1
total_steps = wait_steps + warmup_steps + active_steps

# Only request CUDA activity when a CUDA runtime exists, so the cell also runs
# cleanly on CPU-only machines.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities,
             schedule=torch.profiler.schedule(wait=wait_steps,
                                              warmup=warmup_steps,
                                              active=active_steps,
                                              repeat=1),
             on_trace_ready=torch.profiler.tensorboard_trace_handler('../../log/YOLOv8cls-version-4/inference_at_640'),
             record_shapes=True,
             profile_memory=True,
             with_flops=True,
             with_modules=True,
             with_stack=False) as prof:
    for _ in range(total_steps):
        # Label the forward pass so it shows up as 'inference' in the trace;
        # gradients are disabled because only inference cost is of interest.
        with record_function('inference'), torch.no_grad():
            model(img)
        # CUDA kernels run asynchronously: wait for them before closing the
        # step so that each profiler step covers the whole forward pass.
        if device.type == 'cuda':
            torch.cuda.synchronize(device=device)
        # Advance the profiler schedule (wait -> warmup -> active).
        prof.step()
print(prof.key_averages(group_by_input_shape=True).table(row_limit=-1))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls                                                                      Input Shapes  Total KFLOPs  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                   ProfilerStep*         0.09%     112.500us       100.00%     121.654ms     121.654ms

# Inference at 224x224

In [29]:
# Preprocessing pipeline for the 224x224 experiments: convert to a tensor,
# resize to the target resolution, then normalise with the dataset statistics.
input_size = (224, 224)
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Resize(input_size),
    transforms.Normalize(mean=means, std=stds),
]
transformations = transforms.Compose(preprocessing_steps)

In [30]:
# Rebuild the training split of the dummy dataset so that it uses the
# 224x224 pipeline. `half` and `show` are project-specific switches, both
# disabled here.
train_dataset = ImageNetSubset(
    path=data_path,
    train=True,
    transform=transformations,
    half=False,
    show=False,
)

In [31]:
# Take element 0 of the first dataset item (presumably the image tensor), add a
# leading dimension of size 1 and move it to `device`, matching how every other
# `img` in this notebook is prepared, so the input sits on the model's device.
# Note: the profiling cells further down rebind `img` to dataset item 6.
img = train_dataset[0][0].unsqueeze(0).to(device)

In [32]:
from torch.profiler import profile, ProfilerActivity

In [33]:
# Create a fresh instance of the classifier for the 224x224 measurements, with
# the same configuration as the 640x640 section. The architecture flags are
# defined by the project's Model class (not visible in this notebook).
model_config = dict(
    num_classes=1000,
    residual_connection=True,
    CSP=True,
    add_hidden=True,
    classifyV8=False,
    bottleneck=1.0,
    variant='s',
    device=device,
    dtype=torch.float32,
)
model = Model(**model_config)

In [35]:
# Put the freshly built model into evaluation mode before profiling (assuming
# Model follows the usual nn.Module eval() contract). The bare `model.training`
# expression is the cell's displayed value and confirms the flag is now False.
model.eval()
model.training

False

In [36]:
# Display the device of the model's first parameter as a sanity check.
next(model.parameters()).device

device(type='cpu')

In [None]:
# Display the device the input sample lives on; compare with the model's device.
img.device

In [41]:
# Profile repeated single-image inference at 224x224 and write a TensorBoard
# trace, then print the aggregated operator table.

# Element 0 of dataset item 6 (presumably the image tensor) with a leading
# dimension of size 1 added, placed on the same device as the model.
img = train_dataset[6][0].unsqueeze(0).to(device)

# Profiler schedule phases. The loop length is derived from them so the number
# of iterations can never drift away from what the schedule expects.
wait_steps, warmup_steps, active_steps = 1, 1, 10
total_steps = wait_steps + warmup_steps + active_steps

# Only request CUDA activity when a CUDA runtime exists, so the cell also runs
# cleanly on CPU-only machines.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities,
             schedule=torch.profiler.schedule(wait=wait_steps,
                                              warmup=warmup_steps,
                                              active=active_steps,
                                              repeat=1),
             on_trace_ready=torch.profiler.tensorboard_trace_handler('../../log/YOLOv8cls-version-4/inference_at_224'),
             record_shapes=True,
             profile_memory=True,
             with_flops=True,
             with_modules=True,
             with_stack=False) as prof:
    for _ in range(total_steps):
        # Label the forward pass so it shows up as 'inference' in the trace;
        # gradients are disabled because only inference cost is of interest.
        with record_function('inference'), torch.no_grad():
            model(img)
        # CUDA kernels run asynchronously: wait for them before closing the
        # step so that each profiler step covers the whole forward pass.
        if device.type == 'cuda':
            torch.cuda.synchronize(device=device)
        # Advance the profiler schedule (wait -> warmup -> active).
        prof.step()
print(prof.key_averages(group_by_input_shape=True).table(row_limit=-1))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls                                                                      Input Shapes  Total KFLOPs  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                   ProfilerStep*         0.18%     803.600us       100.00%     438.284ms      43.828ms

In [44]:
# Profile a single active inference step at 224x224 (after one wait and one
# warm-up step), write a TensorBoard trace and print the operator table.

# Element 0 of dataset item 6 (presumably the image tensor) with a leading
# dimension of size 1 added, placed on the same device as the model.
img = train_dataset[6][0].unsqueeze(0).to(device)

# Profiler schedule phases. The loop length is derived from them so the number
# of iterations can never drift away from what the schedule expects.
wait_steps, warmup_steps, active_steps = 1, 1, 1
total_steps = wait_steps + warmup_steps + active_steps

# Only request CUDA activity when a CUDA runtime exists, so the cell also runs
# cleanly on CPU-only machines.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities,
             schedule=torch.profiler.schedule(wait=wait_steps,
                                              warmup=warmup_steps,
                                              active=active_steps,
                                              repeat=1),
             on_trace_ready=torch.profiler.tensorboard_trace_handler('../../log/YOLOv8cls-version-4/inference_at_224'),
             record_shapes=True,
             profile_memory=True,
             with_flops=True,
             with_modules=True,
             with_stack=False) as prof:
    for _ in range(total_steps):
        # Label the forward pass so it shows up as 'inference' in the trace;
        # gradients are disabled because only inference cost is of interest.
        with record_function('inference'), torch.no_grad():
            model(img)
        # CUDA kernels run asynchronously: wait for them before closing the
        # step so that each profiler step covers the whole forward pass.
        if device.type == 'cuda':
            torch.cuda.synchronize(device=device)
        # Advance the profiler schedule (wait -> warmup -> active).
        prof.step()
print(prof.key_averages(group_by_input_shape=True).table(row_limit=-1))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls                                                                      Input Shapes  Total KFLOPs  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ------------  
                   ProfilerStep*         0.17%      75.400us       100.00%      44.639ms      44.639ms

# GFLOPs at 640x640 using Ultralytics functions

In [46]:
from ultralytics_flops import get_flops, get_flops_with_torch_profiler

In [47]:
# Estimate the model's FLOPs for imgsz=640 with the project's `get_flops` helper
# (implementation not visible here; the value is presumably in GFLOPs, as the
# section heading suggests -- TODO confirm).
get_flops(model, imgsz=640)

12.4665472

In [48]:
# Cross-check the estimate for imgsz=640 with the second helper, which by its
# name relies on the torch profiler (implementation not visible here).
get_flops_with_torch_profiler(model, imgsz=640)

12.3327488

# GFLOPs at 224x224 using Ultralytics functions

In [46]:
from ultralytics_flops import get_flops, get_flops_with_torch_profiler

In [49]:
# Estimate the model's FLOPs for imgsz=224 with the project's `get_flops` helper
# (implementation not visible here; the value is presumably in GFLOPs, as the
# section heading suggests -- TODO confirm).
get_flops(model, imgsz=224)

1.527152032

In [50]:
# Cross-check the estimate for imgsz=224 with the second helper, which by its
# name relies on the torch profiler (implementation not visible here).
get_flops_with_torch_profiler(model, imgsz=224)

1.5107617279999999

# Count Parameters

In [51]:
from evaluation import count_parameters, show_tree, generate_dataframe

In [52]:
# Report the model's parameter count via the project's `count_parameters` helper
# (whether it counts all or only trainable parameters cannot be told from
# here -- TODO confirm against evaluation.py).
count_parameters(model)

4934816