In [16]:
import os
import torch
import torch.nn as nn
import pandas as pd
from torchvision import transforms

In [17]:
import sys
sys.path.insert(1, "../")

In [18]:
from data_preprocessing import get_means, get_stds, ImageNetSubset
from Models.darknet19 import Darknet19
from evaluation import count_parameters

In [19]:
if torch.cuda.is_available():
    device=torch.device('cuda')
else:
    device=torch.device('cpu') 

In [20]:
data_path = '../../dummy_datasets/'
norms_path = os.path.join(data_path, 'norms.json')

In [21]:
means = get_means(path=norms_path, train_loader=None)
stds = get_stds(path=norms_path, train_loader=None)

Means are: [0.4405549168586731, 0.4407285749912262, 0.4381718039512634]
stds are: [0.25142669677734375, 0.25270089507102966, 0.25131651759147644]


# Inference at 224x224

In [57]:
transformations = transforms.Compose([transforms.ToTensor(),
                                      transforms.Resize((224, 224)),
                                      transforms.Normalize(mean=means, std=stds)])

In [58]:
train_dataset = ImageNetSubset(path=data_path, train=True, transform=transformations, half=False, show=False)

In [59]:
model = Darknet19(num_classes=1000, device=device, dtype=torch.float32)

In [60]:
img = train_dataset[0][0].unsqueeze(0)

In [61]:
from torch.profiler import profile, ProfilerActivity

In [62]:
with profile(activities=[ProfilerActivity.CPU],
                                    schedule=torch.profiler.schedule(wait=1, warmup=1, active=10, repeat=1),
                                    on_trace_ready=torch.profiler.tensorboard_trace_handler('../log/darknet19/inference_at_224'),
                                    record_shapes=True,
                                    profile_memory=True,
                                    with_flops=True,
                                    with_modules=True,
                                    with_stack=True) as prof:
    for i in range(12):
        out = model(img)
        prof.step()
print(prof.key_averages().table(row_limit=-1))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  Total GFLOPs  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                    ProfilerStep*         1.83%      13.807ms       100.00%     755.788ms      75.579ms      13.93 Mb    -592.46 Mb            10            --  
                       Conv block         6.54%      49.420ms        82.04%     620.028ms       3.445ms     517.35 Mb           0 b           180            --  
                     aten::conv2d         0.23%       1.736ms        62.88%     475.212ms       2.501ms     174.13 Mb           0 b           190        55.820  
                aten::convol

In [38]:
def ___get_cpu_children(obj, attrs, r=0, n=90):
    children = obj.cpu_children
    if 'name' in dir(obj):
        name = ' '*3*r+obj.name
        offset = ' '*(n - len(name))
        
        tr = obj.time_range
        print(name + offset + str(tr.start) + ' ' + str(tr.end))
    else:
        print('\t'*r+obj.key)
    if len(children) > 0:
        for child in children:
            get_cpu_children(child, r=r+1, n=n)

In [39]:
def generate_dataframe(prof):

    def get_cpu_children(obj, attrs, _rows, r=0):
        row = {}
        for attr in attrs:
            if attr in dir(obj):
                row[attr] = getattr(obj, attr)
            else:
                row[attr] = None
        row['r'] = r
        _rows.append(row)
        children = row['cpu_children']
        if len(children) > 0:
            for child in children:
                _rows = get_cpu_children(child, attrs, _rows, r=r+1)
        return _rows
    
    step = prof.key_averages()[0]
    _attrs = dir(prof.key_averages()[0].cpu_children[0])
    _attrs = [child for child in _attrs if child[:2] != '__' 
                   and child not in ['set_cpu_parent', 'append_cpu_child', 'append_kernel', 'set_cpu_parent']]
    _rows = []
    _rows = get_cpu_children(step, _attrs, _rows)

    for row in _rows:
        row['cpu_children'] = [child.id for child in row['cpu_children']]
        row['cpu_parent'] = None if row['cpu_parent'] is None else row['cpu_parent'].id
        row['tr.start'] =  None if row['time_range'] is None else row['time_range'].start
        row['tr.end'] = None if row['time_range'] is None else row['time_range'].end
        row.pop('time_range', None)
        
    df = pd.DataFrame(_rows)
    return df

In [40]:
df = generate_dataframe(prof)

  row[attr] = getattr(obj, attr)
  row[attr] = getattr(obj, attr)
  row[attr] = getattr(obj, attr)
  row[attr] = getattr(obj, attr)


In [56]:
df[['id', 'key', 'r', 'tr.start', 'tr.end', 'self_cpu_memory_usage', 'flops', 'cpu_children', 'cpu_time_total', 'cpu_time_total_str', 'input_shapes']].iloc[1:25]

Unnamed: 0,id,key,r,tr.start,tr.end,self_cpu_memory_usage,flops,cpu_children,cpu_time_total,cpu_time_total_str,input_shapes
1,9218.0,Conv block,1,1160.38,7778.08,0,0,"[9219, 9227, 9228, 9240]",6617.7,6.618ms,[]
2,9219.0,aten::conv2d,2,1300.18,3422.28,0,86704128,[9220],2122.1,2.122ms,"[[1, 3, 224, 224], [32, 3, 3, 3], [], [], [], ..."
3,9220.0,aten::convolution,3,1309.68,3420.48,0,0,[9221],2110.8,2.111ms,"[[1, 3, 224, 224], [32, 3, 3, 3], [], [], [], ..."
4,9221.0,aten::_convolution,4,1329.88,3407.48,0,0,[9222],2077.6,2.078ms,"[[1, 3, 224, 224], [32, 3, 3, 3], [], [], [], ..."
5,9222.0,aten::mkldnn_convolution,5,1345.58,3404.38,0,0,"[9223, 9224, 9225, 9226]",2058.8,2.059ms,"[[1, 3, 224, 224], [32, 3, 3, 3], [], [], [], ..."
6,9223.0,aten::empty,6,1359.48,1364.88,0,0,[],5.4,5.400us,"[[], [], [], [], [], []]"
7,9224.0,aten::empty,6,2217.88,2223.58,6422528,0,[],5.7,5.700us,"[[], [], [], [], [], []]"
8,9225.0,aten::as_strided_,6,3371.48,3380.18,0,0,[],8.7,8.700us,"[[1, 32, 224, 224], [], [], []]"
9,9226.0,aten::resize_,6,3388.68,3390.98,0,0,[],2.3,2.300us,"[[1, 32, 224, 224], [], []]"
10,9227.0,aten::add_,2,3586.98,3611.28,0,0,[],24.3,24.300us,"[[], [], []]"


In [52]:
df.iloc[-1]

concrete_inputs                  [None, 1, False]
count                                           1
cpu_children                                   []
cpu_memory_usage                             4000
cpu_parent                                 9671.0
cpu_time                                     17.4
cpu_time_str                             17.400us
cpu_time_total                               17.4
cpu_time_total_str                       17.400us
cuda_time                                     0.0
cuda_time_total                               0.0
device_index                              27076.0
device_memory_usage                             0
device_resource_id                        29128.0
device_time                                   0.0
device_time_str                           0.000us
device_time_total                               0
device_time_total_str                     0.000us
device_type                        DeviceType.CPU
flops                                           0


In [51]:
6422528/1000/10

642.2528

In [34]:
df.columns

Index(['concrete_inputs', 'count', 'cpu_children', 'cpu_memory_usage',
       'cpu_parent', 'cpu_time', 'cpu_time_str', 'cpu_time_total',
       'cpu_time_total_str', 'cuda_time', 'cuda_time_total', 'device_index',
       'device_memory_usage', 'device_resource_id', 'device_time',
       'device_time_str', 'device_time_total', 'device_time_total_str',
       'device_type', 'flops', 'fwd_thread', 'id', 'input_shapes', 'is_async',
       'is_legacy', 'is_remote', 'kernels', 'key', 'name', 'node_id', 'scope',
       'self_cpu_memory_usage', 'self_cpu_time_total',
       'self_cpu_time_total_str', 'self_cuda_memory_usage',
       'self_cuda_time_total', 'self_device_memory_usage',
       'self_device_time_total', 'self_device_time_total_str', 'sequence_nr',
       'stack', 'thread', 'trace_name', 'use_device', 'r', 'tr.start',
       'tr.end'],
      dtype='object')

In [168]:
def move(_from, _to, _l):
    _l.insert(_to, _l.pop(_from))

In [36]:
prof.key_averages()[0]

<FunctionEventAvg key=ProfilerStep* self_cpu_time=11.346ms cpu_time=64.290ms  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=55556256 cuda_memory_usage=0>