In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import optim

In [2]:
import sys

# Put the parent directory on the module search path (presumably so the
# project-local imports in the following cell resolve — confirm).
# The membership guard keeps the cell idempotent: re-running it no longer
# pushes another duplicate "../" entry onto sys.path.
if "../" not in sys.path:
    sys.path.insert(1, "../")

In [3]:
from train import *
from data_preprocessing import *
from Models.darknet19 import *

In [4]:
# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Build the Darknet19 network with 10 output classes.
# NOTE(review): the model is not moved to `device` in this cell — presumably
# train() handles placement; confirm.
model = Darknet19(num_classes=10)

In [6]:
# `os` is imported explicitly here: no cell in this notebook imports it, so it
# was only reachable as a side effect of the `from ... import *` lines.
import os

# Dataset root folder, and the norms.json file inside it that is handed to
# get_means / get_stds.
data_path = '../../dummy_datasets/'
norms_path = os.path.join(data_path, 'norms.json')

In [7]:
# Both helpers receive the same arguments: the norms file path and no loader.
# NOTE(review): presumably train_loader=None makes them read the values stored
# in norms.json instead of recomputing them — confirm against the helpers.
norm_source = {"path": norms_path, "train_loader": None}
means = get_means(**norm_source)
stds = get_stds(**norm_source)

Means are: [0.4405549168586731, 0.4407285749912262, 0.4381718039512634]
stds are: [0.25142669677734375, 0.25270089507102966, 0.25131651759147644]


In [8]:
# Preprocessing applied to every sample, in this order: convert to tensor,
# resize to 224x224, then normalise with the means/stds obtained above.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=means, std=stds),
]
transformations = transforms.Compose(preprocessing_steps)

In [9]:
# The two splits share every option except the `train` flag.
dataset_options = dict(path=data_path, transform=transformations, half=False, show=False)
train_dataset = ImageNetSubset(train=True, **dataset_options)
val_dataset = ImageNetSubset(train=False, **dataset_options)

In [10]:
# Number of passes over the training data.
epochs = 3

# Both loaders use the same batch size and worker count; only the training
# loader shuffles.
loader_options = {"batch_size": 256, "num_workers": 4}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [11]:
# SGD with momentum and L2 weight decay over all model parameters.
sgd_settings = {"lr": 0.001, "momentum": 0.9, "weight_decay": 0.0001}
optimizer = optim.SGD(model.parameters(), **sgd_settings)

In [12]:
# Negative log-likelihood loss, averaged over the batch ('mean' is the default
# reduction, spelled out here). NLLLoss expects log-probabilities as input.
# NOTE(review): assumes Darknet19 ends in a log-softmax — confirm in the model.
loss_fn = nn.NLLLoss(reduction='mean')

In [13]:
# Profiler object that is later handed to train(). Traces go to
# ./log/darknet19 through the TensorBoard trace handler.
# No schedule is configured; a one-shot schedule
# (wait=0, warmup=0, active=1, repeat=1) was tried and left disabled.
trace_handler = torch.profiler.tensorboard_trace_handler('./log/darknet19')
profiler_options = dict(
    record_shapes=True,
    profile_memory=True,
    with_flops=True,
    with_modules=True,
    with_stack=True,
)
prof = torch.profiler.profile(on_trace_ready=trace_handler, **profiler_options)


Reference — PyTorch tutorial "Profiling your PyTorch Module":
https://pytorch.org/tutorials/beginner/profiler.html

In [14]:
# Run training for `epochs` epochs with the profiler passed in as the last
# argument. The call unpacks three results: a history object, gradient
# statistics, and a profiler, which is bound back to `prof`.
# NOTE(review): train() is not defined in this notebook (presumably it comes
# from `from train import *`), so the argument order is taken on trust — confirm.
darknet_history, darknet_gradient_stats, prof = train(epochs, train_loader, val_loader, model, optimizer, loss_fn, prof)

2024-11-16 18:25:15.257475 Epoch 1: 
2024-11-16 18:25:21.629539 Batch 1: 
torch.Size([30, 10, 1, 1])
torch.Size([30, 10])
[Train] Accuracy: 13.3333, Loss per batch: 2.2849
2024-11-16 18:25:33.310796 Batch 1: 
torch.Size([256, 10, 1, 1])
torch.Size([256, 10])
2024-11-16 18:25:44.577420 Batch 2: 
torch.Size([244, 10, 1, 1])
torch.Size([244, 10])
[Val] Accuracy: 10.0%, loss per batch: 2.3026
2024-11-16 18:25:55.961150 Epoch 2: 
2024-11-16 18:26:04.550771 Batch 1: 
torch.Size([30, 10, 1, 1])
torch.Size([30, 10])
[Train] Accuracy: 23.3333, Loss per batch: 2.2079
2024-11-16 18:26:16.975491 Batch 1: 
torch.Size([256, 10, 1, 1])
torch.Size([256, 10])
2024-11-16 18:26:26.980195 Batch 2: 
torch.Size([244, 10, 1, 1])
torch.Size([244, 10])
[Val] Accuracy: 10.0%, loss per batch: 2.3027
2024-11-16 18:26:39.092225 Epoch 3: 
2024-11-16 18:26:45.962882 Batch 1: 
torch.Size([30, 10, 1, 1])
torch.Size([30, 10])
[Train] Accuracy: 40.0, Loss per batch: 2.0945
2024-11-16 18:26:59.123262 Batch 1: 
torch.Size

In [40]:
# Printing the profiler object only shows its default object repr; the
# per-operator statistics are obtained with prof.key_averages().table().
print(prof)

<torch.profiler.profiler.profile object at 0x00000241AC07F920>


## Inference time
## Training time

In [14]:
# Take element 0 of the first training sample (the image) and add a leading
# batch axis so it can be passed to the model on its own.
first_sample = train_dataset[0]
img = first_sample[0].unsqueeze(dim=0)

From YOLOv2 paper: Darknet-19 only requires 5.58 billion operations

In [34]:
# Profile one forward pass and print the per-operator statistics.
# `use_device` is deliberately left at its default: per the PyTorch docs it
# selects an accelerator backend (e.g. 'cuda'), and 'cpu' is not one of the
# documented values. CPU events are recorded without it.
with torch.autograd.profiler.profile(record_shapes=True,
                                     profile_memory=True,
                                     with_flops=True,
                                     with_modules=True,
                                     with_stack=True) as prof:
    out = model(img)
# row_limit=-1 prints every row of the table instead of a truncated view.
print(prof.key_averages().table(row_limit=-1))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  Total GFLOPs  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Conv block         6.79%       7.276ms        85.61%      91.696ms       5.094ms      51.73 Mb           0 b            18            --  
                     aten::conv2d         0.17%     184.700us        62.52%      66.961ms       3.524ms      17.23 Mb           0 b            19         5.483  
                aten::convolution         0.44%     469.900us        62.35%      66.776ms       3.515ms      17.23 Mb           0 b            19            --  
               aten::_convol

In [19]:
# Scratch check: add up percentage figures read off the profiler table.
# NOTE(review): presumably the "CPU total %" of the decorated top-level
# blocks — confirm which rows were used.
85.45+14.01+0.17+0.06+0.08

99.77000000000001

In [27]:
# Scratch check in absolute terms: sum of "CPU total" times, in milliseconds.
# NOTE(review): presumably the same table rows as the percentage sum in the
# preceding cell — confirm.
88.930+14.583+0.180900+0.062100+0.088200 # CPU Total (time, ms)

103.8442

In [30]:
# Cross-check: 99.77 % of 104.072 ms should land close to the ~103.84 ms
# obtained by summing the individual "CPU total" times.
# NOTE(review): 104.072 is presumably the overall CPU time of the profiled run — confirm.
104.072*0.9977 # CPU Total % is the same thing as CPU Total, just in relative values

103.8326344

In [26]:
# "Total GFLOPs" value shown for aten::conv2d in the profiler table above.
5.483 # GFLOPs

5.483

Conclusion: my decorated functions account for 99.77% of the operations that happen during inference. There are marginal discrepancies between the CPU Total % and CPU Total figures, but they are insignificant. The GFLOPs counted by the profiler almost match the figure given in the original paper: the paper states 5.58 GFLOPs, while the profiler reports 5.48 GFLOPs for my implementation.

In [45]:
from torch.profiler import profile, record_function, ProfilerActivity

Conclusion:
- I can record how much time and memory every operation took
- Which means I can find computational bottlenecks
- Using TensorBoard, I can look up how long the memory allocated by each operation stayed in use
- I can write my own operators, I think
- I do not need to rewrite DataLoader, as it basically occupies no time or memory

In [48]:
# Record CPU activity (with input shapes and memory usage) for one forward
# pass; everything inside is grouped under the "model_inference" label.
cpu_profiler = profile(
    activities=[ProfilerActivity.CPU],
    record_shapes=True,
    profile_memory=True,
)
with cpu_profiler as prof, record_function("model_inference"):
    model(img)

In [54]:
# Aggregate the recorded events and print them as a text table.
operator_stats = prof.key_averages()
print(operator_stats.table())

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         4.18%       4.529ms       100.00%     108.450ms     108.450ms           0 b     -60.64 Mb             1  
                     aten::conv2d         0.15%     166.100us        68.60%      74.399ms       3.916ms      17.23 Mb           0 b            19  
                aten::convolution         0.54%     589.100us        68.45%      74.233ms       3.907ms      17.23 Mb           0 b            19  
               aten::_convolution         0.31%     334.000us        67.91%      73.644ms       3.876ms      17.

In [57]:
# Display the first aggregated event to see which fields a FunctionEventAvg
# exposes (CPU/CUDA times, memory usage, input shapes).
prof.key_averages()[0]

<FunctionEventAvg key=aten::conv2d self_cpu_time=143.900us cpu_time=3.958ms  self_cuda_time=0.000us cuda_time=0.000us input_shapes= cpu_memory_usage=18065320 cuda_memory_usage=0>

There is also a cuda_memory_usage field (alongside cuda_time), so I can measure CUDA memory and time as well.