In [3]:
from timeit import default_timer as timer
import torch
import torch.nn as nn
from models import vgg,layers
import torch.utils.benchmark as benchmark
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
@torch.no_grad()
def measure_time_host(
    model: nn.Module,
    input_tensor: torch.Tensor,
    num_repeats: int = 100,
    num_warmups: int = 10,
    synchronize: bool = True,
    continuous_measure: bool = False,
) -> float:

    for _ in range(num_warmups):
        _ = model.forward(input_tensor)
    torch.cuda.synchronize()

    elapsed_time_ms = 0

    if continuous_measure:
        start = timer()
        for _ in range(num_repeats):
            _ = model.forward(input_tensor)
        if synchronize:
            torch.cuda.synchronize()
        end = timer()
        elapsed_time_ms = (end - start) * 1000

    else:
        for _ in range(num_repeats):
            start = timer()
            _ = model.forward(input_tensor)
            if synchronize:
                torch.cuda.synchronize()
            end = timer()
            elapsed_time_ms += (end - start) * 1000

    return elapsed_time_ms / num_repeats

@torch.no_grad()
def measure_time_device(
    model: nn.Module,
    input_tensor: torch.Tensor,
    num_repeats: int = 100,
    num_warmups: int = 10,
    synchronize: bool = True,
    continuous_measure: bool = False,
) -> float:

    for _ in range(num_warmups):
        _ = model.forward(input_tensor)
    torch.cuda.synchronize()

    elapsed_time_ms = 0

    if continuous_measure:
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        for _ in range(num_repeats):
            _ = model.forward(input_tensor)
        end_event.record()
        if synchronize:
            # This has to be synchronized to compute the elapsed time.
            # Otherwise, there will be runtime error.
            torch.cuda.synchronize()
        elapsed_time_ms = start_event.elapsed_time(end_event)

    else:
        for _ in range(num_repeats):
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record()
            _ = model.forward(input_tensor)
            end_event.record()
            if synchronize:
                # This has to be synchronized to compute the elapsed time.
                # Otherwise, there will be runtime error.
                torch.cuda.synchronize()
            elapsed_time_ms += start_event.elapsed_time(end_event)

    return elapsed_time_ms / num_repeats

In [3]:
@torch.no_grad()
def run_inference(model: nn.Module,
                  input_tensor: torch.Tensor) -> torch.Tensor:

    return model.forward(input_tensor)

In [4]:
def main(model, input_tensor) -> None:

    num_warmups = 100
    num_repeats = 1000
    input_shape = (1,224, 224,4)

    device = torch.device("cuda:0")

    # model = torchvision.models.resnet18(pretrained=False)

    model.eval()
    model.cuda()

    # Input tensor
    input_tensor = input_tensor.cuda()

    torch.cuda.synchronize()

    print("Latency Measurement Using CPU Timer...")
    for continuous_measure in [True, False]:
        for synchronize in [True, False]:
            try:
                latency_ms = measure_time_host(
                    model=model,
                    input_tensor=input_tensor,
                    num_repeats=num_repeats,
                    num_warmups=num_warmups,
                    synchronize=synchronize,
                    continuous_measure=continuous_measure,
                )
                print(f"|"
                      f"Synchronization: {synchronize!s:5}| "
                      f"Continuous Measurement: {continuous_measure!s:5}| "
                      f"Latency: {latency_ms:.5f} ms| ")
            except Exception as e:
                print(f"|"
                      f"Synchronization: {synchronize!s:5}| "
                      f"Continuous Measurement: {continuous_measure!s:5}| "
                      f"Latency: N/A     ms| ")
            torch.cuda.synchronize()

    print("Latency Measurement Using CUDA Timer...")
    for continuous_measure in [True, False]:
        for synchronize in [True, False]:
            try:
                latency_ms = measure_time_device(
                    model=model,
                    input_tensor=input_tensor,
                    num_repeats=num_repeats,
                    num_warmups=num_warmups,
                    synchronize=synchronize,
                    continuous_measure=continuous_measure,
                )
                print(f"|"
                      f"Synchronization: {synchronize!s:5}| "
                      f"Continuous Measurement: {continuous_measure!s:5}| "
                      f"Latency: {latency_ms:.5f} ms| ")
            except Exception as e:
                print(f"|"
                      f"Synchronization: {synchronize!s:5}| "
                      f"Continuous Measurement: {continuous_measure!s:5}| "
                      f"Latency: N/A     ms| ")
            torch.cuda.synchronize()

    print("Latency Measurement Using PyTorch Benchmark...")
    num_threads = 1
    timer = benchmark.Timer(stmt="run_inference(model, input_tensor)",
                            setup="from __main__ import run_inference",
                            globals={
                                "model": model,
                                "input_tensor": input_tensor
                            },
                            num_threads=num_threads,
                            label="Latency Measurement",
                            sub_label="torch.utils.benchmark.")

    profile_result = timer.timeit(num_repeats)
    # https://pytorch.org/docs/stable/_modules/torch/utils/benchmark/utils/common.html#Measurement
    print(f"Latency: {profile_result.mean * 1000:.5f} ms")


if __name__=="__main__":
    print("--------- TOTAL PYTORCH MODEL -----------")
    model = vgg.vgg16()
    tensor = torch.rand((1,4, 224,224))
    main(model, tensor)
    print("--------- TOTAL FLOAT MODEL -----------")
    model = vgg.float_vgg16()
    tensor = torch.rand((1,224,224,4))
    main(model, tensor)
    print("\n\n--------- TOTAL INT8 MODEL -----------")
    model = vgg.int_vgg16()
    tensor = torch.randint(-128,127,(1,224,224,4),dtype=torch.int8)
    main(model, tensor)
    print("\n\n--------- PYTORCH Conv2d 1 LAYER -----------")
    layer = nn.Conv2d(4,64,3)
    tensor = torch.rand((1,4, 224,224))
    main(layer, tensor)
    print("\n\n--------- FloatConv2d 1 LAYER -----------")
    layer = layers.FLOATConv2d(4,64,3)
    tensor = torch.rand((1,224,224,4))
    main(layer, tensor)
    print("\n\n--------- IntConv2d 1 LAYER -----------")
    layer = layers.IntConv2d(4,64,3)
    tensor = torch.randint(-128,127,(1,224,224,4),dtype=torch.int8)
    main(layer, tensor)

    print("\n\n--------- PYTORCH Linear 1 LAYER -----------")
    layer = nn.Linear(25088,4096)
    tensor = torch.rand((1,25088))
    main(layer, tensor)
    print("\n\n--------- FloatLinear 1 LAYER -----------")
    layer = layers.FLOATLinear(25088,4096)
    tensor = torch.rand((1,25088))
    main(layer, tensor)
    print("\n\n--------- IntLinear 1 LAYER -----------")
    layer = layers.IntLinear(25088,4096)
    tensor = torch.randint(-128,127,(1,25088),dtype=torch.int8)
    main(layer, tensor)

--------- TOTAL PYTORCH MODEL -----------
Latency Measurement Using CPU Timer...
|Synchronization: True | Continuous Measurement: True | Latency: 2.77948 ms| 
|Synchronization: False| Continuous Measurement: True | Latency: 2.74886 ms| 
|Synchronization: True | Continuous Measurement: False| Latency: 2.81305 ms| 
|Synchronization: False| Continuous Measurement: False| Latency: 2.70466 ms| 
Latency Measurement Using CUDA Timer...
|Synchronization: True | Continuous Measurement: True | Latency: 2.74054 ms| 
|Synchronization: False| Continuous Measurement: True | Latency: N/A     ms| 
|Synchronization: True | Continuous Measurement: False| Latency: 2.69934 ms| 
|Synchronization: False| Continuous Measurement: False| Latency: N/A     ms| 
Latency Measurement Using PyTorch Benchmark...
Latency: 2.74536 ms
--------- TOTAL FLOAT MODEL -----------
Latency Measurement Using CPU Timer...
|Synchronization: True | Continuous Measurement: True | Latency: 5.96707 ms| 
|Synchronization: False| Contin

In [5]:


model = vgg.vgg16().cuda()
tensor = torch.rand((1,4, 224,224)).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as prof:
    with torch.no_grad():
        y = model(tensor)

STAGE:2023-11-25 18:33:49 1805140:1805140 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-11-25 18:33:49 1805140:1805140 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-11-25 18:33:49 1805140:1805140 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [6]:
print(prof.key_averages().table())

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         5.20%     186.000us        52.31%       1.870ms     143.846us       0.000us         0.00%       2.029ms     156.077us           0 b           0 b      54.34 Mb      11.81 M

In [4]:
tensor = torch.rand((1,224,224,4)).cuda()
layer = layers.FLOATConv2d(4,64)
layer.cuda()
with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as prof:
    with torch.no_grad():
        y = layer(tensor)

STAGE:2023-11-25 19:57:02 1850541:1850541 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-11-25 19:57:03 1850541:1850541 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-11-25 19:57:03 1850541:1850541 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [5]:
print(prof.key_averages().table())

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     cudaGetDeviceCount         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           0 b           0 b           0 b           0 

In [7]:
tensor = torch.randint(-128,127,(1,224,224,4), dtype=torch.int8).cuda()
layer = layers.IntConv2d(4,64)
layer.cuda()
with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as prof:
    with torch.no_grad():
        y = layer(tensor)

STAGE:2023-11-25 19:58:42 1850541:1850541 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-11-25 19:58:42 1850541:1850541 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-11-25 19:58:42 1850541:1850541 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [8]:
print(prof.key_averages().table())

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                  cudaStreamIsCapturing         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           0 b           0 b             1  
                                  cudaStreamGetPriority         0.00%       0.000us         0.0