In [1]:
# Record which GPU the timings below were taken on; `cut` keeps only the text
# before the first '(' on each line of the `nvidia-smi -L` listing.
! nvidia-smi -L | cut -d '(' -f 1

GPU 0: GeForce RTX 2070 SUPER 


In [1]:
import torch
import time
import timeit
import itertools

# Number of forward passes per measurement; main() uses it both for the warmup
# run and for the timed loop, and divides the elapsed time by it.
nb = 500

def main(s: str):
    """Benchmark ``torch.nn.AdaptiveAvgPool2d((1, 1))`` and save per-call timings.

    Every combination of input size, memory layout and device listed at the
    bottom of this function is measured. Each result is printed and also
    written to ``s + '.txt'`` as ``{size}, {mf}, {device} $ {seconds_per_call}``.

    Args:
        s: Stem of the output file; ``s + '.txt'`` is opened in write mode, so
            an existing file of that name is overwritten.

    Raises:
        AssertionError: If the pooled output does not match the mean taken
            over the last two dimensions of the input.
    """
    def prof(file, size, mf, device):
        """Time one (size, memory layout, device) configuration.

        Args:
            file: Open text file that receives one result line.
            size: 4-element shape of the random input tensor.
            mf: A ``torch.memory_format`` value, or the string
                ``'non_contiguous'`` to benchmark a strided view instead.
            device: Device string passed to ``torch.randint`` ('cpu' or 'cuda').
        """
        x = torch.randint(1, 10, size, dtype=torch.float, device=device)
        if mf == 'non_contiguous':
            # Keep every other element along all four dimensions, which gives a
            # strided view of the original storage.
            x = x[::2, ::2, ::2, ::2]
        else:
            x = x.to(memory_format=mf)

        net = torch.nn.AdaptiveAvgPool2d((1, 1))
        out = net(x)
        # Pooling to 1x1 must equal the plain mean over the last two dimensions.
        ref_out = x.mean((-1, -2)).view((x.size(0), x.size(1), 1, 1))
        assert torch.allclose(out, ref_out)

        # Only CUDA work needs an explicit barrier; calling it for CPU runs
        # would put unrelated overhead inside the timed region.
        on_cuda = torch.device(device).type == 'cuda'

        # warmup
        timeit.timeit(lambda: net(x), number=nb)
        if on_cuda:
            torch.cuda.synchronize()

        # perf_counter() is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        for _ in range(nb):
            out = net(x)
        if on_cuda:
            # Wait for all queued kernels to finish before stopping the clock.
            torch.cuda.synchronize()
        end = time.perf_counter()
        tc = (end - start) / nb

        print(f'{size}, {mf}, {device}'.ljust(50), f'{tc : .3e}')
        file.write(f'{size}, {mf}, {device} $ {tc}\n')


    print(torch.__version__)
    with open(s + '.txt', 'w') as file:
        for size, mf, device in itertools.product(
            [(2, 3, 4, 4), (4, 16, 32, 32), (8, 128, 64, 64), (16, 256, 224, 224)],
            [torch.contiguous_format, torch.channels_last, 'non_contiguous'],
            ['cpu', 'cuda']
        ):
            prof(file, size, mf, device)

In [2]:
# Run the benchmark with the currently installed torch build; results are
# written to before.txt.
main('before')

1.7.0a0+06c277f
(2, 3, 4, 4), torch.contiguous_format, cpu          3.459e-05
(2, 3, 4, 4), torch.contiguous_format, cuda         4.128e-05
(2, 3, 4, 4), torch.channels_last, cpu              2.748e-05
(2, 3, 4, 4), torch.channels_last, cuda             3.107e-05
(2, 3, 4, 4), non_contiguous, cpu                   3.695e-05
(2, 3, 4, 4), non_contiguous, cuda                  6.232e-05
(4, 16, 32, 32), torch.contiguous_format, cpu       6.258e-05
(4, 16, 32, 32), torch.contiguous_format, cuda      4.264e-05
(4, 16, 32, 32), torch.channels_last, cpu           5.217e-05
(4, 16, 32, 32), torch.channels_last, cuda          1.895e-04
(4, 16, 32, 32), non_contiguous, cpu                4.813e-05
(4, 16, 32, 32), non_contiguous, cuda               6.162e-05
(8, 128, 64, 64), torch.contiguous_format, cpu      1.201e-04
(8, 128, 64, 64), torch.contiguous_format, cuda     4.271e-05
(8, 128, 64, 64), torch.channels_last, cpu          1.303e-03
(8, 128, 64, 64), torch.channels_last, cuda         1.

In [2]:
# Run the same benchmark again (the printed torch version shows which build is
# active); results are written to after.txt.
main('after')

1.7.0a0+8bffb06
(2, 3, 4, 4), torch.contiguous_format, cpu          3.064e-05
(2, 3, 4, 4), torch.contiguous_format, cuda         3.307e-05
(2, 3, 4, 4), torch.channels_last, cpu              2.707e-05
(2, 3, 4, 4), torch.channels_last, cuda             3.385e-05
(2, 3, 4, 4), non_contiguous, cpu                   2.694e-05
(2, 3, 4, 4), non_contiguous, cuda                  3.310e-05
(4, 16, 32, 32), torch.contiguous_format, cpu       5.311e-05
(4, 16, 32, 32), torch.contiguous_format, cuda      3.229e-05
(4, 16, 32, 32), torch.channels_last, cpu           6.814e-05
(4, 16, 32, 32), torch.channels_last, cuda          4.324e-05
(4, 16, 32, 32), non_contiguous, cpu                4.246e-05
(4, 16, 32, 32), non_contiguous, cuda               3.695e-05
(8, 128, 64, 64), torch.contiguous_format, cpu      1.053e-04
(8, 128, 64, 64), torch.contiguous_format, cuda     4.474e-05
(8, 128, 64, 64), torch.channels_last, cpu          2.625e-04
(8, 128, 64, 64), torch.channels_last, cuda         4.

In [3]:
import re

def readfile(fn):
    """Load a benchmark result file into a dict of times scaled by 1000.

    Each non-blank line must look like ``<key> $ <time>``. The key is the text
    to the left of the ``$`` and is kept verbatim (including the space before
    the ``$``), so keys from different result files compare equal. The time is
    parsed as a float and multiplied by 1000 (the files written by ``main``
    hold seconds per call, so the values returned here are milliseconds).

    Args:
        fn: Path of the result file to read.

    Returns:
        dict mapping key text to the scaled time, in file order.

    Raises:
        ValueError: If a non-blank line has no ``$`` separator or its time
            field is not a valid float.
    """
    with open(fn, 'r') as f:
        fl = f.readlines()

    d = {}
    for lineno, _line in enumerate(fl, 1):
        _line = _line.rstrip()
        if not _line:
            # Tolerate blank or trailing empty lines instead of failing on unpack.
            continue
        # A plain string split on the literal '$' replaces the regex '\$',
        # which relied on an invalid escape sequence in a non-raw string.
        key, sep, t = _line.rpartition('$')
        if not sep:
            raise ValueError(f"{fn}:{lineno}: expected '<key> $ <time>', got {_line!r}")
        d[key] = float(t) * 1000

    return d

def compare(f, before: str, *afters):
    """Print a before/after timing comparison and write it as a Markdown table.

    Rows follow the key order of the ``before`` file. The first ``after`` file
    fills the ``time_after`` column; every further file gets its own
    ``gpu_time_<name>`` column, where ``<name>`` is the file name without a
    trailing ``.txt``.

    Args:
        f: Writable text file that receives the Markdown table.
        before: Path of the baseline result file (read with ``readfile``).
        *afters: One or more result files to compare against the baseline.

    Raises:
        AssertionError: If no ``after`` file is given.
        KeyError: If a key of the ``before`` file is missing from an ``after`` file.
    """
    assert len(afters) >= 1, 'provide at least one after data'

    def column_label(path):
        # str.rstrip('.txt') strips any trailing '.', 't' or 'x' characters
        # rather than the suffix (e.g. 'next.txt' -> 'ne'), so remove the
        # extension explicitly.
        suffix = '.txt'
        return path[:-len(suffix)] if path.endswith(suffix) else path

    print('shape'.ljust(55), 'time_before (ms), time_after (ms)', end='')
    f.write('| shape | time_before (ms) | time_after (ms) |')
    for after in afters[1:]:
        label = column_label(after)
        print(', gpu_time_' + label, end='')
        f.write(' gpu_time_' + label + ' (ms) |')
    print()
    f.write('\n')
    # One separator cell per column: shape, time_before, and one per after file.
    f.write('| --- ' * (len(afters) + 2) + '| \n')

    d_b = readfile(before)
    d_as = [readfile(after) for after in afters]

    for key in d_b:
        time_before = d_b[key]
        time_after = d_as[0][key]

        print(f'{key: <55} {time_before: .3f}, {time_after: .3f}, ' + ' '*5, end='')
        f.write(f'| {key} | {time_before: .3f} | {time_after: .3f} | ')
        for d_a in d_as[1:]:
            time_after = d_a[key]
            print(f'{time_after: .3f}, ', end='')
            f.write(f'{time_after: .3f} |')
        print()
        f.write('\n')

# Build the comparison from the two result files: the rows are echoed to stdout
# and the Markdown version of the table is saved to table.md.
with open('table.md', 'w') as f:
    compare(f, 'before.txt', 'after.txt')

shape                                                   time_before (ms), time_after (ms)
(2, 3, 4, 4), torch.contiguous_format, cpu               0.035,  0.031,      
(2, 3, 4, 4), torch.contiguous_format, cuda              0.041,  0.033,      
(2, 3, 4, 4), torch.channels_last, cpu                   0.027,  0.027,      
(2, 3, 4, 4), torch.channels_last, cuda                  0.031,  0.034,      
(2, 3, 4, 4), non_contiguous, cpu                        0.037,  0.027,      
(2, 3, 4, 4), non_contiguous, cuda                       0.062,  0.033,      
(4, 16, 32, 32), torch.contiguous_format, cpu            0.063,  0.053,      
(4, 16, 32, 32), torch.contiguous_format, cuda           0.043,  0.032,      
(4, 16, 32, 32), torch.channels_last, cpu                0.052,  0.068,      
(4, 16, 32, 32), torch.channels_last, cuda               0.190,  0.043,      
(4, 16, 32, 32), non_contiguous, cpu                     0.048,  0.042,      
(4, 16, 32, 32), non_contiguous, cuda               