In [1]:
# Record which GPU(s) the benchmark ran on; `cut` keeps only the text before the first '(' on each line.
! nvidia-smi -L | cut -d '(' -f 1

GPU 0: GeForce RTX 2070 SUPER 


In [1]:
import torch
import time
import itertools

# Number of torch.inverse calls per timed loop; reported times are the loop total divided by this.
nb = 500

def main(s: str):
    """Benchmark ``torch.inverse`` on CPU and CUDA and log the mean time per call.

    For every (batch shape, matrix size, dtype) combination a random input is
    inverted ``nb`` times on the CPU and ``nb`` times on the GPU. The mean time
    per call in milliseconds is printed and written to ``s + '.txt'`` as
    ``"<batch> <n> <dtype>; <cpu_time>, <gpu_time>"``, one line per combination.

    Args:
        s: Run label. It is printed as a header and used as the stem of the
            output file ``s + '.txt'`` (opened in write mode, so an existing
            file is overwritten).

    Raises:
        RuntimeError: if the GPU input no longer matches its CPU copy after
            ``torch.inverse`` has been run on it.
    """
    def prof(b_, n_, dtype, f):
        """Time one configuration, print the result and append it to ``f``."""
        x = torch.randn(*b_, n_, n_, device='cuda', dtype=dtype)

        # CPU copy of the input: used for the CPU timing and, further down,
        # to check that the GPU input was left untouched.
        xc = x.clone().cpu()

        # Untimed warm-up call, mirroring the GPU warm-up loop below, so that
        # first-call overhead does not skew the CPU mean.
        torch.inverse(xc)

        # perf_counter is monotonic and high-resolution, unlike time.time().
        t1 = time.perf_counter()
        for _ in range(nb):
            yc = torch.inverse(xc)
        t2 = time.perf_counter()
        cpu_time = (t2 - t1) / nb * 1e3  # mean ms per call

        # GPU warm-up (untimed); the synchronize waits for all queued kernels.
        for _ in range(nb):
            y = torch.inverse(x)
        torch.cuda.synchronize()

        # Underscore-prefixed (private) torch.testing helper; returns a
        # (matches, message) pair. Tight tolerances: x must be unchanged.
        c, d = torch.testing._compare_tensors_internal(xc.cuda(), x, rtol=1e-7, atol=1e-7, equal_nan=False)
        if not c:
            print('original matrix compare')
            print(d)
            raise RuntimeError('original value modified')

        torch.cuda.synchronize()

        # Timed GPU loop; synchronize before reading the clock because CUDA
        # kernels are launched asynchronously.
        t1 = time.perf_counter()
        for _ in range(nb):
            y = torch.inverse(x)
        torch.cuda.synchronize()
        t2 = time.perf_counter()
        gpu_time = (t2 - t1) / nb * 1e3  # mean ms per call

        # Loose tolerance: a mismatch between CPU and GPU results is only
        # reported, it does not abort the benchmark.
        a, b = torch.testing._compare_tensors_internal(yc.cuda(), y, rtol=1e-3, atol=1e-3, equal_nan=False)
        if not a:
            print('numerical mismatch: inverse value compare')
            print(b)

        print(f'{b_} {n_} {dtype}'.ljust(35) + f'{cpu_time : .3f}  {gpu_time : .3f}')
        f.write(f'{b_} {n_} {dtype}; ' + f'{cpu_time : .3e}, {gpu_time : .3e}\n')
        torch.cuda.synchronize()

    print(s)
    print(torch.__version__)
    print()
    print('batch_size, matrix_size, dtype'.ljust(35) + 'cpu_time(ms), gpu_time(ms)')

    # Batch shapes: unbatched ([]) plus [1], [2], [4]; matrix sizes 2 .. 1024.
    shapes = itertools.product(
        [[]] + [[2**x] for x in range(3)],
        [2**i for i in range(1, 11)],
        [torch.float]
    )

    with open(s+'.txt', 'w') as f:
        for b, n, dtype in shapes:
            # Skip batched configurations where batch * matrix size reaches 2**15.
            if len(b) > 0 and b[0] * n >= 2**15:
                continue
            prof(b, n, dtype, f)


In [2]:
# Run the benchmark under the label 'before'; timings are written to before.txt.
main('before')

before
1.7.0a0+71510c6

batch_size, matrix_size, dtype     cpu_time(ms), gpu_time(ms)
[] 2 torch.float32                  0.121   7.318
[] 4 torch.float32                  0.009   7.370
[] 8 torch.float32                  0.011   7.376
[] 16 torch.float32                 0.039   7.359
[] 32 torch.float32                 0.102   7.437
[] 64 torch.float32                 0.148   7.568
[] 128 torch.float32                0.467   7.896
[] 256 torch.float32                1.135   11.618
[] 512 torch.float32                5.214   14.835
[] 1024 torch.float32               19.311   18.583
[1] 2 torch.float32                 0.009   0.110
[1] 4 torch.float32                 0.009   0.113
[1] 8 torch.float32                 0.010   0.113
[1] 16 torch.float32                0.015   0.119
[1] 32 torch.float32                0.031   0.196
[1] 64 torch.float32                0.068   0.422
[1] 128 torch.float32               0.330   0.803
[1] 256 torch.float32               1.072   1.684
[1] 512 to

In [2]:
# Run the benchmark under the label 'after'; timings are written to after.txt.
# NOTE(review): the recorded output shows a different torch build than the
# 'before' run, so this cell was presumably executed after switching builds — confirm.
main('after')

after
1.7.0a0+1e36b48

batch_size, matrix_size, dtype     cpu_time(ms), gpu_time(ms)
[] 2 torch.float32                  0.010   0.124
[] 4 torch.float32                  0.009   0.123
[] 8 torch.float32                  0.010   0.136
[] 16 torch.float32                 0.016   0.132
[] 32 torch.float32                 0.032   0.185
[] 64 torch.float32                 0.072   0.268
[] 128 torch.float32                0.349   0.491
[] 256 torch.float32                1.175   1.078
[] 512 torch.float32                5.042   2.636
numerical mismatch: inverse value compare
With rtol=0.001 and atol=0.001, found 653019 element(s) (out of 1048576) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0722208023071289 (10.46250057220459 vs. 10.534721374511719), which occurred at index (513, 268).
[] 1024 torch.float32               18.959   7.172
[1] 2 torch.float32                 0.010   0.130
[1] 4 torch.float32                 0.010 

In [3]:
import re

def readfile(fn):
    """Parse a timing log into CPU and GPU lookup tables.

    Every non-blank line is expected to look like
    ``"<key>; <cpu_time>, <gpu_time>"``. The key is everything before the last
    ``;`` and is kept verbatim, so it may itself contain commas (for example a
    multi-dimensional batch shape such as ``[2, 3]``). Blank lines are skipped.

    Args:
        fn: Path of the log file to read.

    Returns:
        A tuple ``(dc, dg)`` of dicts mapping each key to its CPU time and its
        GPU time, both as floats.

    Raises:
        ValueError: if a non-blank line has no ``;`` separator, does not hold
            exactly two comma-separated values after it, or a value is not a
            valid float.
    """
    dc = {}
    dg = {}
    with open(fn, 'r') as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.rstrip()
            if not line:
                continue

            # Split at the last ';' only, so commas inside the key are harmless.
            key, sep, timings = line.rpartition(';')
            fields = timings.split(',')
            if not sep or len(fields) != 2:
                raise ValueError(
                    f'{fn}:{lineno}: expected "<key>; <cpu_time>, <gpu_time>", got {line!r}'
                )

            dc[key] = float(fields[0])
            dg[key] = float(fields[1])

    return (dc, dg)

def _strip_txt_suffix(name: str) -> str:
    """Return ``name`` without a trailing ``'.txt'``; other names are returned unchanged."""
    suffix = '.txt'
    return name[:-len(suffix)] if name.endswith(suffix) else name


def compare(f, before: str, *afters):
    """Print a before/after timing comparison and write it to ``f`` as a markdown table.

    Each row holds the CPU time averaged over all runs, the GPU time of the
    ``before`` run, and the GPU time of every ``after`` run. A row is marked as
    regressed when the GPU time of the *first* ``after`` run exceeds the GPU
    time of the ``before`` run.

    Args:
        f: Writable text file object that receives the markdown table.
        before: Path of the baseline timing log (read with ``readfile``).
        *afters: One or more paths of timing logs to compare against the
            baseline. Column labels are these names with a trailing ``.txt``
            removed.

    Raises:
        AssertionError: if no ``afters`` path is given.
        KeyError: if a key of the ``before`` log is missing from an ``after`` log.
    """
    assert len(afters) >= 1, 'provide at least one after data'

    # str.rstrip('.txt') strips a *set of characters*, not a suffix (it turns
    # 'test.txt' into 'tes'), so the literal suffix is removed instead.
    labels = [_strip_txt_suffix(after) for after in afters]

    print('shape'.ljust(26), 'cpu_time, gpu_time_before (magma)', end='')
    f.write('| shape | cpu_time (ms) | gpu_time_before (magma) (ms) |')
    for label in labels:
        print(', gpu_time_' + label, end='')
        f.write(' gpu_time_' + label + ' (ms) |')
    print()
    f.write('\n')
    # Markdown separator row: three fixed columns plus one per after-file.
    f.write('| --- ' * (len(afters) + 3) + '| \n')

    dc_b, dg_b = readfile(before)
    dc_as = []
    dg_as = []
    for after in afters:
        dc_a, dg_a = readfile(after)
        dc_as.append(dc_a)
        dg_as.append(dg_a)

    for key in dc_b:
        # CPU time is averaged across the before run and every after run.
        cpu_time = (dc_b[key] + sum(dc_a[key] for dc_a in dc_as)) / (1 + len(dc_as))
        gpu_time_before = dg_b[key]
        gpu_time_after = dg_as[0][key]

        # Only the first after-file decides the regression flag.
        if gpu_time_after > gpu_time_before:
            gs = ' ' * 5 + '*' * 20 + ' regressed'
            gss = '***regressed'
        else:
            gs = ''
            gss = ''

        print(f'{key: <26} {cpu_time: .3f}, {gpu_time_before: .3f}, {gpu_time_after: .3f}, ' + ' '*5, end='')
        f.write(f'| {key} | {cpu_time: .3f} | {gpu_time_before: .3f} | {gpu_time_after: .3f} {gss} | ')
        for dg_a in dg_as[1:]:
            gpu_time_after = dg_a[key]
            print(f'{gpu_time_after: .3f}, ', end='')
            f.write(f'{gpu_time_after: .3f} |')
        print(gs)
        f.write('\n')

# Compare the two timing logs: the markdown table is written to table.md and a
# plain-text version is printed below.
with open('table.md', 'w') as f:
    compare(f, 'before.txt', 'after.txt')

shape                      cpu_time, gpu_time_before (magma), gpu_time_after
[] 2 torch.float32          0.066,  7.318,  0.124,      
[] 4 torch.float32          0.009,  7.370,  0.123,      
[] 8 torch.float32          0.011,  7.376,  0.136,      
[] 16 torch.float32         0.027,  7.359,  0.132,      
[] 32 torch.float32         0.067,  7.437,  0.185,      
[] 64 torch.float32         0.110,  7.568,  0.268,      
[] 128 torch.float32        0.408,  7.896,  0.491,      
[] 256 torch.float32        1.155,  11.620,  1.078,      
[] 512 torch.float32        5.128,  14.830,  2.636,      
[] 1024 torch.float32       19.135,  18.580,  7.172,      
[1] 2 torch.float32         0.009,  0.110,  0.130,           ******************** regressed
[1] 4 torch.float32         0.009,  0.113,  0.131,           ******************** regressed
[1] 8 torch.float32         0.011,  0.113,  0.133,           ******************** regressed
[1] 16 torch.float32        0.016,  0.119,  0.135,           ************