In [1]:
# List the GPUs reported by nvidia-smi; `cut -d '(' -f 1` keeps only the text
# before the first '(' on each line.
! nvidia-smi -L | cut -d '(' -f 1

GPU 0: GeForce RTX 2070 SUPER 


In [1]:
import torch
import time
import itertools

# Number of torch.inverse calls in each loop of main(); the reported per-call
# times are the loop's wall time divided by this count.
nb = 500

def main(s: str):
    """Benchmark ``torch.inverse`` on CPU and CUDA over a grid of shapes.

    Prints the run label, the torch version and one line per configuration,
    and writes the same timings to ``<s>.txt``, one line per configuration, as
    ``"<batch> <n> <dtype>; <cpu_ms>, <gpu_ms>"`` (milliseconds per call,
    averaged over the module-level ``nb`` iterations).

    Args:
        s: Label for this run; printed first and used as the output file stem.

    Raises:
        RuntimeError: if the CUDA input no longer matches its CPU copy after
            ``torch.inverse`` has been called on it.
    """
    def prof(b_, n_, dtype, f):
        """Time one (batch shape, matrix size, dtype) configuration.

        Args:
            b_: Batch dimensions as a list (``[]`` for a single matrix).
            n_: Side length of the square matrices.
            dtype: torch dtype of the random input.
            f: Open text file that receives the result line.
        """
        x = torch.randn(*b_, n_, n_, device='cuda', dtype=dtype)

        # CPU copy of the same data: used for the CPU timing and as the
        # reference for both checks below.
        xc = x.clone().cpu()

        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        t1 = time.perf_counter()
        for _ in range(nb):
            yc = torch.inverse(xc)
        t2 = time.perf_counter()
        cpu_time = (t2 - t1) / nb * 1e3

        # Untimed GPU pass before the measured loop.
        for _ in range(nb):
            y = torch.inverse(x)
        torch.cuda.synchronize()

        # The input must be unchanged by torch.inverse. Uses the public
        # torch.allclose instead of a private torch.testing helper.
        x_ref = xc.cuda()
        if not torch.allclose(x_ref, x, rtol=1e-7, atol=1e-7, equal_nan=False):
            print('original matrix compare')
            print(f'max abs difference: {(x_ref - x).abs().max().item()}')
            raise RuntimeError('original value modified')

        # CUDA kernels launch asynchronously, so synchronize before reading
        # the end timestamp.
        t1 = time.perf_counter()
        for _ in range(nb):
            y = torch.inverse(x)
        torch.cuda.synchronize()
        t2 = time.perf_counter()
        gpu_time = (t2 - t1) / nb * 1e3

        # Loose tolerance: only report a mismatch between the CPU and GPU
        # results, do not abort the benchmark.
        y_ref = yc.cuda()
        if not torch.allclose(y_ref, y, rtol=1e-3, atol=1e-3, equal_nan=False):
            print('numerical mismatch: inverse value compare')
            print(f'max abs difference: {(y_ref - y).abs().max().item()}')

        print(f'{b_} {n_} {dtype}'.ljust(35) + f'{cpu_time : .3f}  {gpu_time : .3f}')
        f.write(f'{b_} {n_} {dtype}; ' + f'{cpu_time : .3e}, {gpu_time : .3e}\n')
        torch.cuda.synchronize()

    print(s)
    print(torch.__version__)
    print()
    print('batch_size, matrix_size, dtype'.ljust(35) + 'cpu_time(ms), gpu_time(ms)')

    # Batch shapes: unbatched plus [1], [2], ..., [1024]; matrix sizes 2..1024.
    shapes = itertools.product(
        [[]] + [[2**x] for x in range(11)],
        [2**i for i in range(1, 11)],
        [torch.float]
    )

    with open(s+'.txt', 'w') as f:
        for b, n, dtype in shapes:
            # Skip batched configurations where batch * n reaches 2**15.
            if len(b) > 0 and b[0] * n >= 2**15:
                continue
            prof(b, n, dtype, f)


In [3]:
# Run the benchmark under the label 'before'; main() also writes the timings
# to before.txt.
main('before')

before
1.7.0a0+05f0053

batch_size, matrix_size, dtype     cpu_time(ms), gpu_time(ms)
[] 2 torch.float32                  0.010   7.257
[] 4 torch.float32                  0.010   7.281
[] 8 torch.float32                  0.012   7.333
[] 16 torch.float32                 0.017   7.382
[] 32 torch.float32                 0.034   7.457
[] 64 torch.float32                 0.071   8.207
[] 128 torch.float32                0.361   8.005
[] 256 torch.float32                1.062   11.516
[] 512 torch.float32                5.277   14.600
[] 1024 torch.float32               17.885   19.154
[1] 2 torch.float32                 0.010   0.111
[1] 4 torch.float32                 0.010   0.117
[1] 8 torch.float32                 0.011   0.114
[1] 16 torch.float32                0.016   0.121
[1] 32 torch.float32                0.033   0.177
[1] 64 torch.float32                0.071   0.421
[1] 128 torch.float32               0.281   0.799
[1] 256 torch.float32               1.080   1.662
[1] 512 to

In [2]:
# Run the benchmark under the label 'after'; main() also writes the timings
# to after.txt.
main('after')

after
1.7.0a0+c98b599

batch_size, matrix_size, dtype     cpu_time(ms), gpu_time(ms)
[] 2 torch.float32                  0.010   0.116
[] 4 torch.float32                  0.011   0.120
[] 8 torch.float32                  0.012   0.128
[] 16 torch.float32                 0.016   0.122
[] 32 torch.float32                 0.032   0.178
[] 64 torch.float32                 0.073   0.259
[] 128 torch.float32                0.375   0.479
[] 256 torch.float32                1.113   1.054
[] 512 torch.float32                4.974   2.568
[] 1024 torch.float32               19.628   6.901
[1] 2 torch.float32                 0.009   0.169
[1] 4 torch.float32                 0.010   0.160
[1] 8 torch.float32                 0.011   0.162
[1] 16 torch.float32                0.016   0.158
[1] 32 torch.float32                0.032   0.208
[1] 64 torch.float32                0.068   0.292
[1] 128 torch.float32               0.344   0.521
[1] 256 torch.float32               1.074   1.097
[1] 512 torch.

[1024] 8 torch.float32              1.001   0.105
[1024] 16 torch.float32             4.317   0.173


In [3]:
import re

def readfile(fn):
    """Parse a timing file whose lines look like ``"<key>; <cpu>, <gpu>"``.

    The key is everything before the last ``';'`` and is kept verbatim, so it
    may itself contain commas (e.g. a multi-dimensional batch shape such as
    ``[2, 3]``). Blank lines are ignored.

    Args:
        fn: Path of the file to read.

    Returns:
        ``(dc, dg)``: two dicts mapping each key to its CPU time and GPU time
        as floats.

    Raises:
        ValueError: if a non-blank line does not have the expected layout or
            a time field is not a valid float.
    """
    dc = {}
    dg = {}
    with open(fn, 'r') as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.rstrip()
            if not line:
                continue

            # Split off the key first so commas inside it are not mistaken
            # for the separator between the two time fields.
            key, sep, times = line.rpartition(';')
            fields = times.split(',')
            if not sep or len(fields) != 2:
                raise ValueError(
                    f'{fn}:{lineno}: expected "<key>; <cpu>, <gpu>", got {line!r}'
                )

            cpu_time, gpu_time = fields
            dc[key] = float(cpu_time)
            dg[key] = float(gpu_time)

    return (dc, dg)

def compare(before_fn='before.txt', after_fn='after.txt'):
    """Print a per-shape table of CPU time and GPU time before vs. after.

    For every key in the "before" file, prints the mean of the two CPU times
    and both GPU times, and marks the row as regressed when the "after" GPU
    time is larger than the "before" one. Keys that are absent from the
    "after" file are reported and skipped instead of raising ``KeyError``.

    Args:
        before_fn: Timing file of the baseline run.
        after_fn: Timing file of the run being compared against it.
    """
    print('shape'.ljust(26), 'cpu_time, gpu_time_before (magma), gpu_time_after (cusolver/cublas)')
    dc_b, dg_b = readfile(before_fn)
    dc_a, dg_a = readfile(after_fn)

    for key, cpu_before in dc_b.items():
        if key not in dc_a:
            # The two runs may not cover the same shapes (e.g. one aborted).
            print(f'{key: <26} (no entry in {after_fn}; skipped)')
            continue

        # CPU time should not depend on the GPU backend, so average both runs.
        cpu_time = 0.5 * (cpu_before + dc_a[key])
        gpu_time_before = dg_b[key]
        gpu_time_after = dg_a[key]

        gs = '*' * 20 + ' regressed' if gpu_time_after > gpu_time_before else ''

        print(f'{key: <26} {cpu_time: .3f}, {gpu_time_before: .3f}, {gpu_time_after: .3f} {gs}')

# Print the comparison table built from before.txt and after.txt.
compare()

shape                      cpu_time, gpu_time_before (magma), gpu_time_after (cusolver/cublas)
[] 2 torch.float32          0.010,  7.257,  0.116 
[] 4 torch.float32          0.010,  7.281,  0.120 
[] 8 torch.float32          0.012,  7.333,  0.128 
[] 16 torch.float32         0.016,  7.382,  0.122 
[] 32 torch.float32         0.033,  7.457,  0.178 
[] 64 torch.float32         0.072,  8.207,  0.259 
[] 128 torch.float32        0.368,  8.005,  0.478 
[] 256 torch.float32        1.087,  11.520,  1.054 
[] 512 torch.float32        5.126,  14.600,  2.568 
[] 1024 torch.float32       18.755,  19.150,  6.901 
[1] 2 torch.float32         0.009,  0.111,  0.169 ******************** regressed
[1] 4 torch.float32         0.010,  0.117,  0.160 ******************** regressed
[1] 8 torch.float32         0.011,  0.114,  0.162 ******************** regressed
[1] 16 torch.float32        0.016,  0.121,  0.158 ******************** regressed
[1] 32 torch.float32        0.033,  0.177,  0.208 *****************