In [1]:
! nvidia-smi -L | cut -d '(' -f 1

GPU 0: GeForce RTX 2070 SUPER 


In [1]:
import torch
import time
import itertools

# Number of torch.inverse calls in each timing loop; the reported per-call
# time is the loop's elapsed time divided by this count.
nb = 500

def main(s: str):
    """Benchmark ``torch.inverse`` on CPU and CUDA over a grid of shapes.

    For every (batch, matrix size, dtype) configuration a row with the mean
    per-call time in milliseconds is printed, and the same timings are written
    to ``<s>.txt`` as ``"<batch> <n> <dtype>; <cpu_ms>, <gpu_ms>"``.

    Requires a CUDA device and the module-level ``nb`` (iterations per loop).

    Args:
        s: Label for this run; printed as a header and used as the stem of
            the output file name.

    Raises:
        RuntimeError: if the CUDA input no longer matches its CPU copy after
            ``torch.inverse`` has been applied to it.
    """

    def compare_tensors(actual, expected, rtol, atol):
        """Return ``(ok, debug_msg)`` for an element-wise closeness check."""
        # The benchmark was written against this private helper; use it when
        # the running torch build still ships it so the output is unchanged.
        internal = getattr(torch.testing, '_compare_tensors_internal', None)
        if internal is not None:
            return internal(actual, expected, rtol=rtol, atol=atol, equal_nan=False)
        # Fallback for builds without the private helper.
        if torch.allclose(actual, expected, rtol=rtol, atol=atol, equal_nan=False):
            return True, None
        max_diff = (actual - expected).abs().max().item()
        return False, f'max abs difference: {max_diff}'

    def prof(b_, n_, dtype, f):
        """Time one configuration and record it.

        Args:
            b_: List of leading batch dimensions (``[]`` for a single matrix).
            n_: Side length of the square matrices.
            dtype: torch dtype of the random input.
            f: Open, writable text file that receives one result line.
        """
        x = torch.randn(*b_, n_, n_, device='cuda', dtype=dtype)

        # CPU copy of the same data, used for the CPU timing and as reference.
        xc = x.clone().cpu()

        # perf_counter is monotonic and high-resolution, unlike time.time().
        t1 = time.perf_counter()
        for _ in range(nb):
            yc = torch.inverse(xc)
        t2 = time.perf_counter()
        cpu_time = (t2 - t1) / nb * 1e3

        # Untimed GPU pass: it runs before the timed loop and lets us verify
        # below that torch.inverse did not modify its input.
        for _ in range(nb):
            y = torch.inverse(x)
        torch.cuda.synchronize()

        unchanged, detail = compare_tensors(xc.cuda(), x, rtol=1e-7, atol=1e-7)
        if not unchanged:
            print('original matrix compare')
            print(detail)
            raise RuntimeError('original value modified')

        # Make sure all queued GPU work is finished before starting the clock.
        torch.cuda.synchronize()

        t1 = time.perf_counter()
        for _ in range(nb):
            y = torch.inverse(x)
        # CUDA calls are asynchronous; wait for completion before stopping the clock.
        torch.cuda.synchronize()
        t2 = time.perf_counter()
        gpu_time = (t2 - t1) / nb * 1e3

        # A mismatch between CPU and GPU results is reported but not fatal.
        matches, detail = compare_tensors(yc.cuda(), y, rtol=1e-3, atol=1e-3)
        if not matches:
            print('numerical mismatch: inverse value compare')
            print(detail)

        print(f'{b_} {n_} {dtype}'.ljust(35) + f'{cpu_time : .3f}  {gpu_time : .3f}')
        f.write(f'{b_} {n_} {dtype}; ' + f'{cpu_time : .3e}, {gpu_time : .3e}\n')
        torch.cuda.synchronize()

    print(s)
    print(torch.__version__)
    print()
    print('batch_size, matrix_size, dtype'.ljust(35) + 'cpu_time(ms), gpu_time(ms)')

    # Unbatched input plus batch sizes 1..1024, matrix sizes 2..1024, float32 only.
    shapes = itertools.product(
        [[]] + [[2**x] for x in range(11)],
        [2**i for i in range(1, 11)],
        [torch.float]
    )

    with open(s+'.txt', 'w') as f:
        for b, n, dtype in shapes:
            # Skip batched configurations where batch * n reaches 2**15.
            if len(b) > 0 and b[0] * n >= 2**15:
                continue
            prof(b, n, dtype, f)


In [3]:
# Baseline run: prints the timing table and writes the results to before.txt.
main('before')

before
1.7.0a0+4ae832e

batch_size, matrix_size, dtype     cpu_time(ms), gpu_time(ms)
[] 2 torch.float32                  0.018   7.370
[] 4 torch.float32                  0.011   7.422
[] 8 torch.float32                  0.011   7.436
[] 16 torch.float32                 0.056   7.513
[] 32 torch.float32                 0.179   7.616
[] 64 torch.float32                 0.205   7.721
[] 128 torch.float32                0.395   8.000
[] 256 torch.float32                1.115   11.267
[] 512 torch.float32                4.567   14.267
[] 1024 torch.float32               19.247   20.489
[1] 2 torch.float32                 0.009   0.113
[1] 4 torch.float32                 0.010   0.112
[1] 8 torch.float32                 0.011   0.116
[1] 16 torch.float32                0.016   0.128
[1] 32 torch.float32                0.032   0.179
[1] 64 torch.float32                0.072   0.427
[1] 128 torch.float32               0.353   0.806
[1] 256 torch.float32               1.214   1.687
[1] 512 to

In [2]:
# Second run of the same benchmark; results are written to after.txt.
# NOTE(review): the recorded output shows a different torch version than the
# 'before' run, so this was presumably executed in a separate kernel session
# with another build installed - confirm before re-running top to bottom.
main('after')

after
1.7.0a0+5d46ad2

batch_size, matrix_size, dtype     cpu_time(ms), gpu_time(ms)
[] 2 torch.float32                  0.011   0.145
[] 4 torch.float32                  0.014   0.147
[] 8 torch.float32                  0.014   0.141
[] 16 torch.float32                 0.018   0.123
[] 32 torch.float32                 0.034   0.172
[] 64 torch.float32                 0.083   0.253
[] 128 torch.float32                0.367   0.478
[] 256 torch.float32                1.142   1.060
[] 512 torch.float32                4.989   2.564
[] 1024 torch.float32               17.907   6.908
[1] 2 torch.float32                 0.010   0.148
[1] 4 torch.float32                 0.011   0.149
[1] 8 torch.float32                 0.012   0.147
[1] 16 torch.float32                0.017   0.137
[1] 32 torch.float32                0.033   0.188
[1] 64 torch.float32                0.071   0.282
[1] 128 torch.float32               0.369   0.498
[1] 256 torch.float32               1.299   1.079
[1] 512 torch.

In [3]:
import re

def readfile(fn):
    """Parse a benchmark result file into CPU and GPU timing tables.

    Each non-blank line must look like ``"<key>; <cpu_time>, <gpu_time>"``.
    The key is everything before the last ``;``, so keys that themselves
    contain commas (e.g. a multi-dimensional batch ``[2, 3]``) are handled.

    Args:
        fn: Path of the result file to read.

    Returns:
        A tuple ``(dc, dg)`` of dicts mapping key -> CPU time and
        key -> GPU time, both as floats.

    Raises:
        ValueError: if a non-blank line has no ``;`` separator, does not carry
            exactly two comma-separated timings, or a timing is not a number.
    """
    dc = {}
    dg = {}
    with open(fn, 'r') as f:
        for _line in f:
            _line = _line.rstrip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not _line:
                continue
            # Split on the LAST ';' so commas inside the key are left alone.
            key, sep, timings = _line.rpartition(';')
            if not sep:
                raise ValueError(f'malformed result line (missing ";"): {_line!r}')
            cpu_time, gpu_time = timings.split(',')
            dc[key] = float(cpu_time)
            dg[key] = float(gpu_time)

    return (dc, dg)

def compare():
    """Print a before/after timing table built from before.txt and after.txt.

    One row is printed per key found in before.txt: the CPU time averaged over
    both runs, then the GPU time from each run. Rows whose GPU time is larger
    in after.txt than in before.txt are flagged as regressed.
    """
    print('shape'.ljust(26), 'cpu_time, gpu_time_before (magma), gpu_time_after (cusolver/cublas)')
    cpu_before, gpu_before = readfile('before.txt')
    cpu_after, gpu_after = readfile('after.txt')

    regression_flag = '*' * 20 + ' regressed'

    for key, cpu_before_time in cpu_before.items():
        # CPU numbers come from both runs; report their mean.
        cpu_time = 0.5 * (cpu_before_time + cpu_after[key])
        gpu_time_before = gpu_before[key]
        gpu_time_after = gpu_after[key]

        # Flag only strict slow-downs; equal timings are not marked.
        gs = regression_flag if gpu_time_after > gpu_time_before else ''

        print(f'{key: <26} {cpu_time: .3f}, {gpu_time_before: .3f}, {gpu_time_after: .3f} {gs}')

# Print the comparison table; expects before.txt and after.txt, the files
# written by main('before') and main('after'), in the working directory.
compare()

shape                      cpu_time, gpu_time_before (magma), gpu_time_after (cusolver/cublas)
[] 2 torch.float32          0.014,  7.370,  0.145 
[] 4 torch.float32          0.012,  7.422,  0.147 
[] 8 torch.float32          0.013,  7.436,  0.141 
[] 16 torch.float32         0.037,  7.513,  0.123 
[] 32 torch.float32         0.106,  7.616,  0.172 
[] 64 torch.float32         0.144,  7.721,  0.253 
[] 128 torch.float32        0.381,  8.000,  0.478 
[] 256 torch.float32        1.128,  11.270,  1.060 
[] 512 torch.float32        4.778,  14.270,  2.564 
[] 1024 torch.float32       18.580,  20.490,  6.908 
[1] 2 torch.float32         0.010,  0.113,  0.148 ******************** regressed
[1] 4 torch.float32         0.010,  0.112,  0.149 ******************** regressed
[1] 8 torch.float32         0.012,  0.116,  0.147 ******************** regressed
[1] 16 torch.float32        0.016,  0.128,  0.137 ******************** regressed
[1] 32 torch.float32        0.032,  0.179,  0.188 *****************