In [1]:
import sys
import time

import torch
from tqdm import tqdm

# Turn on cuDNN benchmark mode for the whole session (see the
# torch.backends.cudnn documentation for what this flag controls).
torch.backends.cudnn.benchmark = True
# Record the software stack next to the results so the numbers can be compared
# across machines and library versions.
print('torch version:', torch.__version__)
print('CUDA version:', torch.version.cuda)
print('CUDNN version:', torch.backends.cudnn.version())
# IPython shell escape: show the last line of /etc/lsb-release.
!tail -n 1 /etc/lsb-release

torch version: 1.0.0a0+60e7d04
CUDA version: 10.0.130
CUDNN version: 7401
DISTRIB_DESCRIPTION="Ubuntu 16.04.5 LTS"


In [2]:
class TorchBenchmark:
    """Base class for throughput benchmarks of a single torch op.

    Subclasses are expected to set the following attributes in ``__init__``:

    * ``self.op``    -- callable applied to ``self.dummy`` on every iteration;
    * ``self.dummy`` -- the input passed to ``self.op``;
    * ``self.FLOP``  -- floating point operations performed by one ``op`` call;
    * ``self.half``  -- truthy when the benchmark runs in half precision.
    """

    def f(self):
        """Endlessly run ``self.op(self.dummy)``, yielding ``self.FLOP`` per call.

        ``torch.cuda.synchronize()`` is called after every op, before the FLOP
        count is handed to the consumer.
        """
        while True:
            # Enter no_grad per iteration rather than around the ``yield``:
            # a generator suspended inside ``with torch.no_grad()`` leaves
            # gradient tracking disabled for the code that consumes it.
            with torch.no_grad():
                self.op(self.dummy)
                torch.cuda.synchronize()
            yield self.FLOP

    def test(self, tflop=100):
        """Run the op until ``tflop`` TFLOP have been done and return the speed.

        Args:
            tflop: Amount of work to run, in units of ``1024 ** 4`` FLOP.
                Multiplied by 4 when ``self.half`` is truthy.

        Returns:
            Mean throughput in units of ``1024 ** 4`` FLOP per second, computed
            from the FLOP actually executed and the elapsed wall-clock time.
            ``0.0`` when no work was executed.
        """
        if self.half:
            tflop *= 4
        sys.stdout.flush()
        total = tflop * 1024 ** 4

        runs = self.f()
        shown = 0     # FLOP reported to the progress bar, clamped to ``total``
        executed = 0  # FLOP actually run, including the overshoot of the last op
        elapsed = 0.0
        start = time.perf_counter()
        try:
            # Manual-mode bar: the bar counts FLOP, so it must not also wrap the
            # generator (which would add one unit per iteration on top).
            with tqdm(total=total, unit='FLOP', unit_scale=True, unit_divisor=1024) as pbar:
                while shown < total:
                    flop = next(runs)
                    executed += flop
                    step = min(flop, total - shown)
                    pbar.update(step)
                    shown += step
                # Stop the clock before the bar's closing redraw.
                elapsed = time.perf_counter() - start
        finally:
            # The generator is infinite; close it explicitly instead of
            # relying on garbage collection.
            runs.close()

        if executed == 0 or elapsed <= 0:
            return 0.0
        mean_speed = executed / elapsed / (1024 ** 4)
        return mean_speed

    def describe(self):
        """Print the input shape, the op and the per-call work in GFLOP."""
        print('Input shape:',self.dummy.shape)
        print('Op:', self.op)
        print(f'{self.FLOP / (1024 ** 3):.3f}GFLOP')

class TorchBenchmarkLinear(TorchBenchmark):
    """Benchmark of ``torch.nn.Linear(b, c)`` applied to a ``(1, a, b)`` input."""

    def __init__(self, a, b, c, bias=False, half=False):
        """Create the input tensor and the layer on the current CUDA device.

        Args:
            a: Size of the middle dimension of the ``(1, a, b)`` input.
            b: Input features of the layer.
            c: Output features of the layer.
            bias: Forwarded to ``torch.nn.Linear``.
            half: When truthy, both input and layer are converted with ``.half()``.
        """
        super().__init__()

        self.half = half

        inputs = torch.randn((1, a, b)).cuda()
        layer = torch.nn.Linear(b, c, bias=bias).cuda()
        self.dummy = inputs.half() if half else inputs
        self.op = layer.half() if half else layer

        # Work per call is counted as 2 * b for each of the a * c outputs.
        flop_per_output = b * 2
        self.FLOP = flop_per_output * a * c

class TorchBenchmarkConv2d(TorchBenchmark):
    """Benchmark of a square ``torch.nn.Conv2d`` on a single square image."""

    def __init__(self, width, in_channels, out_channels, kernel_size, bias=False, half=False):
        """Create the input tensor and the convolution on the current CUDA device.

        Args:
            width: Height and width of the ``(1, in_channels, width, width)`` input.
            in_channels: Input channels of the convolution.
            out_channels: Output channels of the convolution.
            kernel_size: Kernel size passed to ``torch.nn.Conv2d``; used as an
                integer in the FLOP formula.
            bias: Forwarded to ``torch.nn.Conv2d``.
            half: When truthy, both input and layer are converted with ``.half()``.
        """
        super().__init__()
        self.half = half

        image = torch.randn((1, in_channels, width, width)).cuda()
        conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size, bias=bias).cuda()
        self.dummy = image.half() if half else image
        self.op = conv.half() if half else conv

        # Output side length used for the FLOP count.
        out_width = width - kernel_size + 1
        flop_per_output = kernel_size ** 2 * 2 * in_channels
        self.FLOP = flop_per_output * out_channels * out_width * out_width

In [3]:
def test(device_id=0):
    """Compare fp32 and fp16 throughput for Linear and Conv2d on one device.

    Selects ``device_id``, prints its properties, then for each op runs the
    fp32 benchmark, the fp16 benchmark, describes the op and prints the
    fp16/fp32 speed ratio as a percentage.
    """
    def speedup_percent(make_benchmark):
        # fp32 first, then fp16, then the description of the fp32 instance.
        baseline = make_benchmark(False)
        fp32_speed = baseline.test()
        fp16_speed = make_benchmark(True).test()
        baseline.describe()
        return fp16_speed / fp32_speed * 100

    torch.cuda.set_device(device_id)
    print(torch.cuda.get_device_properties(device_id))

    side = 2**12
    linear_ratio = speedup_percent(
        lambda half: TorchBenchmarkLinear(side, side, side, half=half))
    print(f'Speedup ratio: {linear_ratio:.2f}%')

    width = 2 ** 8
    channels = 2 ** 8
    kernel = 2 ** 3
    conv_ratio = speedup_percent(
        lambda half: TorchBenchmarkConv2d(width, channels, channels, kernel, half=half))
    print(f'Speedup ratio: {conv_ratio:.2f}%\n\n')

In [4]:
# Warm up: before any timed run, execute one fp32 matmul and one fp32 Conv2d
# with the same sizes the benchmark function uses. range(1) limits this to
# device 0.
# NOTE(review): only fp32 ops are warmed up here; the fp16 runs are not.
for i in range(1):
    torch.cuda.set_device(i)
    
    # 4096 x 4096 matrix product.
    A = 2 ** 12
    X = torch.randn((A, A)).cuda()
    torch.matmul(X, X)
    
    # 256-channel, 256 x 256 image convolved with an 8 x 8 kernel.
    A = 2 ** 8
    B = 2 ** 3
    X = torch.randn((1, A, A, A)).cuda()
    torch.nn.Conv2d(A, A, B).cuda()(X)

In [5]:
# Run the fp32 vs fp16 comparison; range(1) restricts it to device 0.
for i in range(1):
    test(i)

_CudaDeviceProperties(name='Quadro RTX 5000', major=7, minor=5, total_memory=16095MB, multi_processor_count=48)


100%|██████████| 100T/100T [00:11<00:00, 9.93TFLOP/s] 
100%|██████████| 400T/400T [00:08<00:00, 51.8TFLOP/s] 


Input shape: torch.Size([1, 4096, 4096])
Op: Linear(in_features=4096, out_features=4096, bias=False)
128.000GFLOP
Speedup ratio: 521.02%


100%|██████████| 100T/100T [00:10<00:00, 10.2TFLOP/s] 
100%|██████████| 400T/400T [00:08<00:00, 51.5TFLOP/s] 

Input shape: torch.Size([1, 256, 256, 256])
Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)
484.383GFLOP
Speedup ratio: 501.91%







In [6]:
# Select the device used by the stand-alone benchmarks in the following cells
# and print its properties.
device_id = 0
torch.cuda.set_device(device_id)
print(torch.cuda.get_device_properties(device_id))

_CudaDeviceProperties(name='Quadro RTX 5000', major=7, minor=5, total_memory=16095MB, multi_processor_count=48)


In [7]:
class TorchBenchmarkMatMul(TorchBenchmark):
    """Benchmark of a plain ``Tensor.matmul`` of a ``(c, b)`` by a ``(b, a)`` matrix."""

    def __init__(self, a, b, c, bias=False, half=False):
        """Create both operands on the current CUDA device.

        Args:
            a: Number of columns of the right-hand operand.
            b: Shared inner dimension.
            c: Number of rows of the left-hand operand.
            bias: Accepted for signature parity with the other benchmarks; not used.
            half: When truthy, both operands are converted with ``.half()``.
        """
        super().__init__()

        self.half = half

        right = torch.randn((b, a)).cuda()
        left = torch.randn((c, b)).cuda()
        if half:
            right, left = right.half(), left.half()

        self.dummy = right
        self.dummy2 = left
        # Bound method of the final left operand, so op(dummy) is left.matmul(right).
        self.op = left.matmul
        self.FLOP = a * b * c * 2

In [8]:
# Half-precision 4096 x 4096 matrix product via Tensor.matmul, for comparison
# with the Linear-layer benchmark.
A = 2**12
bm = TorchBenchmarkMatMul(A, A, A, half=True)
bm.describe()
fp16_speed = bm.test()

Input shape: torch.Size([4096, 4096])
Op: <built-in method matmul of Tensor object at 0x7fad520b8e10>
128.000GFLOP


100%|██████████| 400T/400T [00:08<00:00, 51.4TFLOP/s] 


In [9]:
# Half-precision Conv2d benchmark on its own: 256 x 256 image, 256 input and
# output channels, 8 x 8 kernel.
W = 2 ** 8
C = 2 ** 8
K = 2 ** 3
bm = TorchBenchmarkConv2d(W, C, C, K, half=True)
bm.describe()
fp16_speed = bm.test()

Input shape: torch.Size([1, 256, 256, 256])
Op: Conv2d(256, 256, kernel_size=(8, 8), stride=(1, 1), bias=False)
484.383GFLOP


100%|██████████| 400T/400T [00:08<00:00, 53.1TFLOP/s] 


In [10]:
# Sanity check: display the result dtype of a matmul on half-precision inputs.
A = 2 ** 12
X = torch.randn((A, A)).half().cuda()
Y = torch.matmul(X, X)
Y.dtype

torch.float16

In [11]:
# Sanity check: display the result dtype of a half-precision Conv2d applied to
# a half-precision input.
A = 2 ** 8
B = 2 ** 3
X = torch.randn((1, A, A, A)).half().cuda()
CONV = torch.nn.Conv2d(A, A, B).half().cuda()
Y = CONV(X)
Y.dtype

torch.float16