In [1]:
import torch

# Report the software / hardware environment the benchmarks below run on.
print('Pytorch version\t:', torch.__version__)
print('CUDA version\t:', torch.version.cuda)
# get_device_name() raises when no CUDA device is usable; report that state
# explicitly instead of aborting the cell with a traceback.
gpu_name = torch.cuda.get_device_name() if torch.cuda.is_available() else 'not available'
print('GPU\t\t:', gpu_name)

Pytorch version	: 1.10.2
CUDA version	: 11.3
GPU		: NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
import inspect
from collections import defaultdict
import pandas as pd
from torch.utils import benchmark 

# Render floating-point values in DataFrame output with 3 decimal places.
pd.set_option('display.precision', 3)

def var_dict(*args):
    """Map each argument back to the variable name it has in the calling scope.

    The caller's local variables are searched for an entry that *is* (identity,
    not equality) each argument, so ``var_dict(a, b)`` yields
    ``{'a': a, 'b': b}``. If several names are bound to the same object, the
    first one found in the caller's locals is used.

    Args:
        *args: objects that are bound to variables in the caller's frame.

    Returns:
        dict: ``{variable_name: object}`` in the order the arguments were given.

    Raises:
        IndexError: if an argument is not bound to any variable in the calling
            scope (for example a literal or a temporary expression).
    """
    # Must be read here, directly in the function body, so that f_back is the
    # frame of whoever called var_dict.
    caller_locals = inspect.currentframe().f_back.f_locals
    named = {}
    for position, arg in enumerate(args):
        for name, val in caller_locals.items():
            if val is arg:
                named[name] = val
                break
        else:
            # Same exception type as the bare `[...][0]` lookup, but with a
            # message that says what actually went wrong.
            raise IndexError(
                f'var_dict: positional argument {position} is not bound to any '
                'variable in the calling scope')
    return named

def walltime(stmt, arg_dict, duration=3):
    """Return the median run time of ``stmt`` as measured by torch's benchmark Timer.

    Args:
        stmt: source string of the statement to time.
        arg_dict: mapping passed as ``globals`` to the timer, i.e. the names
            ``stmt`` may refer to.
        duration: value forwarded as ``min_run_time`` to ``blocked_autorange``.

    Returns:
        The ``median`` attribute of the resulting measurement.
    """
    timer = benchmark.Timer(stmt=stmt, globals=arg_dict)
    measurement = timer.blocked_autorange(min_run_time=duration)
    return measurement.median

In [3]:
from IPython.display import clear_output

!git clone https://github.com/huggingface/transformers
!cd transformers; pip install .

clear_output()

In [4]:
# Achieved TFLOPS of an n x n matrix product, counted as 2*n**3 FLOPs,
# for float32 and float16 inputs.
matmul_tflops = defaultdict(dict)
for n in (128, 512, 2048, 8192):
    flops = 2 * n**3
    column = matmul_tflops[f'n={n}']
    for dtype in [torch.float32, torch.float16]:
        # The names `a` and `b` matter: var_dict recovers them from this
        # scope and the timed statement refers to them.
        a = torch.randn(n, n, dtype=dtype).cuda()
        b = torch.randn(n, n, dtype=dtype).cuda()
        t = walltime('a @ b', var_dict(a, b))
        column[dtype] = flops / t / 1e12
        del a, b

pd.DataFrame(matmul_tflops)

Unnamed: 0,n=128,n=512,n=2048,n=8192
torch.float32,0.085,5.844,7.148,7.032
torch.float16,0.09,6.976,14.54,14.356


In [5]:
# Element-wise scaling of a vector: counted as one FLOP per element and
# 8 bytes of traffic per element (presumably 4 read + 4 written for the
# default float32 dtype).
vector = defaultdict(dict)
for n in (1024*64, 1024*256, 1024*1024, 1024*1024*4):
    # The name `a` matters: var_dict recovers it and the timed statement uses it.
    a = torch.randn(n).cuda()
    t = walltime('a * 1.2', var_dict(a))
    vector[n].update({
        'TFLOPS': n / t / 1e12,
        'GB/s': 8 * n / t / 1e9,
    })

pd.DataFrame(vector)

Unnamed: 0,65536,262144,1048576,4194304
TFLOPS,0.002,0.015,0.021,0.023
GB/s,16.967,119.409,171.319,180.478


In [6]:
from transformers import AutoConfig, BertLayer

# Build one BertLayer from the bert-large-uncased configuration (only the
# config is fetched here), then convert it to half precision on the GPU.
config = AutoConfig.from_pretrained("bert-large-uncased")
layer = BertLayer(config)
layer = layer.half().cuda()

In [7]:
def layer_benchmark(layer, hidden_size, seq_lens, batch_sizes, cross_attention=False):
    """Measure achieved TFLOPS of ``layer`` for forward and forward+backward passes.

    For every (sequence length, batch size) pair a random half-precision CUDA
    input of shape ``(batch, seq_len, hidden_size)`` is created, the layer is
    timed with ``walltime`` and an analytic FLOP count is divided by the
    measured time.

    Args:
        layer: callable invoked as ``layer(X, ...)``; element ``[0]`` of its
            result is summed and back-propagated for the fwd+bwd timing.
        hidden_size: size of the last input dimension, also used in the FLOP
            formulas.
        seq_lens: iterable of sequence lengths to benchmark.
        batch_sizes: iterable of batch sizes to benchmark.
        cross_attention: when true, ``encoder_hidden_states=X`` is passed to
            the layer and the attention FLOPs are counted twice.

    Returns:
        pandas.DataFrame with one column per batch size (``batch=<b>``) and
        rows ``fwd seq_len=<s>`` / ``fwd+bwd seq_len=<s>``.
    """
    results = defaultdict(dict)
    extra_kwargs = 'encoder_hidden_states=X' if cross_attention else ''
    # The timed statements resolve `layer` and `X` by name through var_dict,
    # so these two local names must stay exactly as they are.
    fwd_stmt = f'layer(X, {extra_kwargs})'
    fwd_bwd_stmt = f'layer(X, {extra_kwargs})[0].sum().backward()'
    attention_blocks = 2 if cross_attention else 1

    for seq_len in seq_lens:
        for batch in batch_sizes:
            # Analytic work per forward pass, in units of 1e12 FLOPs.
            ffn_tflop = 16*batch*seq_len*hidden_size*hidden_size / 1e12
            attn_tflop = (4*batch*hidden_size*seq_len*seq_len
                          + 8*batch*seq_len*hidden_size*hidden_size) / 1e12
            fwd_tflop = ffn_tflop + attention_blocks * attn_tflop

            X = torch.randn(batch, seq_len, hidden_size).half().cuda()
            column = results[f'batch={batch}']
            fwd_time = walltime(fwd_stmt, var_dict(layer, X))
            column[f'fwd seq_len={seq_len}'] = fwd_tflop / fwd_time
            # Forward + backward is counted as three times the forward work.
            fwd_bwd_time = walltime(fwd_bwd_stmt, var_dict(layer, X))
            column[f'fwd+bwd seq_len={seq_len}'] = 3 * fwd_tflop / fwd_bwd_time
    return pd.DataFrame(results)

In [8]:
# Benchmark the current `layer` at sequence lengths 128 and 512 over a range
# of batch sizes.
layer_benchmark(
    layer,
    config.hidden_size,
    seq_lens=[128, 512],
    batch_sizes=[2, 4, 8, 16, 32, 64, 128],
)

Unnamed: 0,batch=2,batch=4,batch=8,batch=16,batch=32,batch=64,batch=128
fwd seq_len=128,4.118,8.658,8.315,8.918,8.714,9.005,8.765
fwd+bwd seq_len=128,4.001,7.915,9.515,10.458,10.639,10.954,10.902
fwd seq_len=512,7.308,7.761,7.638,7.861,7.684,2.26,1.091
fwd+bwd seq_len=512,8.372,9.078,9.232,9.479,9.447,2.54,1.05


In [9]:
# Fixed problem size for the per-component measurements:
# batch 64, sequence length 128, hidden size taken from the config.
h = config.hidden_size
b, s = 64, 128
X = torch.randn(b, s, h).half().cuda()
# `layer.intermediate.dense` alone, counted as 8*b*s*h*h FLOPs.
dense_time = walltime('layer.intermediate.dense(X)', var_dict(layer, X))
Dense = 8*b*s*h*h / 1e12 / dense_time

In [10]:
# The whole `layer.intermediate` module, counted with the same
# 8*b*s*h*h FLOPs as the dense projection alone.
dense_activation_time = walltime('layer.intermediate(X)', var_dict(layer, X))
DenseActivation = 8*b*s*h*h / 1e12 / dense_activation_time

In [11]:
# `layer.intermediate` followed by `layer.output`, counted as 16*b*s*h*h FLOPs.
ffn = 16*b*s*h*h / 1e12
ffn_time = walltime('layer.output(layer.intermediate(X),X)', var_dict(layer, X))
FFN = ffn / ffn_time

In [12]:
# `layer.attention`, counted as 4*b*h*s*s + 8*b*s*h*h FLOPs.
att = (4*b*h*s*s + 8*b*s*h*h) / 1e12
attention_time = walltime('layer.attention(X)', var_dict(layer, X))
Attention = att / attention_time

In [13]:
# Collect the per-component throughput figures into one table.
rows = [
    ('Dense', Dense),
    ('Dense+Activation', DenseActivation),
    ('FFN', FFN),
    ('Attention', Attention),
]
data = {'Layer': [label for label, _ in rows],
        'TFLOPS': [tflops for _, tflops in rows]}
df = pd.DataFrame(data)
print(df)

# Ratio of the attention FLOP count to the FFN FLOP count.
att / ffn

              Layer  TFLOPS
0             Dense  11.902
1  Dense+Activation  10.282
2               FFN  10.529
3         Attention   7.070


0.53125

In [14]:
from transformers.models.gpt2.modeling_gpt2 import GPT2Block

# Same benchmark for a single GPT2Block built from the gpt2-medium
# configuration. `config` and `layer` are rebound here.
config = AutoConfig.from_pretrained("gpt2-medium")
layer = GPT2Block(config, layer_idx=0)
layer = layer.half().cuda()
layer_benchmark(
    layer,
    config.n_embd,
    seq_lens=[512, 1024],
    batch_sizes=[2, 4, 8, 16, 32, 64],
)

Unnamed: 0,batch=2,batch=4,batch=8,batch=16,batch=32,batch=64
fwd seq_len=512,5.908,6.135,6.22,6.235,6.253,1.3
fwd+bwd seq_len=512,6.498,6.937,6.986,7.11,7.143,1.221
fwd seq_len=1024,5.278,5.329,5.338,1.113,0.776,0.402
fwd+bwd seq_len=1024,6.007,6.043,6.138,1.155,0.762,0.431


In [15]:
from transformers.models.t5.modeling_t5 import T5Block

# T5-large block with use_cache, is_decoder and is_encoder_decoder all
# switched off in the config before the block is built.
config = AutoConfig.from_pretrained("t5-large")
for flag in ("use_cache", "is_decoder", "is_encoder_decoder"):
    setattr(config, flag, False)

encoder = T5Block(config)
encoder = encoder.half().cuda()
layer_benchmark(
    encoder,
    config.d_model,
    seq_lens=[512],
    batch_sizes=[2, 4, 8, 16, 32, 64, 128],
)

Unnamed: 0,batch=2,batch=4,batch=8,batch=16,batch=32,batch=64,batch=128
fwd seq_len=512,5.092,0.925,1.382,0.661,0.48,0.475,0.503
fwd+bwd seq_len=512,2.862,0.701,2.301,0.567,0.492,0.493,0.512


In [16]:
# Flip the existing config to decoder mode, build a second T5Block from it
# and benchmark it with cross-attention enabled.
config.is_decoder = True
decoder = T5Block(config)
decoder = decoder.half().cuda()
layer_benchmark(
    decoder,
    config.d_model,
    seq_lens=[128],
    batch_sizes=[2, 4, 8, 16, 32, 64, 128],
    cross_attention=True,
)

Unnamed: 0,batch=2,batch=4,batch=8,batch=16,batch=32,batch=64,batch=128
fwd seq_len=128,2.317,4.564,3.739,0.708,0.94,1.671,0.638
fwd+bwd seq_len=128,2.463,4.941,2.325,0.672,1.035,1.148,0.581
