In [1]:
import torch

By default, we cannot multiply bf16 and fp32 matrices: `torch.matmul` requires both operands to have the same dtype.

In [2]:
# Operands with deliberately mismatched dtypes: bf16 on the left, fp32 on the right.
mat1 = torch.randn(2048, 1024, dtype=torch.bfloat16, device='cuda')
mat2 = torch.randn(1024, 4096, dtype=torch.float32, device='cuda')

try:
    # Outside autocast, matmul rejects operands of different dtypes, so this
    # call is expected to raise rather than return a tensor.
    result = torch.matmul(mat1, mat2)
except Exception as err:
    print(f"Error occurred: {err}")
else:
    # Only reached if the multiplication unexpectedly succeeds.
    print(result.dtype)

Error occurred: expected mat1 and mat2 to have the same dtype, but got: c10::BFloat16 != float


Within autocast, the same operation does not raise an error and produces a bf16 output.

In [3]:
# Autocast context targeting bf16 on CUDA; matmul issued inside it accepts the
# mixed-dtype operands and returns bf16 (see the dtype printed at the end).
amp_ctx = torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)

# Baseline without autocast: the mixed-dtype matmul is expected to raise.
# The message is an f-string so the operand dtypes are actually interpolated
# instead of printing the literal "{mat1.dtype}" placeholders.
print(f"trying matrix multiplication for mat1: {mat1.dtype} and mat2: {mat2.dtype}")
try:
    result = torch.matmul(mat1, mat2)
except Exception as e:
    print(f"Error during matrix multiplication: {e}")

# Same call, now under autocast.
with amp_ctx:
    print(f"Performing matrix multiplication with autocast... for mat1: {mat1.dtype}, mat2: {mat2.dtype}")
    result_amp = torch.matmul(mat1, mat2)
print(result_amp.dtype)

trying matrix multiplication for mat1: {mat1.dtype} and mat2: {mat2.dtype}
Error during matrix multiplication: expected mat1 and mat2 to have the same dtype, but got: c10::BFloat16 != float
Performing matrix multiplication with autocast... for mat1: torch.bfloat16, mat2: torch.float32
torch.bfloat16


Autocast policy varies by op type.<br> LayerNorm, for example, runs in and outputs the higher precision (fp32).<br> See the PyTorch documentation for details: https://docs.pytorch.org/docs/stable/amp.html#cuda-op-specific-behavior

In [4]:
# create a layernorm layer , init data and run it
from torch import nn

# Activation shape is (B, L, D); LayerNorm normalizes over the trailing D dimension.
B, L, D = 32, 2048, 1024
layernorm = nn.LayerNorm(D).to(device='cuda')  # parameters keep the default float32 dtype

# bf16 activations, so the input dtype differs from the layer's weight dtype.
input_data = torch.randn(B, L, D, dtype=torch.bfloat16, device='cuda')

# Attempt 1: plain call with mismatched input/weight dtypes.
print(f"trying layernorm with input data dtype {input_data.dtype} and weight dtype {layernorm.weight.dtype}")
try:
    output_data = layernorm(input_data)
except Exception as err:
    print(f"Error during LayerNorm: {err}")
else:
    print(f"LayerNorm output dtype: {output_data.dtype}")

print('-' * 128)

# Attempt 2: the same call wrapped in a bf16 autocast context.
amp_ctx = torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
print(f"Performing LayerNorm with autocast for input data dtype {input_data.dtype} and weight dtype {layernorm.weight.dtype}")
with amp_ctx:
    output_data_amp = layernorm(input_data)
print(f"LayerNorm output with autocast dtype: {output_data_amp.dtype}")

trying layernorm with input data dtype torch.bfloat16 and weight dtype torch.float32
Error during LayerNorm: expected scalar type BFloat16 but found Float
--------------------------------------------------------------------------------------------------------------------------------
Performing LayerNorm with autocast for input data dtype torch.bfloat16 and weight dtype torch.float32
LayerNorm output with autocast dtype: torch.float32


Let's see what happens under the hood

In [5]:
from torch.profiler import profile, ProfilerActivity
def profile_fn(fn, path="fn_trace.json", warmup=5, avg_steps=10, wait=10):
    """
    Profile ``fn`` with ``torch.profiler`` and export a Chrome trace.

    ``fn`` is called ``wait + warmup + avg_steps`` times and the profiler is
    stepped after every call, so a single schedule cycle completes and the
    trace is written exactly once.

    Args:
        fn (callable): Zero-argument function to profile.
        path (str): Destination of the exported Chrome trace JSON.
        warmup (int): Number of profiler warmup iterations.
        avg_steps (int): Number of actively recorded iterations.
        wait (int): Number of initial iterations the profiler skips.
    Returns:
        str: ``path``, the location of the exported trace.
    """

    def trace_handler(p):
        # Invoked by the profiler once the active window has finished.
        p.export_chrome_trace(path)

    # Only request CUDA activity when a CUDA device is actually usable.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    # Honor the caller-supplied values; they used to be overwritten by
    # hard-coded constants, which made `warmup` and `avg_steps` no-ops.
    active = avg_steps
    with profile(
        activities=activities,
        schedule=torch.profiler.schedule(
            wait=wait,
            warmup=warmup,
            active=active,
            repeat=1
            ),
        record_shapes=True,
        on_trace_ready=trace_handler
    ) as p:
        for _ in range(wait + warmup + active):
            fn()
            p.step()

    return path

In [6]:
# NOTE(review): mat1_add / mat2_add are not referenced by autocast_fn below,
# which multiplies `mat1` and `mat2` from the earlier cell — confirm whether
# anything later in the notebook still needs them.
mat1_add = torch.randn(2048, 1024, dtype=torch.bfloat16, device='cuda')
mat2_add = torch.rand_like(mat1_add, dtype=torch.float32, device='cuda')


def autocast_fn():
    """Multiply the module-level `mat1` and `mat2` under a bf16 autocast context."""
    with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
        return torch.matmul(mat1, mat2)


# Sanity-check a single call before profiling it.
result = autocast_fn()
print(f"Result dtype: {result.dtype}, shape: {result.shape}")

# Profile repeated calls and export the trace for inspection.
path = profile_fn(autocast_fn, path="matmul_autocast_trace.json", warmup=5, avg_steps=10)
print(f"Autocast profiling trace saved to {path}")

Result dtype: torch.bfloat16, shape: torch.Size([2048, 4096])
Autocast profiling trace saved to matmul_autocast_trace.json




We can see that the fp32 matrix is first cast to bf16, and the matmul then runs in low precision.

In [8]:
from TraceLens import TreePerfAnalyzer
# Load the exported profiler trace into TraceLens.
expt_perf_analyzer = TreePerfAnalyzer.from_file(path)
op_tree = expt_perf_analyzer.tree
# Event at index 1 — the printed tree shows it to be the aten::matmul CPU op.
evt = op_tree.events[1]
# Print that op's subtree with each CPU op's input shapes and dtypes.
op_tree.traverse_subtree_and_print(evt, cpu_op_fields=('Input Dims', 'Input type'))

Building tree with add_python_func=False
Building CPU op tree with add_python_func=False
└── UID: 1, Category: cpu_op, Name: aten::matmul
    |   Input Dims: [[2048, 1024], [1024, 4096]]
    |   Input type: ['c10::BFloat16', 'float']
    |   
    ├── UID: 2, Category: cpu_op, Name: aten::to
    │   |   Input Dims: [[1024, 4096], [], [], [], []]
    │   |   Input type: ['float', 'Scalar', 'Scalar', 'Scalar', '']
    │   |   
    │   └── UID: 3, Category: cpu_op, Name: aten::_to_copy
    │       |   Input Dims: [[1024, 4096], [], [], [], [], [], []]
    │       |   Input type: ['float', 'Scalar', '', '', '', 'Scalar', '']
    │       |   
    │       └── UID: 5, Category: cpu_op, Name: aten::copy_
    │           |   Input Dims: [[1024, 4096], [1024, 4096], []]
    │           |   Input type: ['c10::BFloat16', 'float', 'Scalar']
    │           |   
    │           └── UID: 90, Category: cuda_runtime, Name: hipLaunchKernel
    │               └── UID: 210, Category: kernel, Name: void at

In [15]:
def autocast_layernorm_fn():
    """Apply the module-level `layernorm` to `input_data` under a bf16 autocast context."""
    with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
        return layernorm(input_data)


# Profile repeated calls and export the trace for inspection.
path = profile_fn(autocast_layernorm_fn, path="autocast_layernorm_trace.json", warmup=5, avg_steps=10)
print(f"Autocast LayerNorm profiling trace saved to {path}")


Autocast LayerNorm profiling trace saved to autocast_layernorm_trace.json


For LayerNorm, on the other hand, the low-precision activation is upcast and the operation happens in high precision. The cost of the upcast is roughly the same as that of the actual layer norm. Understanding autocast is important for understanding performance!

In [17]:
from TraceLens import TreePerfAnalyzer
# Trace file written by the LayerNorm profiling cell.
path = "autocast_layernorm_trace.json"
expt_perf_analyzer = TreePerfAnalyzer.from_file(path, add_python_func=True)
op_tree = expt_perf_analyzer.tree
# Event at index 1 — the printed tree shows it to be the aten::layer_norm CPU op.
evt = op_tree.events[1]
# Print that op's subtree with each CPU op's input shapes and dtypes.
op_tree.traverse_subtree_and_print(evt, cpu_op_fields=('Input Dims', 'Input type'))

Building tree with add_python_func=True
Building CPU op tree with add_python_func=True
└── UID: 1, Category: cpu_op, Name: aten::layer_norm
    |   Input Dims: [[32, 2048, 1024], [], [1024], [1024], [], []]
    |   Input type: ['c10::BFloat16', 'ScalarList', 'float', 'float', 'Scalar', 'Scalar']
    |   
    ├── UID: 2, Category: cpu_op, Name: aten::to
    │   |   Input Dims: [[32, 2048, 1024], [], [], [], []]
    │   |   Input type: ['c10::BFloat16', 'Scalar', 'Scalar', 'Scalar', '']
    │   |   
    │   └── UID: 3, Category: cpu_op, Name: aten::_to_copy
    │       |   Input Dims: [[32, 2048, 1024], [], [], [], [], [], []]
    │       |   Input type: ['c10::BFloat16', 'Scalar', '', '', '', 'Scalar', '']
    │       |   
    │       └── UID: 5, Category: cpu_op, Name: aten::copy_
    │           |   Input Dims: [[32, 2048, 1024], [32, 2048, 1024], []]
    │           |   Input type: ['float', 'c10::BFloat16', 'Scalar']
    │           |   
    │           └── UID: 130, Category: cuda_