In [1]:
import wgpu
from functools import reduce
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import math
import time
from tqdm import tqdm

In [2]:
def _request_timestamp_device(power_preference):
    """Request an adapter and open a device with timestamp queries enabled.

    Args:
        power_preference: value forwarded to ``request_adapter_sync``
            (this notebook uses "high-performance" and "low-power").

    Returns:
        An ``(adapter, device)`` tuple.
    """
    selected_adapter = wgpu.gpu.request_adapter_sync(power_preference=power_preference)
    opened_device = selected_adapter.request_device_sync(
        required_features=[wgpu.FeatureName.timestamp_query]
    )
    return selected_adapter, opened_device


# Candidate 0: whatever the backend considers the high-performance adapter.
adapter_0, device_0 = _request_timestamp_device("high-performance")
pprint(adapter_0.info)

# Candidate 1: the low-power adapter. Both requests may resolve to the same
# physical GPU (they do in the output recorded below).
adapter_1, device_1 = _request_timestamp_device("low-power")
pprint(adapter_1.info)

{'adapter_type': 'IntegratedGPU',
 'architecture': '',
 'backend_type': 'Vulkan',
 'description': '24.12.1 (AMD proprietary shader compiler)',
 'device': 'AMD Radeon(TM) Graphics',
 'device_id': 5761,
 'vendor': 'AMD proprietary driver',
 'vendor_id': 4098}
{'adapter_type': 'IntegratedGPU',
 'architecture': '',
 'backend_type': 'Vulkan',
 'description': '24.12.1 (AMD proprietary shader compiler)',
 'device': 'AMD Radeon(TM) Graphics',
 'device_id': 5761,
 'vendor': 'AMD proprietary driver',
 'vendor_id': 4098}


In [3]:
# Choose the adapter/device pair used by every later cell.
# Index 0 selects the "high-performance" pair, index 1 the "low-power" pair.
adapter, device = [(adapter_0, device_0), (adapter_1, device_1)][0]

pprint(adapter.info)

{'adapter_type': 'IntegratedGPU',
 'architecture': '',
 'backend_type': 'Vulkan',
 'description': '24.12.1 (AMD proprietary shader compiler)',
 'device': 'AMD Radeon(TM) Graphics',
 'device_id': 5761,
 'vendor': 'AMD proprietary driver',
 'vendor_id': 4098}


In [4]:
# Problem size: number of f32 cells in each of the three inputs and the output.
total_cells = 1024*16

# Host-side data. The inputs are deterministic sawtooth-like patterns derived
# from the cell index; the output starts zeroed.
cell_index = np.arange(total_cells, dtype=np.float32)
x0_cpu = ((-1.0*cell_index + 0.5) % 1.28490).astype(np.float32)
x1_cpu = ((-1.0*cell_index - 0.2) % 0.847).astype(np.float32)
x2_cpu = ((-2.0*cell_index - 1.2) % 4.1).astype(np.float32)
y_cpu = np.zeros((total_cells,), dtype=np.float32)

# Device-side buffers: the three inputs are plain storage buffers initialised
# from the host arrays.
input_usage = wgpu.BufferUsage.STORAGE
x0_gpu = device.create_buffer_with_data(data=x0_cpu.data, usage=input_usage)
x1_gpu = device.create_buffer_with_data(data=x1_cpu.data, usage=input_usage)
x2_gpu = device.create_buffer_with_data(data=x2_cpu.data, usage=input_usage)

# The output buffer must additionally be usable as a copy source so its
# contents can be transferred into the mappable staging buffer below.
y_gpu = device.create_buffer_with_data(
    data=y_cpu.data,
    usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_SRC,
)
y_gpu_readback = device.create_buffer(
    size=y_cpu.data.nbytes,
    usage=wgpu.BufferUsage.MAP_READ | wgpu.BufferUsage.COPY_DST,
)

# Bind group layout: bindings 0-2 are the read-only inputs, binding 3 is the
# writable output. All of them are visible to the compute stage only.
buffer_binding_types = [
    wgpu.BufferBindingType.read_only_storage,
    wgpu.BufferBindingType.read_only_storage,
    wgpu.BufferBindingType.read_only_storage,
    wgpu.BufferBindingType.storage,
]
binding_layouts = [
    {
        "binding": slot,
        "visibility": wgpu.ShaderStage.COMPUTE,
        "buffer": {
            "type": binding_type,
        },
    }
    for slot, binding_type in enumerate(buffer_binding_types)
]

# Layout objects shared by the pipeline and the bind group.
bind_group_layout = device.create_bind_group_layout(entries=binding_layouts)
pipeline_layout = device.create_pipeline_layout(bind_group_layouts=[bind_group_layout])

In [12]:
# Dispatch geometry. Every invocation processes `loop_count` consecutive cells,
# so a single workgroup covers workgroup_size*loop_count cells.
workgroup_size = 64
loop_count = 1
cells_per_workgroup = workgroup_size * loop_count
# Integer floor division: avoids the float round-trip of int(a / b), which
# loses precision for very large cell counts.
dispatch_size = total_cells // cells_per_workgroup
global_size = dispatch_size * cells_per_workgroup
print(f"total_cells={total_cells}")
print(f"workgroup_size={workgroup_size}")
print(f"dispatch_size={dispatch_size}")
print(f"global_size={global_size}")
# The shader indexes the buffers without a bounds check, so the dispatch has to
# cover the arrays exactly: no missing cells and no out-of-range invocations.
assert global_size == total_cells, (
    f"total_cells={total_cells} must be a multiple of "
    f"workgroup_size*loop_count={cells_per_workgroup}"
)

# Number of inner-loop iterations the shader performs for every cell.
work_per_cell = 512
# FLOPs credited per inner-loop iteration when converting cell rates to FLOP/s.
# NOTE(review): each iteration evaluates x0*x1 + x2 and accumulates it (one
# multiply, two adds), so 1 looks like an undercount - confirm the convention.
flops_per_work = 1

# WGSL compute kernel. It is an f-string: workgroup_size, loop_count and
# work_per_cell are baked into the source as literals, and doubled braces
# ({{ / }}) are literal WGSL braces.
#
# Each invocation handles `loop_count` consecutive cells starting at
# global_invocation_id.x * loop_count. For every cell k it accumulates
# x0[k]*x1[k] + x2[k] `work_per_cell` times and stores the sum in y[k].
#
# The kernel does not bounds-check k, so the dispatch must cover exactly the
# length of the bound arrays.
#
# NOTE(review): the summand does not depend on the inner loop variable `l`, so
# a shader compiler is free to hoist it or collapse the loop. That may make the
# measured GPU time much smaller than work_per_cell suggests - confirm before
# trusting the derived FLOP/s figures.
shader_source = f"""
@group(0) @binding(0)
var<storage,read> x0: array<f32>;

@group(0) @binding(1)
var<storage,read> x1: array<f32>;

@group(0) @binding(2)
var<storage,read> x2: array<f32>;

@group(0) @binding(3)
var<storage,read_write> y: array<f32>;

@compute
@workgroup_size({workgroup_size})
fn main(@builtin(global_invocation_id) i0: vec3<u32>) {{
    let i: u32 = i0.x*{loop_count};
    for (var j: u32 = 0; j < {loop_count}; j = j+1) {{
        let k: u32 = i+j;
        var v: f32 = 0.0;
        for (var l: u32 = 0; l < {work_per_cell}; l = l+1) {{
            let v0: f32 = x0[k]*x1[k] + x2[k];
            v += v0;
        }}
        y[k] = v;
    }}
}}
"""

# Compile the WGSL source into a shader module.
cshader = device.create_shader_module(code=shader_source)

# Build the compute pipeline around the "main" entry point. Nothing is
# dispatched here; the pipeline is only executed later by the sampling loop.
compute_pipeline = device.create_compute_pipeline(
    layout=pipeline_layout,
    compute={"module": cshader, "entry_point": "main"},
)

# The bare string below is a no-op expression statement used as a block comment
# for the timestamp query set that is created right after it.
"""
Create a QuerySet to store the 'beginning_of_pass' and 'end_of_pass' timestamps.
Set the 'count' parameter to 2, as this set will contain 2 timestamps.
"""
# Two timestamp slots: index 0 is written at the beginning of the compute pass,
# index 1 at its end.
query_count = 2
query_set = device.create_query_set(type=wgpu.QueryType.timestamp, count=query_count)

# Buffer the query set is resolved into: 8 bytes per timestamp.
query_buf = device.create_buffer(
    usage=wgpu.BufferUsage.QUERY_RESOLVE | wgpu.BufferUsage.COPY_SRC,
    size=8*query_set.count,
)

# Master switch for GPU-side timing. When it is off, no timestamps are written
# and the compute pass is started with timestamp_writes=None.
profile_gpu = True

timestamp_writes = (
    {
        "query_set": query_set,
        "beginning_of_pass_write_index": 0,
        "end_of_pass_write_index": 1,
    }
    if profile_gpu
    else None
)

# Bind every buffer in full (offset 0, whole size) to the slot the shader
# expects: x0 -> 0, x1 -> 1, x2 -> 2, y -> 3.
bindings = [
    {
        "binding": slot,
        "resource": {"buffer": gpu_buffer, "offset": 0, "size": gpu_buffer.size},
    }
    for slot, gpu_buffer in enumerate((x0_gpu, x1_gpu, x2_gpu, y_gpu))
]
bind_group = device.create_bind_group(layout=bind_group_layout, entries=bindings)


# Sampling loop: submit one compute pass per sample until either max_samples is
# reached or, once more than min_samples have been taken, the time budget runs out.
gpu_samples = []
max_samples = 8192*32
min_samples = 16
sampling_timeout_ms = 10000
sampling_timeout_ns = sampling_timeout_ms * 1_000_000

# perf_counter_ns is monotonic and high resolution; the wall clock (time_ns) can
# be too coarse to resolve short intervals such as the final queue flush.
start_ns = time.perf_counter_ns()
for sample_idx in tqdm(range(max_samples)):
    command_encoder = device.create_command_encoder()
    # timestamp_writes carries the query set and the two slot indices the pass
    # writes its begin/end timestamps to (None when profiling is disabled).
    compute_pass_0 = command_encoder.begin_compute_pass(timestamp_writes=timestamp_writes)
    compute_pass_0.set_pipeline(compute_pipeline)
    compute_pass_0.set_bind_group(0, bind_group)
    compute_pass_0.dispatch_workgroups(dispatch_size)
    compute_pass_0.end()

    # Resolve the two timestamps into query_buf so they can be read on the host.
    if profile_gpu:
        command_encoder.resolve_query_set(
            query_set=query_set,
            first_query=0,
            query_count=query_count,
            destination=query_buf,
            destination_offset=0,
        )
    device.queue.submit([command_encoder.finish()])

    if profile_gpu:
        # read_buffer hands the data back to the host, so this presumably blocks
        # until the submitted pass has finished; loop timings include that wait.
        # NOTE(review): the raw query values are treated as nanoseconds -
        # confirm against the backend's timestamp period.
        timestamps_ns = device.queue.read_buffer(query_buf).cast("Q").tolist()
        gpu_delta_ns = timestamps_ns[1] - timestamps_ns[0]
        gpu_samples.append(gpu_delta_ns)

    total_ns = time.perf_counter_ns() - start_ns
    if sample_idx >= min_samples and total_ns > sampling_timeout_ns:
        print("Exceeded timeout limit for sample collection")
        break
end_ns = time.perf_counter_ns()
loop_ns = end_ns - start_ns
print("Finished submitting commands")

print("Waiting for queue to finish")
start_ns = time.perf_counter_ns()
device.queue.on_submitted_work_done_sync()
end_ns = time.perf_counter_ns()
final_sync_ns = end_ns - start_ns
print("Queue is flushed")

# Stage the result for host readback. This used to happen inside the loop and
# only when profile_gpu was set, which left the readback buffer untouched with
# profiling disabled and added copy work to every timed iteration. The kernel
# output is identical for every sample, so one copy after the timed sections is
# enough and works in both modes.
readback_encoder = device.create_command_encoder()
readback_encoder.copy_buffer_to_buffer(
    source=y_gpu,
    source_offset=0,
    destination=y_gpu_readback,
    destination_offset=0,
    size=y_cpu.data.nbytes
)
device.queue.submit([readback_encoder.finish()])

# Number of samples actually taken (the loop may have stopped early on timeout).
total_samples = sample_idx + 1

# Loop metrics: host time spent in the sampling loop, averaged per sample.
loop_ns_avg = loop_ns / total_samples
loop_cell_rate = total_cells / (loop_ns_avg*1e-9)
loop_flops = loop_cell_rate*work_per_cell*flops_per_work
print("=== Loop metrics ===")
print(f"total_samples={total_samples}")
print(f"loop_avg={loop_ns_avg*1e-3:.3f} us")
print(f"loop_cell_rate={loop_cell_rate*1e-6:.3f} M/s")
print(f"loop_flops={loop_flops*1e-9:.3f} GFlops")

# Net metrics: loop time plus the final queue flush, averaged per sample.
net_ns = loop_ns + final_sync_ns
net_ns_avg = net_ns / total_samples
net_cell_rate = total_cells / (net_ns_avg*1e-9)
net_flops = net_cell_rate*work_per_cell*flops_per_work
print("=== Net metrics ===")
print(f"final_sync={final_sync_ns*1e-3:.3f} us")
print(f"net_avg={net_ns_avg*1e-3:.3f} us")
print(f"net_cell_rate={net_cell_rate*1e-6:.3f} M/s")
print(f"net_flops={net_flops*1e-9:.3f} GFlops")

# GPU metrics: derived from the per-pass timestamp deltas. Nothing is collected
# when timestamp profiling is disabled; in that case skip the statistics
# instead of averaging an empty array (NaN plus a RuntimeWarning), but keep the
# names defined as NaN so later cells that reference them still run.
gpu_delta_ns = np.array(gpu_samples)
print("=== GPU metrics ===")
if gpu_delta_ns.size > 0:
    gpu_delta_ns_avg = np.mean(gpu_delta_ns)
    gpu_cell_rate = total_cells / (gpu_delta_ns_avg*1e-9)
    gpu_flops = gpu_cell_rate*work_per_cell*flops_per_work
    print(f"gpu_delta_avg={gpu_delta_ns_avg*1e-3:.3f} us")
    print(f"gpu_cell_rate={gpu_cell_rate*1e-6:.3f} M/s")
    print(f"gpu_flops={gpu_flops*1e-12:.3f} TFlops")
else:
    gpu_delta_ns_avg = float("nan")
    gpu_cell_rate = float("nan")
    gpu_flops = float("nan")
    print("No GPU timestamp samples were collected")

total_cells=16384
workgroup_size=64
dispatch_size=256
global_size=16384


  4%|▎         | 9711/262144 [00:10<04:19, 971.00it/s] 

Exceeded timeout limit for sample collection
Finished submitting commands
Waiting for queue to finish
Queue is flushed
=== Loop metrics ===
total_samples=9712
loop_avg=1029.758 us
loop_cell_rate=15.911 M/s
loop_flops=8.146 GFlops
=== Net metrics ===
final_sync=0.000 us
net_avg=1029.758 us
net_cell_rate=15.911 M/s
net_flops=8.146 GFlops
=== GPU metrics ===
gpu_delta_avg=1.348 us
gpu_cell_rate=12150.964 M/s
gpu_flops=6.221 TFlops





In [13]:
# Map the staging buffer for reading and take a host-side copy of its contents.
y_gpu_readback.map_sync(wgpu.MapMode.READ, 0, y_cpu.data.nbytes)
try:
    # copy=True requests a copy of the data, so it is still used after unmap().
    y_cpu_calc_memview = y_gpu_readback.read_mapped(buffer_offset=0, size=y_cpu.data.nbytes, copy=True)
finally:
    # Always unmap, even if the read fails: a buffer left mapped cannot be used
    # as a copy destination when the benchmark cell is re-run.
    y_gpu_readback.unmap()

# Reinterpret the raw bytes as float32. The reshape doubles as a size check: it
# raises if the element count differs from total_cells.
y_cpu_calc = np.frombuffer(y_cpu_calc_memview, dtype=np.float32)
y_cpu_calc = np.reshape(y_cpu_calc, (total_cells,))

In [14]:
def cpu_shader(x0, x1, x2, y):
    """NumPy counterpart of the compute shader.

    Adds ``x0*x1 + x2`` to ``y`` element-wise, ``work_per_cell`` times
    (``work_per_cell`` is read from the notebook's global namespace).

    Args:
        x0, x1, x2: input arrays combined as ``x0*x1 + x2``.
        y: accumulator array; it is updated in place.

    Returns:
        None. The result is left in ``y``.
    """
    for _ in range(work_per_cell):
        # The summand is re-evaluated on every pass, as in the shader's inner loop.
        summand = x0*x1 + x2
        np.add(y, summand, out=y)

# Time the NumPy reference implementation and keep its output for comparison
# with the GPU result.
y_cpu_pred = np.zeros(y_cpu.shape, dtype=y_cpu.dtype)

cpu_samples = []
total_ns = 0
for sample_idx in tqdm(range(total_samples)):
    # Reset the accumulator outside the timed region so every sample starts
    # from zero and only the kernel itself is measured.
    y_cpu_pred[:] = 0.0
    start_ns = time.perf_counter_ns()
    cpu_shader(x0_cpu, x1_cpu, x2_cpu, y_cpu_pred)
    end_ns = time.perf_counter_ns()
    delta_ns = end_ns - start_ns
    cpu_samples.append(delta_ns)
    total_ns += delta_ns
    if sample_idx >= min_samples and total_ns > sampling_timeout_ms*1e6:
        print("Exceeded timeout limit for sample collection")
        break

cpu_delta_ns = np.array(cpu_samples)
cpu_delta_ns_avg = np.mean(cpu_delta_ns)
print(f"cpu_delta_avg={cpu_delta_ns_avg*1e-3:.3f} us")
cpu_cell_rate = total_cells / (cpu_delta_ns_avg*1e-9)
print(f"cpu_cell_rate={cpu_cell_rate*1e-6:.3f} M/s")
cpu_flops = cpu_cell_rate*work_per_cell*flops_per_work
print(f"cpu_flops={cpu_flops*1e-9:.3f} GFlops")
# Ratio of the GPU timestamp-based cell rate to the CPU cell rate.
print(f"gpu/cpu = {gpu_cell_rate/cpu_cell_rate:.2f}x")


# Compare the GPU output with the CPU reference (signed and absolute error).
error = y_cpu_calc - y_cpu_pred
error_abs = np.abs(error)
error_avg = np.mean(error)
error_max = np.max(error)
error_min = np.min(error)
error_abs_avg = np.mean(error_abs)

print(f"error_min={error_min:.3e}")
print(f"error_max={error_max:.3e}")
print(f"error_avg={error_avg:.3e}")
print(f"error_abs_avg={error_abs_avg:.3e}")

n_read = 5
print(y_cpu_calc[:n_read])
print(y_cpu_pred[:n_read])

# Actually enforce agreement rather than only printing it. The tolerance leaves
# room for float32 accumulation-order and fused-multiply-add differences over
# work_per_cell additions, while still catching a wrong or stale (e.g. all-zero)
# readback. It runs last so the diagnostics above are printed even on failure.
np.testing.assert_allclose(
    y_cpu_calc,
    y_cpu_pred,
    rtol=1e-3,
    atol=1e-3,
    err_msg="GPU result does not match the CPU reference",
)

 29%|██▊       | 2787/9712 [00:10<00:25, 275.73it/s]

Exceeded timeout limit for sample collection
cpu_delta_avg=3587.706 us
cpu_cell_rate=4.567 M/s
cpu_flops=2.338 GFlops
gpu/cpu = 2660.77x
error_min=-1.782e-02
error_max=1.733e-02
error_avg=1.815e-05
error_abs_avg=4.025e-03
[1650.4315   659.32306 1722.7794   518.7187  1593.5553 ]
[1650.4365  659.3222 1722.7683  518.7187 1593.5613]



