# 测试 Roofline 模型

In [1]:
import set_env

In [2]:
import csv
import json
import os
import platform
from io import StringIO

import numpy as np

import tvm.testing
import tvm.utils
from tvm import relay, rpc
from tvm.contrib import utils
from tvm.contrib.debugger import debug_executor
from tvm.relay.testing import mlp
from tvm.runtime import profiler_vm
from tvm.runtime.profiling import Report
from tvm.script import tir as T

## estimate_peak_flops_cpu

In [3]:
# Measure peak FMA vector throughput on the local CPU through a loopback RPC
# session, once for each element type.
rpc_key = "roofline_flops_cpu"
for dtype in ("float32", "int8", "int32"):
    server = rpc.Server(key=rpc_key)
    remote = rpc.connect("127.0.0.1", server.port, key=rpc_key)
    # The estimate uses vectorized instructions, so the target must enable them.
    target = tvm.target.Target("llvm -mattr=+fma,+avx2")
    dev = remote.device(str(target))
    flops = tvm.utils.roofline.x86.estimate_peak_fma_vector_flops(target, dev, remote, dtype)
    # Lower bound assumes 1 GFLOP/s per thread, i.e. 1 FLOP per cycle on a 1GHz cpu.
    assert (
        10**9 < flops < 10**14
    ), f"FLOP/s should be between 10^9 and 10^14, but it is {flops}"

2024-01-19 13:33:37.135 INFO bind to 0.0.0.0:9091
2024-01-19 13:33:37.137 INFO connected from ('127.0.0.1', 57510)
2024-01-19 13:33:37.138 INFO start serving at /tmp/tmpfh9tnqax
2024-01-19 13:33:37.311 INFO load_module /tmp/tmpfh9tnqax/peak_fma_flops.tar
2024-01-19 13:33:38.785 INFO bind to 0.0.0.0:9092
2024-01-19 13:33:38.838 INFO connected from ('127.0.0.1', 33856)
2024-01-19 13:33:38.839 INFO start serving at /tmp/tmprw44z7bx
2024-01-19 13:33:38.987 INFO load_module /tmp/tmprw44z7bx/peak_fma_flops.tar
2024-01-19 13:33:42.105 INFO bind to 0.0.0.0:9091
2024-01-19 13:33:42.141 INFO connected from ('127.0.0.1', 50332)
2024-01-19 13:33:42.142 INFO start serving at /tmp/tmph5er4tvd
2024-01-19 13:33:42.277 INFO load_module /tmp/tmph5er4tvd/peak_fma_flops.tar


## estimate_peak_flops_gpu

In [4]:
from tvm_book.config.env import set_cudnn
set_cudnn() # set up the CUDA environment

In [5]:
# Measure peak GPU throughput through a loopback RPC session, first with
# tensor cores and then with plain FMA instructions.
rpc_key = "roofline_flops_gpu"
server = rpc.Server(key=rpc_key)
remote = rpc.connect("127.0.0.1", server.port, key=rpc_key)
target = tvm.target.Target("cuda")
dev = remote.device(str(target))

# Tensor-core path: should be able to hit a TFLOP/s with tensor cores.
flops = tvm.utils.roofline.cuda.estimate_peak_flops_tensorcore(target, dev, remote)
assert (
    10**12 < flops < 10**14
), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"

# FMA path: this should run on all gpus, and most gpus since 2016 should be
# able to hit a TFLOP/s with fma instructions.
flops = tvm.utils.roofline.cuda.estimate_peak_flops_fma(target, dev, remote, "float32")
assert (
    10**12 < flops < 10**14
), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"

2024-01-19 13:33:44.614 INFO bind to 0.0.0.0:9092
2024-01-19 13:33:44.668 INFO connected from ('127.0.0.1', 33872)
2024-01-19 13:33:44.669 INFO start serving at /tmp/tmp4myep303


nvcc --fatbin -O3 -gencode arch=compute_86,code=sm_86 -o /tmp/tmphkhc1un6/tvm_kernels.fatbin /tmp/tmphkhc1un6/tvm_kernels.cu


2024-01-19 13:33:45.579 INFO load_module /tmp/tmp4myep303/peak_mma_flops.tar


nvcc --fatbin -O3 -gencode arch=compute_86,code=sm_86 -o /tmp/tmplrh4lpaw/tvm_kernels.fatbin /tmp/tmplrh4lpaw/tvm_kernels.cu


2024-01-19 13:33:46.439 INFO load_module /tmp/tmp4myep303/peak_fma_flops.tar


## estimate_peak_bandwidth_cpu

In [6]:
# Measure peak DRAM bandwidth on the local CPU through a loopback RPC session.
rpc_key = "roofline_bandwidth_cpu"
server = rpc.Server(key=rpc_key)
remote = rpc.connect("127.0.0.1", server.port, key=rpc_key)
# The estimate uses vectorized instructions, so the target must enable them.
target = tvm.target.Target("llvm -mattr=+fma,+avx2")
dev = remote.device(str(target))
bandwidth = tvm.utils.roofline.x86.estimate_peak_bandwidth_dram(target, dev, remote)
# Lower bound assumes at least 1 GB/s. DDR2 should transfer somewhere around
# 6 GB/s, so this should leave enough wiggle room.
assert (
    10**9 < bandwidth < 10**12
), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"


2024-01-19 13:33:47.650 INFO bind to 0.0.0.0:9091
2024-01-19 13:33:47.705 INFO connected from ('127.0.0.1', 50348)
2024-01-19 13:33:47.706 INFO start serving at /tmp/tmpljz1k97w
2024-01-19 13:33:47.860 INFO load_module /tmp/tmpljz1k97w/peak_bandwidth.tar


## estimate_peak_bandwidth_gpu

In [None]:
# Measure peak GPU global-memory bandwidth through a loopback RPC session.
server = rpc.Server(key="roofline_bandwidth_gpu")
remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_gpu")
target = tvm.target.Target("cuda")
dev = remote.device(str(target))
bandwidth = tvm.utils.roofline.cuda.estimate_peak_bandwidth_global_mem(target, dev, remote)
# Should be able to hit 100 GB/s on a GPU. GTX 280 hits 140 GB/s and
# it is really old.
# The failure message must report the same bounds the condition checks
# (10^11 and 10^13); it previously quoted the CPU bounds.
assert (
    bandwidth > 10**11 and bandwidth < 10**13
), f"Bandwidth should be between 10^11 and 10^13, but it is {bandwidth}"


## roofline_analysis

In [None]:
# Target string used by the roofline analysis below.
target = "llvm -mattr=+fma,+avx2"
# Placeholder only: the analysis cell rebinds `dev` to an RPC device handle.
dev = "cuda"

In [None]:
# Workload under analysis: one dense op on two 512x512 float32 inputs.
a = relay.var("a", relay.TensorType((512, 512), "float32"))
b = relay.var("b", relay.TensorType((512, 512), "float32"))
c = relay.nn.dense(a, b)
dense_func = relay.Function([a, b], c)
mod = tvm.IRModule.from_expr(dense_func)
params = {}

# Run the analysis through a loopback RPC session.
rpc_key = "roofline"
server = rpc.Server(key=rpc_key)
remote = rpc.connect("127.0.0.1", server.port, key=rpc_key)
dev = remote.device(target)

report = tvm.utils.roofline_analysis(mod, params, target, dev, remote=remote)
print(report)

# The rendered table must contain both roofline columns.
optimal_key = "Percent of Theoretical Optimal"
assert "Bound" in report.table()
assert optimal_key in report.table()

for call in report.calls:
    # Only calls that carry the roofline metric are checked.
    if optimal_key not in call:
        continue
    if target.startswith("llvm"):
        # Ideally we'd like a little tighter bound here, but it is hard to
        # know how well this dense will perform without tuning. And we
        # don't have an operator that uses a specific number of flops.
        assert call[optimal_key].ratio >= 5.0
    elif target == "cuda":
        # The cuda gpu kernel is really poorly optimized
        assert 0.01 <= call[optimal_key].ratio <= 90