性能测试
RTX 4090

In [1]:
using CUDA
using MAT
using LinearAlgebra
using Random
using Statistics
using NPZ
using Printf
using Dates
using CSV
using DataFrames

# ==== Model parameters ====
# Declared `const` so that the functions reading these globals (`build_sources`,
# `run_simulation`) see concretely typed values instead of untyped globals.

# Factor applied to source strengths: g/s -> kg/s.
const g_to_kg = 1e-3

# Divisor in the plume prefactor q / (π * u * σy * σz).
# NOTE(review): presumably the wind speed in m/s — confirm.
const u = 4.0

# Power-law dispersion parameters: σy = Ry * δy^ry and σz = Rz * δy^rz.
const Ry = 0.306
const ry = 0.885
const Rz = 0.072
const rz = 1.021

# Side length of the square domain in m; the input map must be
# domain_size x domain_size.
const domain_size = 40000

# ==== CUDA kernel ====
"""
    kernel_total_conc_add!(C, nx, ny, dx, dy,
                           sources_x, sources_y, sources_q,
                           u, Ry, ry, Rz, rz)

CUDA kernel in which each thread handles one grid cell: it sums the contribution of
every listed source at that cell and adds the sum to `C[iy, ix]`.

The cell index `(ix, iy)` is derived from the x/y block and thread indices; threads with
`ix > nx` or `iy > ny` return without touching `C`. The cell position is
`((ix - 1) * dx, (iy - 1) * dy)`.

A source `k` is skipped when `sources_y[k] - ypos <= 0`. Otherwise, with `δy` being that
difference, `σy = Ry * δy^ry`, `σz = Rz * δy^rz`, and the contribution is
`q / (π * u * σy * σz) * exp(-(xpos - sx)^2 / (2 * σy^2))`.

Because the result is accumulated into `C` (`+=`), the kernel can be launched repeatedly
with different source batches on the same output array.
"""
function kernel_total_conc_add!(
    C, nx, ny, dx, dy,
    sources_x, sources_y, sources_q,
    u, Ry, ry, Rz, rz
)
    # Global 1-based cell indices of this thread.
    ix = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    iy = threadIdx().y + (blockIdx().y - 1) * blockDim().y

    # The launch grid may overshoot the domain; surplus threads do nothing.
    (ix > nx || iy > ny) && return nothing

    cell_x = (ix - 1) * dx
    cell_y = (iy - 1) * dy

    acc = zero(eltype(C))
    nsrc = length(sources_x)
    @inbounds for k in 1:nsrc
        dist = sources_y[k] - cell_y
        # Only sources with a strictly positive y-offset from the cell contribute.
        dist <= 0 && continue

        src_x = sources_x[k]
        strength = sources_q[k]

        sig_y = Ry * dist^ry
        sig_z = Rz * dist^rz

        prefactor = strength / (π * u * sig_y * sig_z)
        crosswind = exp(-((cell_x - src_x)^2) / (2 * sig_y^2))
        acc += prefactor * crosswind
    end

    C[iy, ix] += acc
    return nothing
end

# ==== Build the source list ====
"""
    build_sources(source_map; offset_x=0.0, offset_y=0.0, T=Float32,
                  rng=Random.default_rng())

Collect every non-zero cell of `source_map` into three parallel vectors of element type
`T` and return them as the tuple `(sx_list, sy_list, sq_list)`.

For a non-zero cell `(i, j)`:
- the x coordinate is `(i - 1 + r1) + offset_x`,
- the y coordinate is `(j - 1 + r2) + offset_y`,
- the strength is the cell value multiplied by the global `g_to_kg` (g/s → kg/s),

where `r1` and `r2` are independent draws of `rand(rng)`, i.e. the source is placed at a
random position inside its cell.

# Keywords
- `offset_x`, `offset_y`: constants added to every x / y coordinate.
- `T`: element type of the returned vectors.
- `rng`: random number generator used for the in-cell jitter. The default is the global
  generator, which matches a plain `rand()` call; pass a seeded generator to obtain
  reproducible source positions.
"""
function build_sources(
    source_map;
    offset_x=0.0, offset_y=0.0, T=Float32,
    rng::AbstractRNG=Random.default_rng(),
)
    # `g_to_kg` is a global. Read it once and pin its type so the per-cell loop below
    # does not pay for an untyped global lookup on every non-zero cell.
    scale = convert(Float64, g_to_kg)::Float64

    sx_list = T[]
    sy_list = T[]
    sq_list = T[]
    # Column index in the outer loop: Julia arrays are column-major.
    @inbounds for j in axes(source_map, 2), i in axes(source_map, 1)
        q_gps = source_map[i, j]
        q_gps == 0 && continue

        q = T(q_gps * scale)  # g/s → kg/s
        push!(sx_list, T((i - 1 + rand(rng)) + offset_x))
        push!(sy_list, T((j - 1 + rand(rng)) + offset_y))
        push!(sq_list, q)
    end
    return (sx_list, sy_list, sq_list)
end

# ==== Simulation (computation only) ====
"""
    _cpu_total_conc!(C, dx, dy, sx, sy, sq, u, Ry, ry, Rz, rz)

CPU counterpart of `kernel_total_conc_add!`: for every cell `C[iy, ix]`, sum the
contributions of all sources `(sx[k], sy[k], sq[k])` and store the sum in that cell.
Returns `C`.

All scalar parameters are passed in explicitly (rather than read from globals) so the
loop is compiled for their concrete types and runs in the precision of the arguments.
"""
function _cpu_total_conc!(C, dx, dy, sx, sy, sq, u, Ry, ry, Rz, rz)
    ny, nx = size(C)
    # `iy` is the inner loop because it is the first (fastest) index of `C`.
    for ix in 1:nx, iy in 1:ny
        xpos = (ix - 1) * dx
        ypos = (iy - 1) * dy
        c = zero(eltype(C))
        for k in eachindex(sx, sy, sq)
            δy = sy[k] - ypos
            # Only sources with a strictly positive y-offset from the cell contribute.
            δy <= 0 && continue

            σy = Ry * δy^ry
            σz = Rz * δy^rz
            term1 = sq[k] / (π * u * σy * σz)
            term2 = exp(-((xpos - sx[k])^2) / (2 * σy^2))
            c += term1 * term2
        end
        C[iy, ix] = c
    end
    return C
end

"""
    run_simulation(full_map, res::Int, precision::DataType; device::Symbol=:gpu)

Compute the concentration field produced by all non-zero cells of `full_map` on a square
grid with cell size `res` and return it as an `ny × nx` array indexed `C[iy, ix]`, where
`nx = ny = domain_size ÷ res`.

# Arguments
- `full_map`: source map; its size must be `(domain_size, domain_size)`.
- `res`: grid spacing used for the output grid.
- `precision`: element type used for the sources, the model parameters and the result.

# Keywords
- `device`: `:gpu` runs `kernel_total_conc_add!` and returns a GPU array; any other value
  runs the CPU loop and returns an `Array`.

On the GPU the sources are uploaded and processed in batches of 1,000,000, each batch
accumulating into the same output array.
"""
function run_simulation(full_map, res::Int, precision::DataType; device::Symbol=:gpu)
    T = precision
    nx = domain_size ÷ res
    ny = nx

    @assert size(full_map) == (domain_size, domain_size) "❌ 输入尺寸应为 $(domain_size)x$(domain_size)"

    sx, sy, sq = build_sources(full_map; T=T)

    # Convert the grid spacing and the global model parameters to the requested
    # precision once. Both branches use these values, so a Float32 run is computed in
    # Float32 on the CPU as well as on the GPU.
    dx, dy = T(res), T(res)
    wind = T(u)
    coef_y, expo_y = T(Ry), T(ry)
    coef_z, expo_z = T(Rz), T(rz)

    if device == :gpu
        # Allocated as (ny, nx) to match the kernel's `C[iy, ix]` indexing.
        C = CUDA.zeros(T, ny, nx)
        threads = (16, 16)
        blocks = (cld(nx, threads[1]), cld(ny, threads[2]))

        batch_size = 1_000_000
        total_sources = length(sx)
        total_batches = cld(total_sources, batch_size)

        for batch in 1:total_batches
            first_idx = (batch - 1) * batch_size + 1
            last_idx = min(batch * batch_size, total_sources)
            batch_idx = first_idx:last_idx

            sources_x = CuArray(sx[batch_idx])
            sources_y = CuArray(sy[batch_idx])
            sources_q = CuArray(sq[batch_idx])

            @cuda threads=threads blocks=blocks kernel_total_conc_add!(
                C, nx, ny, dx, dy,
                sources_x, sources_y, sources_q,
                wind, coef_y, expo_y, coef_z, expo_z
            )
            synchronize()
        end
    else
        # Allocated as (ny, nx) to match the `C[iy, ix]` indexing of the CPU loop.
        C = zeros(T, ny, nx)
        _cpu_total_conc!(C, dx, dy, sx, sy, sq, wind, coef_y, expo_y, coef_z, expo_z)
    end
    return C
end

# ==== Benchmark experiments ====
# Every combination of resolution, precision and device is run once. For each run two
# durations are recorded: the compute time (simulation only) and the total time
# (reading the input + simulation + writing the result).
res_list = [10, 20, 50, 100]
prec_list = [Float32, Float64]
device_list = [:gpu]

# Number of runs, derived from the lists above so the progress message stays correct
# when the parameter grid changes.
total_experiments = length(res_list) * length(prec_list) * length(device_list)

results = DataFrame(
    exp_id=Int[],
    resolution=Int[],
    precision=String[],
    device=String[],
    compute_time_s=Float64[],
    total_time_s=Float64[]
)

input_path = "/root/autodl-tmp/output_sources/200m.npy"

exp_id = 0
for res in res_list, prec in prec_list, dev in device_list
    # Explicit `global` so the counter also updates when this file is run as a script,
    # not only in a notebook/REPL where top-level loops may assign to globals implicitly.
    global exp_id
    exp_id += 1
    println("▶️ Running experiment $exp_id / $total_experiments: res=$res m, precision=$(prec), device=$dev")

    # ---- Total time (including I/O) starts ----
    total_start = time_ns()

    # Read the input (deliberately inside the timed region).
    full_map = npzread(input_path)

    # ---- Timing of the compute part ----
    # NOTE(review): the first run for each precision presumably also includes
    # compilation time of the simulation code — consider a warm-up run if that
    # should be excluded.
    compute_start = time_ns()
    C = run_simulation(full_map, res, prec; device=dev)
    if dev == :gpu
        CUDA.synchronize()   # make sure the GPU has finished all work
    end
    compute_elapsed = (time_ns() - compute_start) / 1e9

    # Write the output (copied to host memory first).
    npzwrite("result_$exp_id.npy", Array(C))

    # ---- Total time (including I/O) ends ----
    total_elapsed = (time_ns() - total_start) / 1e9

    # Store the result row.
    push!(results, (exp_id, res, string(prec), string(dev), compute_elapsed, total_elapsed))
end

CSV.write("benchmark_results.csv", results)
println("✅ 所有实验完成，结果已保存到 benchmark_results.csv")

▶️ Running experiment 1 / 16: res=10 m, precision=Float32, device=gpu
▶️ Running experiment 2 / 16: res=10 m, precision=Float64, device=gpu
▶️ Running experiment 3 / 16: res=20 m, precision=Float32, device=gpu
▶️ Running experiment 4 / 16: res=20 m, precision=Float64, device=gpu
▶️ Running experiment 5 / 16: res=50 m, precision=Float32, device=gpu
▶️ Running experiment 6 / 16: res=50 m, precision=Float64, device=gpu
▶️ Running experiment 7 / 16: res=100 m, precision=Float32, device=gpu
▶️ Running experiment 8 / 16: res=100 m, precision=Float64, device=gpu
✅ 所有实验完成，结果已保存到 benchmark_results.csv
