Skip to content

Cuda memory allocation slow #2772

@rztz

Description

@rztz

Describe the bug
The Cuda backend seems to have a performance issue when keeping results in VMEM.

The following example is very slow in the first iteration (about 9 seconds).
Following iterations are fast (about 0.2 seconds)

Copying the VMEM tensor into `TensorData` (host memory) and back to a VMEM tensor results in a speed-up of the first iteration.
(Set hack = true in the example to enable this workaround.)

This was tested on the current main branch.

To Reproduce

/// Runs the allocation benchmark on every backend under comparison
/// (NdArray/CPU, Candle/CUDA, LibTorch/CUDA, burn CUDA JIT).
fn test_memory_alloc() {
    // Preload libtorch so the LibTorch backend can resolve its symbols.
    // NOTE: the original code dropped the `Library` handle immediately at the
    // end of the statement, which asks the dynamic loader to unload the
    // library again (`dlclose` on Drop). Leak the handle instead so the
    // library is guaranteed to stay mapped for the whole process lifetime.
    std::mem::forget(unsafe { libloading::Library::new("libtorch.so").unwrap() });

    // Number of timed iterations per backend.
    let loops = 8;

    test_device_memory_alloc::<burn::backend::NdArray>(
        &burn::backend::ndarray::NdArrayDevice::Cpu,
        loops,
    );

    test_device_memory_alloc::<burn::backend::Candle>(
        &burn::backend::candle::CandleDevice::cuda(0),
        loops,
    );

    test_device_memory_alloc::<burn::backend::LibTorch>(
        &burn::backend::libtorch::LibTorchDevice::Cuda(0),
        loops,
    );

    test_device_memory_alloc::<burn::backend::Cuda>(
        &burn::backend::cuda::CudaDevice::new(0),
        loops,
    );
}

/// Measures, for `loops` iterations, the wall-clock time to: upload 128
/// zero-filled `[32, 4, 84, 84]` tensors to `device`, apply one elementwise
/// `add_scalar` to each (keeping results on-device unless `hack` is set),
/// then reduce each result to a scalar and accumulate an `f32` checksum.
/// Prints the per-iteration elapsed time and checksum.
fn test_device_memory_alloc<B: Backend>(device: &<B as Backend>::Device, loops: usize) {
    println!("testing backend: {:?}, device: {:?}", B::name(), device);
    const DIM: usize = 4;
    let shape = [32, 4, 84, 84];
    let element_count = shape.iter().product();
    let data_v = vec![0; element_count];

    // Tensors created (and kept alive) per iteration.
    let steps = 128;
    // When true, round-trip each result through host memory — the workaround
    // described in the report above.
    let hack = false;

    for i in 0..loops {
        let start = std::time::Instant::now();

        // Upload `steps` identical tensors to the device.
        let in_vec: Vec<Tensor<B, DIM>> = (0..steps)
            .map(|_| Tensor::from_data(TensorData::new(data_v.clone(), shape), device))
            .collect();

        // One elementwise op per tensor; results stay on-device unless `hack`.
        let mut out_vec = Vec::with_capacity(steps);
        for tensor in in_vec {
            let mut result = tensor.add_scalar(1.0);
            if hack {
                // Copy device -> host -> device to force a fresh allocation.
                let host = result.into_data();
                result = burn::tensor::Tensor::from_data(host, device);
            }
            out_vec.push(result);
        }

        // Reduce every tensor to a scalar and fold into an f32 checksum
        // (summed in order, identical to the original accumulation loop).
        let sum: f32 = out_vec
            .into_iter()
            .map(|t| t.sum().into_scalar().to_f32())
            .sum();

        println!(
            "i: {}, sum: {}, elapsed: {}",
            i,
            sum,
            start.elapsed().as_secs_f64()
        );
    }
}

Expected behavior
Expected less than one second in iteration i:0 of backend: "fusion<jit>", device: Cuda(0)

testing backend: "ndarray", device: Cpu
i: 0, sum: 115605500, elapsed: 0.55636384
i: 1, sum: 115605500, elapsed: 0.462195338
i: 2, sum: 115605500, elapsed: 0.454074468
i: 3, sum: 115605500, elapsed: 0.395695947
i: 4, sum: 115605500, elapsed: 0.39472438
i: 5, sum: 115605500, elapsed: 0.394821323
i: 6, sum: 115605500, elapsed: 0.39547931
i: 7, sum: 115605500, elapsed: 0.395147096
testing backend: "candle", device: Cuda(CudaDevice { device: CudaDevice(DeviceId(1)), index: 0 })
i: 0, sum: 115605500, elapsed: 0.304088321
i: 1, sum: 115605500, elapsed: 0.244612769
i: 2, sum: 115605500, elapsed: 0.243556794
i: 3, sum: 115605500, elapsed: 0.24344835
i: 4, sum: 115605500, elapsed: 0.243947939
i: 5, sum: 115605500, elapsed: 0.243708218
i: 6, sum: 115605500, elapsed: 0.244127436
i: 7, sum: 115605500, elapsed: 0.243139218
testing backend: "tch", device: Cuda(0)
i: 0, sum: 115605500, elapsed: 0.206799282
i: 1, sum: 115605500, elapsed: 0.169857142
i: 2, sum: 115605500, elapsed: 0.168459104
i: 3, sum: 115605500, elapsed: 0.168022733
i: 4, sum: 115605500, elapsed: 0.168296829
i: 5, sum: 115605500, elapsed: 0.171444608
i: 6, sum: 115605500, elapsed: 0.171398742
i: 7, sum: 115605500, elapsed: 0.168834901
testing backend: "fusion<jit>", device: Cuda(0)
i: 0, sum: 115605500, elapsed: 9.259544625
i: 1, sum: 115605500, elapsed: 0.221510606
i: 2, sum: 115605500, elapsed: 0.218899816
i: 3, sum: 115605500, elapsed: 0.219641511
i: 4, sum: 115605500, elapsed: 0.220051723
i: 5, sum: 115605500, elapsed: 0.218108528
i: 6, sum: 115605500, elapsed: 0.218188098
i: 7, sum: 115605500, elapsed: 0.220182168

Screenshots

Desktop (please complete the following information):
Fedora 41, Nvidia cuda 12.6

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions