Describe the bug
The Cuda backend seems to have a performance issue when results are kept in GPU memory (VMEM). In the following example, the first iteration is very slow (about 9 seconds), while subsequent iterations are fast (about 0.2 seconds). Copying the device tensor into TensorData and back into a device tensor speeds up the first iteration (hack = true in the example; the round-trip is sketched below). This was tested on the current main branch.
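For clarity, this is the workaround in isolation, a minimal sketch of the round-trip used in the repro below (t is assumed to be a tensor already living on device):

    // Round-trip the tensor through host memory and back to the device.
    // Device -> host copy:
    let td = t.into_data();
    // Host -> device copy (fresh device allocation):
    t = Tensor::from_data(td, device);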
To Reproduce
use burn::prelude::*; // brings Backend, Tensor, TensorData into scope

fn test_memory_alloc() {
    // Preload libtorch so the LibTorch backend can find it.
    unsafe { libloading::Library::new("libtorch.so").unwrap() };

    // Number of timed iterations per backend.
    let loops = 8;

    test_device_memory_alloc::<burn::backend::NdArray>(
        &burn::backend::ndarray::NdArrayDevice::Cpu,
        loops,
    );
    test_device_memory_alloc::<burn::backend::Candle>(
        &burn::backend::candle::CandleDevice::cuda(0),
        loops,
    );
    test_device_memory_alloc::<burn::backend::LibTorch>(
        &burn::backend::libtorch::LibTorchDevice::Cuda(0),
        loops,
    );
    test_device_memory_alloc::<burn::backend::Cuda>(
        &burn::backend::cuda::CudaDevice::new(0),
        loops,
    );
}

fn test_device_memory_alloc<B: Backend>(device: &<B as Backend>::Device, loops: usize) {
    println!("testing backend: {:?}, device: {:?}", B::name(), device);

    const DIM: usize = 4;
    let shape = [32, 4, 84, 84];
    let l = shape.iter().product();
    let data_v = vec![0; l];
    let steps = 128;
    let hack = false;

    for i in 0..loops {
        let start = std::time::Instant::now();

        // Create `steps` tensors on the device from host data.
        let mut in_vec = vec![];
        for _ in 0..steps {
            let td = TensorData::new(data_v.clone(), shape);
            let t: Tensor<B, DIM> = Tensor::from_data(td, device);
            in_vec.push(t);
        }

        // Run a simple elementwise op, keeping the results on the device.
        let mut out_vec = vec![];
        for mut t in in_vec {
            t = t.add_scalar(1.0);
            if hack {
                // Workaround: round-trip through TensorData avoids the slow first iteration.
                let td = t.into_data();
                t = burn::tensor::Tensor::from_data(td, device);
            }
            out_vec.push(t);
        }

        // Force the results back to the host.
        let mut sum: f32 = 0.0;
        for t in out_vec {
            sum += t.sum().into_scalar().to_f32();
        }

        println!(
            "i: {}, sum: {}, elapsed: {}",
            i,
            sum,
            start.elapsed().as_secs_f64()
        );
    }
}
Expected behavior
Expected iteration i: 0 of backend "fusion<jit>", device Cuda(0), to take less than one second. Observed output:
testing backend: "ndarray", device: Cpu
i: 0, sum: 115605500, elapsed: 0.55636384
i: 1, sum: 115605500, elapsed: 0.462195338
i: 2, sum: 115605500, elapsed: 0.454074468
i: 3, sum: 115605500, elapsed: 0.395695947
i: 4, sum: 115605500, elapsed: 0.39472438
i: 5, sum: 115605500, elapsed: 0.394821323
i: 6, sum: 115605500, elapsed: 0.39547931
i: 7, sum: 115605500, elapsed: 0.395147096
testing backend: "candle", device: Cuda(CudaDevice { device: CudaDevice(DeviceId(1)), index: 0 })
i: 0, sum: 115605500, elapsed: 0.304088321
i: 1, sum: 115605500, elapsed: 0.244612769
i: 2, sum: 115605500, elapsed: 0.243556794
i: 3, sum: 115605500, elapsed: 0.24344835
i: 4, sum: 115605500, elapsed: 0.243947939
i: 5, sum: 115605500, elapsed: 0.243708218
i: 6, sum: 115605500, elapsed: 0.244127436
i: 7, sum: 115605500, elapsed: 0.243139218
testing backend: "tch", device: Cuda(0)
i: 0, sum: 115605500, elapsed: 0.206799282
i: 1, sum: 115605500, elapsed: 0.169857142
i: 2, sum: 115605500, elapsed: 0.168459104
i: 3, sum: 115605500, elapsed: 0.168022733
i: 4, sum: 115605500, elapsed: 0.168296829
i: 5, sum: 115605500, elapsed: 0.171444608
i: 6, sum: 115605500, elapsed: 0.171398742
i: 7, sum: 115605500, elapsed: 0.168834901
testing backend: "fusion<jit>", device: Cuda(0)
i: 0, sum: 115605500, elapsed: 9.259544625
i: 1, sum: 115605500, elapsed: 0.221510606
i: 2, sum: 115605500, elapsed: 0.218899816
i: 3, sum: 115605500, elapsed: 0.219641511
i: 4, sum: 115605500, elapsed: 0.220051723
i: 5, sum: 115605500, elapsed: 0.218108528
i: 6, sum: 115605500, elapsed: 0.218188098
i: 7, sum: 115605500, elapsed: 0.220182168
Desktop (please complete the following information):
Fedora 41, NVIDIA CUDA 12.6