Open
Description
Summary
When performing DPF transform
with oneapi::mkl::dft
on NVIDIA GPU for multidimensional arrays >=2D where the last dimension is 1
[(2,1), (5,1), (2,3,1)....]
The computation fails with oneMKL: dft/backends/cufft/commit: Invalid strides
This problem does not occur when running on Intel.
Version
I am using the latest release version v0.6
Environment
- HW you use : NVIDIA A100 80GB PCIe
- Backend library version: Build cuda_12.4.r12.4/compiler.33961263_0
- OS name and version : Linux Ubuntu 24.04 LTS 6.8.0-51-generic
- Compiler version : Intel(R) oneAPI DPC++/C++ Compiler 2025.0.4 (2025.0.4.20241205)
$ sycl-ls
[opencl:cpu][opencl:0] Intel(R) OpenCL, Intel(R) Xeon(R) Platinum 8480+ OpenCL 3.0 (Build 0) [2024.18.12.0.05_160000]
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100 80GB PCIe 8.0 [CUDA 12.7]
Steps to reproduce
Building:
cmake .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DMKL_ROOT=${MKLROOT} -DBUILD_FUNCTIONAL_TESTS=OFF \
-DENABLE_CUBLAS_BACKEND=True -DENABLE_CUSOLVER_BACKEND=True -DENABLE_CURAND_BACKEND=True -DENABLE_CUFFT_BACKEND=True
# build
cmake --build .
# install
export MKL_INTERFACE_ROOT=$(pwd)/../install
cmake --install . --prefix=${MKL_INTERFACE_ROOT}
Compiling:
icpx -fsycl fft_repro.cpp -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown -I${MKL_INTERFACE_ROOT}/include
\
-L${MKL_INTERFACE_ROOT}/lib -lonemkl -Wl,-rpath,${MKL_INTERFACE_ROOT}/lib -o run_repro
Running:
ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro
ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro
Reproducer here
#include <iostream>
#include <vector>
#include <sycl/sycl.hpp>
#include <oneapi/mkl/dft.hpp>
#include <complex>
int main() {
namespace dft = oneapi::mkl::dft;
sycl::queue q(sycl::default_selector{});
auto device = q.get_device();
std::cout << "Device name: " << device.get_info<sycl::info::device::name>() << "\n";
std::vector<std::int64_t> shape = {3, 1};
const std::int64_t size = shape[0] * shape[1];
std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});
std::vector<std::complex<float>> h_output(size);
std::complex<float>* d_data = sycl::malloc_device<std::complex<float>>(size, q);
std::complex<float>* d_output = sycl::malloc_device<std::complex<float>>(size, q);
q.memcpy(d_data, h_data.data(), size * sizeof(std::complex<float>)).wait();
dft::descriptor<dft::precision::SINGLE, dft::domain::COMPLEX> desc(shape);
std::vector<std::int64_t> fwd_strides = {0, 1, 1};
std::vector<std::int64_t> bwd_strides = {0, 1, 1};
desc.set_value(dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE);
desc.set_value(dft::config_param::NUMBER_OF_TRANSFORMS, 1);
// desc.set_value(dft::config_param::FWD_STRIDES, fwd_strides.data());
// desc.set_value(dft::config_param::BWD_STRIDES, bwd_strides.data());
desc.commit(q);
sycl::event fft_event = dft::compute_forward(desc, d_data, d_output);
fft_event.wait();
q.memcpy(h_output.data(), d_output, size * sizeof(std::complex<float>)).wait();
std::cout << "FFT Result:\n";
for (std::int64_t i = 0; i < size; i++) {
std::cout << h_output[i] << "\n";
}
sycl::free(d_data, q);
sycl::free(d_output, q);
return 0;
}
Observed behavior
When executing the attached code on NVIDIA the program fails with an error
Example output
$ ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro
Device name: NVIDIA A100 80GB PCIe
terminate called after throwing an instance of 'oneapi::mkl::exception'
what(): oneMKL: dft/backends/cufft/commit: Invalid strides.
Aborted (core dumped)
However, running the same code on Intel works as expected
$ ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro
Device name: Intel(R) Xeon(R) Platinum 8480+
FFT Result:
(3,0)
(0,0)
(0,0)
Expected behavior
Similar implementation that uses cuFFT directly works correctly
$ ./fft_cuda
FFT Result:
(3, 0)
(0, 0)
(0, 0)
Reproducer here
nvcc -o fft_cuda fft_cuda.cpp -lcufft
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cufft.h>
#include <complex>
int main() {
const int NX = 1;
const int NY = 3;
const int size = NX * NY;
std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});
cufftComplex* d_data;
cudaMalloc(reinterpret_cast<void**>(&d_data), size * sizeof(cufftComplex));
std::vector<cufftComplex> h_cufft_data(size);
for (int i = 0; i < size; i++) {
h_cufft_data[i].x = std::real(h_data[i]);
h_cufft_data[i].y = std::imag(h_data[i]);
}
cudaMemcpy(d_data, h_cufft_data.data(), size * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan2d(&plan, NY, NX, CUFFT_C2C);
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
cudaMemcpy(h_cufft_data.data(), d_data, size * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
std::cout << "FFT Result:\n";
for (int i = 0; i < size; i++) {
std::cout << "(" << h_cufft_data[i].x << ", " << h_cufft_data[i].y << ")\n";
}
cufftDestroy(plan);
cudaFree(d_data);
return 0;
}