Skip to content

[DFT][cuFFT] Invalid strides error on CUDA #631

Open
@vlad-perevezentsev

Description

@vlad-perevezentsev

Summary

When performing DPF transform with oneapi::mkl::dft on NVIDIA GPU for multidimensional arrays >=2D where the last dimension is 1 [(2,1), (5,1), (2,3,1)....]
The computation fails with oneMKL: dft/backends/cufft/commit: Invalid strides

This problem does not occur when running on Intel.

Version

I am using the latest release version v0.6

Environment

  • HW you use : NVIDIA A100 80GB PCIe
  • Backend library version: Build cuda_12.4.r12.4/compiler.33961263_0
  • OS name and version : Linux Ubuntu 24.04 LTS 6.8.0-51-generic
  • Compiler version : Intel(R) oneAPI DPC++/C++ Compiler 2025.0.4 (2025.0.4.20241205)
$ sycl-ls
[opencl:cpu][opencl:0] Intel(R) OpenCL, Intel(R) Xeon(R) Platinum 8480+ OpenCL 3.0 (Build 0) [2024.18.12.0.05_160000]
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100 80GB PCIe 8.0 [CUDA 12.7]

Steps to reproduce

Building:

cmake .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DMKL_ROOT=${MKLROOT} -DBUILD_FUNCTIONAL_TESTS=OFF              \
    -DENABLE_CUBLAS_BACKEND=True  -DENABLE_CUSOLVER_BACKEND=True -DENABLE_CURAND_BACKEND=True -DENABLE_CUFFT_BACKEND=True

# build
cmake --build .

# install
export MKL_INTERFACE_ROOT=$(pwd)/../install
cmake --install . --prefix=${MKL_INTERFACE_ROOT}

Compiling:

icpx -fsycl fft_repro.cpp -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown -I${MKL_INTERFACE_ROOT}/include
\
-L${MKL_INTERFACE_ROOT}/lib -lonemkl -Wl,-rpath,${MKL_INTERFACE_ROOT}/lib -o run_repro

Running:

ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro
ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro 
Reproducer here
#include <iostream>
#include <vector>
#include <sycl/sycl.hpp>
#include <oneapi/mkl/dft.hpp>
#include <complex>

int main() {
    namespace dft = oneapi::mkl::dft;
    sycl::queue q(sycl::default_selector{});

    auto device = q.get_device();
    std::cout << "Device name: " << device.get_info<sycl::info::device::name>() << "\n";

    std::vector<std::int64_t> shape = {3, 1};
    const std::int64_t size = shape[0] * shape[1];

    std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});
    std::vector<std::complex<float>> h_output(size);

    std::complex<float>* d_data = sycl::malloc_device<std::complex<float>>(size, q);
    std::complex<float>* d_output = sycl::malloc_device<std::complex<float>>(size, q);

    q.memcpy(d_data, h_data.data(), size * sizeof(std::complex<float>)).wait();

    dft::descriptor<dft::precision::SINGLE, dft::domain::COMPLEX> desc(shape);

    std::vector<std::int64_t> fwd_strides = {0, 1, 1};
    std::vector<std::int64_t> bwd_strides = {0, 1, 1};

    desc.set_value(dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE);
    desc.set_value(dft::config_param::NUMBER_OF_TRANSFORMS, 1);
    // desc.set_value(dft::config_param::FWD_STRIDES, fwd_strides.data());
    // desc.set_value(dft::config_param::BWD_STRIDES, bwd_strides.data());

    desc.commit(q);

    sycl::event fft_event = dft::compute_forward(desc, d_data, d_output);
    fft_event.wait();

    q.memcpy(h_output.data(), d_output, size * sizeof(std::complex<float>)).wait();

    std::cout << "FFT Result:\n";
    for (std::int64_t i = 0; i < size; i++) {
        std::cout << h_output[i] << "\n";
    }

    sycl::free(d_data, q);
    sycl::free(d_output, q);

    return 0;
}

Observed behavior

When executing the attached code on NVIDIA the program fails with an error

Example output

$ ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro 

Device name: NVIDIA A100 80GB PCIe
terminate called after throwing an instance of 'oneapi::mkl::exception'
  what():  oneMKL: dft/backends/cufft/commit: Invalid strides.
Aborted (core dumped)

However, running the same code on Intel works as expected

$ ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro 

Device name: Intel(R) Xeon(R) Platinum 8480+
FFT Result:
(3,0)
(0,0)
(0,0)

Expected behavior

Similar implementation that uses cuFFT directly works correctly

$ ./fft_cuda 
FFT Result:
(3, 0)
(0, 0)
(0, 0)
Reproducer here
nvcc -o fft_cuda fft_cuda.cpp -lcufft
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cufft.h>
#include <complex>


int main() {
    const int NX = 1;
    const int NY = 3;
    const int size = NX * NY;

    std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});

    cufftComplex* d_data;
    cudaMalloc(reinterpret_cast<void**>(&d_data), size * sizeof(cufftComplex));

    std::vector<cufftComplex> h_cufft_data(size);
    for (int i = 0; i < size; i++) {
        h_cufft_data[i].x = std::real(h_data[i]);
        h_cufft_data[i].y = std::imag(h_data[i]);
    }

    cudaMemcpy(d_data, h_cufft_data.data(), size * sizeof(cufftComplex), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan2d(&plan, NY, NX, CUFFT_C2C);

    cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);

    cudaMemcpy(h_cufft_data.data(), d_data, size * sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    std::cout << "FFT Result:\n";
    for (int i = 0; i < size; i++) {
        std::cout << "(" << h_cufft_data[i].x << ", " << h_cufft_data[i].y << ")\n";
    }

    cufftDestroy(plan);
    cudaFree(d_data);

    return 0;
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugA request to fix an issuehelp wantedTasks, issues or features that could be implemented and contributed to the project

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions