Description
Summary
When performing QR decomposition on CUDA device the computed matrix Q
contains incorrect elements during multiple runs for matrices of size 256x256
or larger.
This problem does not occur when running on Intel.
Version
I am using the latest release version v0.6
Environment
- HW you use : NVIDIA A100 80GB PCIe
- Backend library version: Build cuda_12.4.r12.4/compiler.33961263_0
- OS name and version : Linux Ubuntu 24.04 LTS 6.8.0-51-generic
- Compiler version : Intel(R) oneAPI DPC++/C++ Compiler 2025.0.4 (2025.0.4.20241205)
$ sycl-ls
[opencl:cpu][opencl:0] Intel(R) OpenCL, Intel(R) Xeon(R) Platinum 8480+ OpenCL 3.0 (Build 0) [2024.18.12.0.05_160000]
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100 80GB PCIe 8.0 [CUDA 12.4]
Steps to reproduce
Building:
cmake .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DMKL_ROOT=${MKLROOT} -DBUILD_FUNCTIONAL_TESTS=OFF \
-DENABLE_CUBLAS_BACKEND=True -DENABLE_CUSOLVER_BACKEND=True -DENABLE_CURAND_BACKEND=True -DENABLE_CUFFT_BACKEND=True
# build
cmake --build .
# install
export MKL_INTERFACE_ROOT=$(pwd)/../install
cmake --install . --prefix=${MKL_INTERFACE_ROOT}
Compiling:
icpx -fsycl geqrf_orgqr_repro.cpp -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown -I${MKL_INTERFACE_ROOT}/include
\
-L${MKL_INTERFACE_ROOT}/lib -lonemkl -Wl,-rpath,${MKL_INTERFACE_ROOT}/lib -o run_repro
Running:
./run_repro 256 # set matrix size
Reproducer here
#include <iostream>
#include <vector>
#include <sycl/sycl.hpp>
#include "oneapi/mkl.hpp"
#include <mkl_version.h>
void run_qr_decomposition(sycl::queue& q,
std::vector<float>& diagonal_elements,
std::int64_t n)
{
using T = float;
const std::int64_t lda = n;
std::vector<T> h_A(n * n, 0.0f);
for (std::int64_t i = 0; i < n; ++i) {
h_A[i * n + i] = static_cast<T>(2);
}
T* d_A = sycl::malloc_device<T>(n * n, q);
T* d_tau = sycl::malloc_device<T>(n, q);
auto copy_event = q.memcpy(d_A, h_A.data(), h_A.size() * sizeof(T));
const std::int64_t scratchpad_size_geqrf =
oneapi::mkl::lapack::geqrf_scratchpad_size<T>(q, n, n, lda);
T* scratchpad_geqrf = sycl::malloc_device<T>(scratchpad_size_geqrf, q);
sycl::event geqrf_event;
try {
geqrf_event = oneapi::mkl::lapack::geqrf(q, n, n, d_A, lda, d_tau, scratchpad_geqrf, scratchpad_size_geqrf, {copy_event});
const std::int64_t scratchpad_size_orgqr =
oneapi::mkl::lapack::orgqr_scratchpad_size<T>(q, n, n, n, lda);
T* scratchpad_orgqr = sycl::malloc_device<T>(scratchpad_size_orgqr, q);
auto orgqr_event = oneapi::mkl::lapack::orgqr(q, n, n, n, d_A, lda, d_tau, scratchpad_orgqr, scratchpad_size_orgqr, {geqrf_event});
orgqr_event.wait();
sycl::free(scratchpad_orgqr, q);
} catch (sycl::exception const& e) {
std::cout << "SYCL exception: " << e.what() << std::endl;
return;
} catch (oneapi::mkl::lapack::exception const& e) {
std::cout << "MKL exception: " << e.what() << ", info: " << e.info() << std::endl;
return;
} catch (...) {
std::cout << "Unknown exception occurred." << std::endl;
return;
}
std::vector<T> h_Q(n * n);
q.memcpy(h_Q.data(), d_A, h_Q.size() * sizeof(T)).wait();
for (std::int64_t i = 0; i < n; ++i) {
diagonal_elements.push_back(h_Q[i * n + i]);
}
sycl::free(d_A, q);
sycl::free(d_tau, q);
sycl::free(scratchpad_geqrf, q);
}
int main(int argc, char* argv[]) {
if (argc != 2) {
std::cout << "Usage: " << argv[0] << " <matrix size n>\n";
return 1;
}
std::int64_t n = std::atoi(argv[1]);
if (n <= 0) {
std::cout << "Matrix size n must be a positive integer.\n";
return 1;
}
std::cout << "Matrix size: " << n << "x" << n << " with diagonal elements set to 2.\n";
sycl::queue q_cpu(sycl::cpu_selector{});
sycl::queue q_gpu(sycl::gpu_selector{});
std::cout << "Intel OneMKL version: " << INTEL_MKL_VERSION << std::endl;
const auto &d_cpu = q_cpu.get_device();
const auto &name_cpu = d_cpu.get_info<sycl::info::device::name>();
const auto &dver_cpu = d_cpu.get_info<sycl::info::device::driver_version>();
const auto &d_gpu = q_gpu.get_device();
const auto &name_gpu = d_gpu.get_info<sycl::info::device::name>();
const auto &dver_gpu = d_gpu.get_info<sycl::info::device::driver_version>();
std::cout << "CPU info: " << "Driver: " << name_cpu << " [" << dver_cpu << " ]" << std::endl;
std::cout << "GPU info: " << "Driver: " << name_gpu << " [" << dver_gpu << " ]" << std::endl;
std::vector<float> diag_cpu;
std::vector<float> diag_gpu;
for (int i = 0; i < 2; ++i) {
std::cout << "Run " << i + 1 << " on CPU:" << std::endl;
run_qr_decomposition(q_cpu, diag_cpu, n);
std::cout << "Ok" << std::endl;
}
for (int i = 0; i < 2; ++i) {
std::cout << "Run " << i + 1 << " on GPU:" << std::endl;
run_qr_decomposition(q_gpu, diag_gpu, n);
std::cout << "Ok" << std::endl;
}
std::cout << "Comparing diagonal elements CPU vs GPU: \n";
for (std::size_t i = 0; i < diag_cpu.size(); ++i) {
if (std::abs(diag_cpu[i] - diag_gpu[i]) > 1e-6) {
std::cout << "Run " << (i / n + 1) << ", Row: " << (i % n)
<< " CPU: " << diag_cpu[i]
<< " GPU: " << diag_gpu[i]
<< " Difference: " << std::abs(diag_cpu[i] - diag_gpu[i]) << std::endl;
}
}
return 0;
}
Observed behavior
For simplicity, a diagonal matrix with elements set to 2
is used as input.
After the first QR decomposition the diagonal elements of the matrix Q
are correct (1
as expected).
However on later runs the diagonal elements remain set to 2
resulting in incorrect results.
This problem is not observed on Intel devices.
Example output
$ ./run_repro 256
Matrix size: 256x256 with diagonal elements set to 2.
Intel OneMKL version: 20250001
CPU info: Driver: Intel(R) Xeon(R) Platinum 8480+ [2024.18.12.0.05_160000 ]
GPU info: Driver: NVIDIA A100 80GB PCIe [CUDA 12.4 ]
Run 1 on CPU:
Ok
Run 2 on CPU:
Ok
Run 1 on GPU:
Ok
Run 2 on GPU:
Ok
Comparing diagonal elements CPU vs GPU:
Run 2, Row: 86 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 87 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 88 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 89 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 90 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 91 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 92 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 93 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 94 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 95 CPU: 1 GPU: 2 Difference: 1
Run 2, Row: 96 CPU: 1 GPU: 2 Difference: 1
...
Expected behavior
The diagonal elements of the Q
matrix should be 1
after QR decomposition for every run regardless of the device used as the input matrix is diagonal.