Skip to content

Commit 76cee36

Browse files
authored
[CUDA] Fix build for sm<53 (#24582)
### Description There is a build error for `--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=52`. Some half2 functions like `__hfma2`, used in MatMul 8 bits, are not defined for sm < 53. Add an implementation that does not use half2 for those old GPUs. Also fix another build error with CUDA 12.5, caused by an extra `const` in the MOE code path for sm < 53. ### Motivation and Context Fix the nuget packaging pipeline, which uses `CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90-virtual`.
1 parent f7619dc commit 76cee36

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,7 +1041,7 @@ void initialize_moe_routing_kernelLauncher(const T *unpermuted_input, T *permute
10411041
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530
10421042
template <typename T, int RESIDUAL_NUM>
10431043
__global__ void finalize_moe_routing_kernel(const T *, T *, const T *, const T *, const T *, const T *, const int *,
1044-
const int *, int, const int) {
1044+
const int *, int, int) {
10451045
// Does not support pre-Kepler architectures
10461046
;
10471047
}
@@ -1168,4 +1168,4 @@ template void finalize_moe_routing_kernelLauncher(const float *, float *, const
11681168
template void finalize_moe_routing_kernelLauncher(const half *, half *, const half *, const half *, const half *,
11691169
const half *, const int *, const int *, int, int, int, cudaStream_t);
11701170

1171-
} // namespace ort_fastertransformer
1171+
} // namespace ort_fastertransformer

onnxruntime/contrib_ops/cuda/quantization/matmul_8bits.cu

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ __device__ __forceinline__ void AccumulateEightElements8b(
3232
const half* a, // Pointer to 8 half values from A
3333
half* sums) { // Pointer to 8 partial sums (half)
3434

35+
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530)
3536
// --- Dequantization Setup ---
3637
half2 scale_h2 = __half2half2(scale); // Broadcast scale
3738
half zp_h = __ushort2half_rn(zp); // Convert zp to half
@@ -74,6 +75,27 @@ __device__ __forceinline__ void AccumulateEightElements8b(
7475
sums_half2[1] = __hfma2(a_vec1, b_vec1, sums_half2[1]); // {s2+=a2*b2, s3+=a3*b3}
7576
sums_half2[2] = __hfma2(a_vec2, b_vec2, sums_half2[2]); // {s4+=a4*b4, s5+=a5*b5}
7677
sums_half2[3] = __hfma2(a_vec3, b_vec3, sums_half2[3]); // {s6+=a6*b6, s7+=a7*b7}
78+
79+
#else // older GPUs of compute capability < 5.3, which lacks native half support.
80+
float scale_f = __half2float(scale);
81+
float zp_f = static_cast<float>(zp);
82+
83+
float b_dequant[8];
84+
#pragma unroll
85+
for (int i = 0; i < 8; ++i) {
86+
uint8_t q = (values_quant >> (i * 8)) & 0xFF;
87+
b_dequant[i] = (static_cast<float>(q) - zp_f) * scale_f;
88+
}
89+
90+
#pragma unroll
91+
for (int i = 0; i < 8; ++i) {
92+
float a_f = __half2float(a[i]);
93+
float product_f = a_f * b_dequant[i];
94+
// Convert back to half for partial sums. It is not ideal for performance.
95+
half product_h = __float2half_rn(product_f);
96+
sums[i] += product_h;
97+
}
98+
#endif
7799
}
78100

79101
// --- Device Function: Accumulate 8 Elements (float precision) ---

0 commit comments

Comments
 (0)