From 5c4e38ab17eb530e950e68e1d45ea7a2fcd25cea Mon Sep 17 00:00:00 2001 From: manjam01 <Manaal.Jamadar@arm.com> Date: Thu, 27 Feb 2025 09:39:06 +0000 Subject: [PATCH 1/5] Optimize gemv_n_sve kernel --- kernel/arm64/KERNEL.ARMV8SVE | 2 +- kernel/arm64/gemv_n_sve.c | 83 ++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 13 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index dc58e329fc..9adacce632 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -74,7 +74,7 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = gemv_n_sve.c DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S diff --git a/kernel/arm64/gemv_n_sve.c b/kernel/arm64/gemv_n_sve.c index 2950555615..59a5c85572 100644 --- a/kernel/arm64/gemv_n_sve.c +++ b/kernel/arm64/gemv_n_sve.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2024, The OpenBLAS Project +Copyright (c) 2024-2025, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,23 +59,82 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr = a; if (inc_y == 1) { + BLASLONG width = n / 3; uint64_t sve_size = SV_COUNT(); - for (j = 0; j < n; j++) { - SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); - i = 0; - svbool_t pg = SV_WHILE(i, m); - while (svptest_any(SV_TRUE(), pg)) { - SV_TYPE a_vec = svld1(pg, a_ptr + i); + svbool_t pg_true = SV_TRUE(); + svbool_t pg = SV_WHILE(0, m % sve_size); + + FLOAT *a0_ptr = a + lda * width * 0; + FLOAT *a1_ptr = a + lda * width * 1; + FLOAT *a2_ptr = a + lda * width * 2; + + for (j = 0; j < width; j++) { + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { + ix = j * inc_x; + + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + + SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); + + SV_TYPE y_vec = svld1(pg_true, y + i); + y_vec = svmla_lane(y_vec, a00_vec, x0_vec, 0); + y_vec = svmla_lane(y_vec, a01_vec, x1_vec, 0); + y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0); + + svst1(pg_true, y + i, y_vec); + } + + if (i < m) { + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + + SV_TYPE a00_vec = svld1(pg, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg, a2_ptr + i); + SV_TYPE y_vec = svld1(pg, y + i); - y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); + y_vec = svmla_m(pg, y_vec, a00_vec, x0_vec); + y_vec = svmla_m(pg, y_vec, a01_vec, x1_vec); + y_vec = svmla_m(pg, y_vec, a02_vec, x2_vec); + + ix += inc_x; + svst1(pg, y + i, y_vec); - i += sve_size; - pg = SV_WHILE(i, m); } + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + } + + a_ptr = a2_ptr; + for (j = width * 3; j < n; j++) { + ix = j * inc_x; + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { + SV_TYPE y_vec = svld1(pg_true, y + i); + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); + SV_TYPE a_vec = svld1(pg_true, a_ptr + i); + y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); + svst1(pg_true, y + i, y_vec); + } + + if (i < m) { + SV_TYPE y_vec = svld1(pg, y + i); + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); + SV_TYPE a_vec = svld1(pg, a_ptr + i); + y_vec = svmla_m(pg, y_vec, a_vec, x_vec); + svst1(pg, y + i, y_vec); + } + a_ptr += lda; ix += inc_x; } - return(0); + return (0); } for (j = 0; j < n; j++) { @@ -89,4 +148,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ix += inc_x; } return (0); -} +} \ No newline at end of file From 80d3c2ad95781211b77272a1cfc9d77ba7ec402d Mon Sep 17 00:00:00 2001 From: Masato Nakagawa <nakagawa.masato@fujitsu.com> Date: Tue, 11 Mar 2025 20:18:20 +0900 Subject: [PATCH 2/5] Add Improving Load Imbalance in Thread-Parallel GEMM --- driver/level3/level3_thread.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 9b1aadf7dc..77aaeee6b9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -591,7 +591,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG nthreads = args -> nthreads; - BLASLONG width, i, j, k, js; + BLASLONG width, width_n, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; #if defined(DYNAMIC_ARCH) @@ -740,18 +740,25 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG /* Partition (a step of) n into nthreads regions */ range_N[0] = js; num_parts = 0; - while (n > 0){ - width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < switch_ratio) { - width = switch_ratio; + for(j = 0; j < nthreads_n; j++){ + width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j); + n -= width_n; + for(i = 0; i < nthreads_m; i++){ + width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i); + if (width < switch_ratio) { + width = switch_ratio; + } + width = round_up(width_n, width, GEMM_PREFERED_SIZE); + + width_n -= width; + if (width_n < 0) { + width = width + width_n; + width_n = 0; + } + range_N[num_parts + 1] = range_N[num_parts] + width; + + num_parts ++; } - width = round_up(n, width, GEMM_PREFERED_SIZE); - - n -= width; - if (n < 0) width = width + n; - range_N[num_parts + 1] = range_N[num_parts] + width; - - num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { range_N[j + 1] = range_N[num_parts]; From a085b6c9ec38de7109fe95322db677fc18c31696 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat <annop.wongwathanarat@arm.com> Date: Wed, 12 Mar 2025 14:49:10 +0000 Subject: [PATCH 3/5] Fix aarch64 sbgemv_t compilation error for GCC < 13 --- CONTRIBUTORS.md | 1 + kernel/arm64/sbgemv_t_bfdot.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 041582892d..938a3bf918 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -237,6 +237,7 @@ In chronological order: * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel * [2025-02-26] Add sbgemv_t_bfdot kernel + * [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 * Marek Michalowski <marek.michalowski@arm.com> * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c index 0751690fcd..fc4ae019e9 100644 --- a/kernel/arm64/sbgemv_t_bfdot.c +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -33,11 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <arm_neon.h> #include "common.h" -static inline float bf16_to_fp32(bfloat16 bf16) { - uint32_t fp32 = (uint32_t)bf16 << 16; - return *((float*)&fp32); -} - int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { if (m < 1 || n < 1) return(0); @@ -132,10 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat } for (; i < m; ++i) { - y0_ptr[iy] += alpha * a0_ptr[i] * x_ptr[i]; - y1_ptr[iy] += alpha * a1_ptr[i] * x_ptr[i]; - y2_ptr[iy] += alpha * a2_ptr[i] * x_ptr[i]; - y3_ptr[iy] += alpha * a3_ptr[i] * x_ptr[i]; + y0_ptr[iy] += alpha * vcvtah_f32_bf16(a0_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y1_ptr[iy] += alpha * vcvtah_f32_bf16(a1_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y2_ptr[iy] += alpha * vcvtah_f32_bf16(a2_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y3_ptr[iy] += alpha * vcvtah_f32_bf16(a3_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } iy += incy; @@ -177,7 +172,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat } for (; i < m; ++i) { - y_ptr[iy] += alpha * a_ptr[i] * x_ptr[i]; + y_ptr[iy] += alpha * vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } iy += incy; @@ -191,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat temp = 0.0; ix = 0; for (i = 0; i < m; i++) { - temp += bf16_to_fp32(a[i]) * bf16_to_fp32(x[ix]); + temp += vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[ix]); ix += incx; } if (beta == 0.0f) { From a7448970bed55e31543d21e64191412e7f20c2ab Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" <Shubham.Chaudhari@fujitsu.com> Date: Fri, 28 Feb 2025 13:10:40 +0530 Subject: [PATCH 4/5] Add thread throttling profile for DGEMV on NEOVERSEV1 --- interface/gemv.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index d031339463..360b82dcd2 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -89,6 +89,24 @@ static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { } #endif +//thread throttling for dgemv +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_dgemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { + + return + MN < 8100L ? 1 +: MN < 12100L ? MIN(ncpu, 2) +: MN < 36100L ? MIN(ncpu, 4) +: MN < 84100L ? MIN(ncpu, 8) +: MN < 348100L ? MIN(ncpu, 16) +: MN < 435600L ? MIN(ncpu, 24) +: MN < 810000L ? MIN(ncpu, 32) +: MN < 1050625 ? MIN(ncpu, 40) +: ncpu; + +} +#endif + static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); #if defined(_WIN64) && defined(_M_ARM64) @@ -98,6 +116,8 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { #endif #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); +#elif defined(NEOVERSEV1) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) + return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) From d7a2b6b084ff71c25483a0e83e8de8fa48e4ee21 Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" <Shubham.Chaudhari@fujitsu.com> Date: Tue, 4 Mar 2025 16:08:55 +0530 Subject: [PATCH 5/5] Add thread throttling for dynamic arch neoversev1 --- interface/gemv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index 360b82dcd2..22409649e4 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -127,6 +127,12 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { if (strcmp(gotoblas_corename(), "neoversev2") == 0) { return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); } +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); + } + + #endif if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD )