From f5e548f24500fdee996a247837197caff795414d Mon Sep 17 00:00:00 2001 From: usamoi Date: Thu, 28 Aug 2025 10:47:47 +0800 Subject: [PATCH] feat: ppc64le simd Signed-off-by: usamoi --- crates/simd/src/bit.rs | 22 +++++--- crates/simd/src/f16.rs | 68 ++++++++++++++++------ crates/simd/src/f32.rs | 48 ++++++++++------ crates/simd/src/fast_scan.rs | 96 +++++++++++++++++++++++++++++--- crates/simd/src/fht.rs | 6 +- crates/simd/src/lib.rs | 44 +++++++++++++++ crates/simd/src/quantize.rs | 2 +- crates/simd/src/rotate.rs | 8 ++- crates/simd/src/u8.rs | 6 +- crates/simd_macros/src/target.rs | 29 ++++++++++ 10 files changed, 270 insertions(+), 59 deletions(-) diff --git a/crates/simd/src/bit.rs b/crates/simd/src/bit.rs index 7538f7f8..7bc4305a 100644 --- a/crates/simd/src/bit.rs +++ b/crates/simd/src/bit.rs @@ -183,7 +183,7 @@ mod reduce_sum_of_and { } } - #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_and(lhs: &[u64], rhs: &[u64]) -> u32 { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -366,7 +366,7 @@ mod reduce_sum_of_or { } } - #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_or(lhs: &[u64], rhs: &[u64]) -> u32 { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -549,7 +549,7 @@ mod reduce_sum_of_xor { } } - #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_xor(lhs: &[u64], rhs: &[u64]) -> u32 { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); 
@@ -772,7 +772,7 @@ mod reduce_sum_of_and_or { } } - #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_and_or(lhs: &[u64], rhs: &[u64]) -> (u32, u32) { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -933,7 +933,7 @@ mod reduce_sum_of_x { } } - #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4:avx512vpopcntdq", @"v4", @"v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_x(this: &[u64]) -> u32 { let n = this.len(); let mut sum = 0; @@ -950,7 +950,9 @@ pub fn vector_and(lhs: &[u64], rhs: &[u64]) -> Vec { } mod vector_and { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_and(lhs: &[u64], rhs: &[u64]) -> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -973,7 +975,9 @@ pub fn vector_or(lhs: &[u64], rhs: &[u64]) -> Vec { } mod vector_or { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_or(lhs: &[u64], rhs: &[u64]) -> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -996,7 +1000,9 @@ pub fn vector_xor(lhs: &[u64], rhs: &[u64]) -> Vec { } mod vector_xor { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_xor(lhs: &[u64], rhs: &[u64]) -> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); diff --git a/crates/simd/src/f16.rs 
b/crates/simd/src/f16.rs index c34f12aa..eeaef4ff 100644 --- a/crates/simd/src/f16.rs +++ b/crates/simd/src/f16.rs @@ -161,7 +161,9 @@ impl Floating for f16 { mod reduce_or_of_is_zero_x { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_or_of_is_zero_x(this: &[f16]) -> bool { for &x in this { if x == f16::ZERO { @@ -177,7 +179,9 @@ mod reduce_sum_of_x { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_sum_of_x(this: &[f16]) -> f32 { let n = this.len(); let mut x = 0.0f32; @@ -193,7 +197,9 @@ mod reduce_sum_of_abs_x { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_sum_of_abs_x(this: &[f16]) -> f32 { let n = this.len(); let mut x = 0.0f32; @@ -209,7 +215,9 @@ mod reduce_sum_of_x2 { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_sum_of_x2(this: &[f16]) -> f32 { let n = this.len(); let mut x2 = 0.0f32; @@ -225,7 +233,9 @@ mod reduce_min_max_of_x { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_min_max_of_x(this: &[f16]) -> (f32, f32) { let mut min = f32::INFINITY; let mut max = f32::NEG_INFINITY; @@ -501,7 +511,7 @@ mod reduce_sum_of_xy { } } - #[crate::multiversion(@"v4:avx512fp16", @"v4", @"v3", #[cfg(target_endian = 
"little")] @"a3.512", @"a2:fp16", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4:avx512fp16", @"v4", @"v3", #[cfg(target_endian = "little")] @"a3.512", @"a2:fp16", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_xy(lhs: &[f16], rhs: &[f16]) -> f32 { assert!(lhs.len() == rhs.len()); let n = lhs.len(); @@ -784,7 +794,7 @@ mod reduce_sum_of_d2 { } } - #[crate::multiversion(@"v4:avx512fp16", @"v4", @"v3", #[cfg(target_endian = "little")] @"a3.512", @"a2:fp16", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4:avx512fp16", @"v4", @"v3", #[cfg(target_endian = "little")] @"a3.512", @"a2:fp16", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_d2(lhs: &[f16], rhs: &[f16]) -> f32 { assert!(lhs.len() == rhs.len()); let n = lhs.len(); @@ -803,7 +813,9 @@ mod reduce_sum_of_xy_sparse { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_sum_of_xy_sparse(lidx: &[u32], lval: &[f16], ridx: &[u32], rval: &[f16]) -> f32 { use std::cmp::Ordering; assert_eq!(lidx.len(), lval.len()); @@ -836,7 +848,9 @@ mod reduce_sum_of_d2_sparse { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_sum_of_d2_sparse(lidx: &[u32], lval: &[f16], ridx: &[u32], rval: &[f16]) -> f32 { use std::cmp::Ordering; assert_eq!(lidx.len(), lval.len()); @@ -875,7 +889,9 @@ mod reduce_sum_of_d2_sparse { mod vector_add { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_add(lhs: &[f16], rhs: &[f16]) -> Vec { assert_eq!(lhs.len(), 
rhs.len()); let n = lhs.len(); @@ -895,7 +911,9 @@ mod vector_add { mod vector_add_inplace { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_add_inplace(lhs: &mut [f16], rhs: &[f16]) { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -908,7 +926,9 @@ mod vector_add_inplace { mod vector_sub { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_sub(lhs: &[f16], rhs: &[f16]) -> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -928,7 +948,9 @@ mod vector_sub { mod vector_mul { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_mul(lhs: &[f16], rhs: &[f16]) -> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -948,7 +970,9 @@ mod vector_mul { mod vector_mul_scalar { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_mul_scalar(lhs: &[f16], rhs: f32) -> Vec { let rhs = f16::from_f32(rhs); let n = lhs.len(); @@ -968,7 +992,9 @@ mod vector_mul_scalar { mod vector_mul_scalar_inplace { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_mul_scalar_inplace(lhs: &mut [f16], rhs: f32) { let rhs = f16::from_f32(rhs); let n = lhs.len(); @@ -981,7 +1007,9 @@ mod vector_mul_scalar_inplace { mod vector_abs_inplace { use 
super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_abs_inplace(this: &mut [f16]) { let n = this.len(); for i in 0..n { @@ -993,7 +1021,9 @@ mod vector_abs_inplace { mod vector_from_f32 { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_from_f32(this: &[f32]) -> Vec { let n = this.len(); let mut r = Vec::::with_capacity(n); @@ -1012,7 +1042,9 @@ mod vector_from_f32 { mod vector_to_f32 { use super::*; - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_to_f32(this: &[f16]) -> Vec { let n = this.len(); let mut r = Vec::::with_capacity(n); diff --git a/crates/simd/src/f32.rs b/crates/simd/src/f32.rs index 01136328..13aae3da 100644 --- a/crates/simd/src/f32.rs +++ b/crates/simd/src/f32.rs @@ -147,7 +147,9 @@ impl Floating for f32 { } mod reduce_or_of_is_zero_x { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn reduce_or_of_is_zero_x(this: &[f32]) -> bool { for &x in this { if x == 0.0f32 { @@ -411,7 +413,7 @@ mod reduce_sum_of_x { } } - #[crate::multiversion(@"v4", @"v3", @"v2", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_x(this: &[f32]) -> f32 { let n = this.len(); let mut sum = 0.0f32; @@ -686,7 +688,7 @@ mod reduce_sum_of_abs_x { 
} } - #[crate::multiversion(@"v4", @"v3", @"v2", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_abs_x(this: &[f32]) -> f32 { let n = this.len(); let mut sum = 0.0f32; @@ -951,7 +953,7 @@ mod reduce_sum_of_x2 { } } - #[crate::multiversion(@"v4", @"v3", @"v2:fma", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2:fma", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_x2(this: &[f32]) -> f32 { let n = this.len(); let mut x2 = 0.0f32; @@ -1222,7 +1224,7 @@ mod reduce_min_max_of_x { } } - #[crate::multiversion(@"v4", @"v3", @"v2", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_min_max_of_x(this: &[f32]) -> (f32, f32) { let mut min = f32::INFINITY; let mut max = f32::NEG_INFINITY; @@ -1539,7 +1541,7 @@ mod reduce_sum_of_xy { } } - #[crate::multiversion(@"v4", @"v3", @"v2:fma", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2:fma", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_xy(lhs: &[f32], rhs: &[f32]) -> f32 { assert!(lhs.len() == rhs.len()); let n = lhs.len(); @@ -1864,7 +1866,7 @@ mod reduce_sum_of_d2 { } } - #[crate::multiversion(@"v4", @"v3", @"v2:fma", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2:fma", #[cfg(target_endian = "little")] @"a3.256", @"a2", "z17", 
"z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_d2(lhs: &[f32], rhs: &[f32]) -> f32 { assert!(lhs.len() == rhs.len()); let n = lhs.len(); @@ -1966,7 +1968,7 @@ mod reduce_sum_of_xy_sparse { } } - #[crate::multiversion(@"v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_xy_sparse(lidx: &[u32], lval: &[f32], ridx: &[u32], rval: &[f32]) -> f32 { use std::cmp::Ordering; assert_eq!(lidx.len(), lval.len()); @@ -2116,7 +2118,7 @@ mod reduce_sum_of_d2_sparse { } } - #[crate::multiversion(@"v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_d2_sparse(lidx: &[u32], lval: &[f32], ridx: &[u32], rval: &[f32]) -> f32 { use std::cmp::Ordering; assert_eq!(lidx.len(), lval.len()); @@ -2153,7 +2155,9 @@ mod reduce_sum_of_d2_sparse { } mod vector_add { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_add(lhs: &[f32], rhs: &[f32]) -> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -2171,7 +2175,9 @@ mod vector_add { } mod vector_add_inplace { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_add_inplace(lhs: &mut [f32], rhs: &[f32]) { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -2182,7 +2188,9 @@ mod vector_add_inplace { } mod vector_sub { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_sub(lhs: &[f32], rhs: &[f32]) 
-> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -2200,7 +2208,9 @@ mod vector_sub { } mod vector_mul { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_mul(lhs: &[f32], rhs: &[f32]) -> Vec { assert_eq!(lhs.len(), rhs.len()); let n = lhs.len(); @@ -2218,7 +2228,9 @@ mod vector_mul { } mod vector_mul_scalar { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_mul_scalar(lhs: &[f32], rhs: f32) -> Vec { let n = lhs.len(); let mut r = Vec::::with_capacity(n); @@ -2235,7 +2247,9 @@ mod vector_mul_scalar { } mod vector_mul_scalar_inplace { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_mul_scalar_inplace(lhs: &mut [f32], rhs: f32) { let n = lhs.len(); for i in 0..n { @@ -2245,7 +2259,9 @@ mod vector_mul_scalar_inplace { } mod vector_abs_inplace { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn vector_abs_inplace(this: &mut [f32]) { let n = this.len(); for i in 0..n { diff --git a/crates/simd/src/fast_scan.rs b/crates/simd/src/fast_scan.rs index 2a3032a2..59040fa9 100644 --- a/crates/simd/src/fast_scan.rs +++ b/crates/simd/src/fast_scan.rs @@ -456,12 +456,14 @@ mod scan { let chi = vec_srl(code, vec_splat_u8::<4>()); let lut: u8x16 = vec_xl((i as isize) * 16, lut.as_ptr().cast()); - let res_lo = vec_revb(transmute::(vec_perm(lut, lut, clo))); + let res_lo_r = transmute::(vec_perm(lut, lut, clo)); + let res_lo = vec_revb(res_lo_r); accu_0 = 
vec_add(accu_0, res_lo);
-                accu_1 = vec_add(accu_1, vec_and(vec_rli(res_lo, 8), _00ff_u16x8));
-                let res_hi = vec_revb(transmute::(vec_perm(lut, lut, chi)));
+                accu_1 = vec_add(accu_1, vec_and(res_lo_r, _00ff_u16x8));
+                let res_hi_r = transmute::(vec_perm(lut, lut, chi));
+                let res_hi = vec_revb(res_hi_r);
                 accu_2 = vec_add(accu_2, res_hi);
-                accu_3 = vec_add(accu_3, vec_and(vec_rli(res_hi, 8), _00ff_u16x8));
+                accu_3 = vec_add(accu_3, vec_and(res_hi_r, _00ff_u16x8));
 
                 i += 1;
             }
@@ -469,11 +471,11 @@ mod scan {
 
             let mut result = [0_u16; 32];
 
-            accu_0 = vec_sub(accu_0, vec_and(vec_rli(accu_1, 120), _ff00_u16x8));
+            accu_0 = vec_sub(accu_0, vec_and(vec_revb(accu_1), _ff00_u16x8));
             vec_xst(accu_0, 0, result.as_mut_ptr().cast());
             vec_xst(accu_1, 16, result.as_mut_ptr().cast());
 
-            accu_2 = vec_sub(accu_2, vec_and(vec_rli(accu_3, 120), _ff00_u16x8));
+            accu_2 = vec_sub(accu_2, vec_and(vec_revb(accu_3), _ff00_u16x8));
             vec_xst(accu_2, 32, result.as_mut_ptr().cast());
             vec_xst(accu_3, 48, result.as_mut_ptr().cast());
 
@@ -503,7 +505,103 @@ mod scan {
         }
     }
 
-    #[crate::multiversion(@"v4", @"v3", @"v2", @"a2", @"z13")]
+    /// POWER7 (AltiVec + VSX) kernel behind `scan`.
+    ///
+    /// For each row `i`, the low nibbles (`clo`) and high nibbles (`chi`) of `code[i]` select
+    /// bytes of `lut[i]` through `vec_perm`, and the selected bytes are summed over all rows
+    /// with wrapping 16-bit adds. `result[0..8]` / `result[8..16]` hold the low-nibble sums for
+    /// even / odd byte positions, `result[16..24]` / `result[24..32]` the high-nibble sums.
+    ///
+    /// Little-endian only: each selected byte pair is reinterpreted as a native `u16` lane and
+    /// the odd byte is taken with `>> 8`. On big-endian powerpc64 the even byte would occupy
+    /// the upper half instead, swapping the even/odd outputs, so the kernel is compiled and
+    /// dispatched only when `target_endian = "little"`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `code.len() != lut.len()`.
+    #[cfg(all(target_arch = "powerpc64", target_endian = "little"))]
+    #[crate::target_cpu(enable = "p7")]
+    fn scan_p7(code: &[[u8; 16]], lut: &[[u8; 16]]) -> [u16; 32] {
+        unsafe {
+            // bounds checking is not enforced by compiler, so check it manually
+            assert_eq!(code.len(), lut.len());
+            let n = code.len();
+
+            use std::arch::powerpc64::*;
+            use std::mem::transmute;
+            use {vector_unsigned_char as u8x16, vector_unsigned_short as u16x8};
+
+            let _0008_u16x8 = vec_splat_u16::<0x0008>();
+
+            let mut accu_0 = vec_splat_u16::<0>();
+            let mut accu_1 = vec_splat_u16::<0>();
+            let mut accu_2 = vec_splat_u16::<0>();
+            let mut accu_3 = vec_splat_u16::<0>();
+
+            let mut i = 0_usize;
+            while i < n {
+                let code: u8x16 = vec_xl((i as isize) * 16, code.as_ptr().cast::<u8>());
+
+                // NOTE(review): `vec_srl` shifts the register as a whole, so the upper bits of
+                // `chi` carry neighbouring nibbles; this relies on `vec_perm` reading only the
+                // low index bits and on both table operands being `lut` - confirm.
+                let clo = vec_and(code, vec_splat_u8::<0xf>());
+                let chi = vec_srl(code, vec_splat_u8::<4>());
+
+                let lut: u8x16 = vec_xl((i as isize) * 16, lut.as_ptr().cast::<u8>());
+                // lane = even byte | odd byte << 8: accu_0/accu_2 collect both halves,
+                // accu_1/accu_3 only the odd bytes
+                let res_lo = transmute::<u8x16, u16x8>(vec_perm(lut, lut, clo));
+                accu_0 = vec_add(accu_0, res_lo);
+                accu_1 = vec_add(accu_1, vec_sr(res_lo, _0008_u16x8));
+                let res_hi = transmute::<u8x16, u16x8>(vec_perm(lut, lut, chi));
+                accu_2 = vec_add(accu_2, res_hi);
+                accu_3 = vec_add(accu_3, vec_sr(res_hi, _0008_u16x8));
+
+                i += 1;
+            }
+            debug_assert_eq!(i, n);
+
+            let mut result = [0_u16; 32];
+
+            // subtract 256 * (odd-byte sum) so accu_0/accu_2 keep only the even-byte sums
+            accu_0 = vec_sub(accu_0, vec_sl(accu_1, _0008_u16x8));
+            vec_xst(accu_0, 0, result.as_mut_ptr().cast());
+            vec_xst(accu_1, 16, result.as_mut_ptr().cast());
+
+            accu_2 = vec_sub(accu_2, vec_sl(accu_3, _0008_u16x8));
+            vec_xst(accu_2, 32, result.as_mut_ptr().cast());
+            vec_xst(accu_3, 48, result.as_mut_ptr().cast());
+
+            result
+        }
+    }
+
+    /// Compares `scan_p7` with `fallback` on random inputs of 90..110 rows.
+    #[cfg(all(target_arch = "powerpc64", target_endian = "little", test, not(miri)))]
+    #[test]
+    fn scan_p7_test() {
+        if !crate::is_cpu_detected!("p7") {
+            println!("test {} ... skipped (p7)", module_path!());
+            return;
+        }
+        for _ in 0..if cfg!(not(miri)) { 256 } else { 1 } {
+            for n in 90..110 {
+                let code = (0..n)
+                    .map(|_| std::array::from_fn(|_| rand::random()))
+                    .collect::<Vec<[u8; 16]>>();
+                let lut = (0..n)
+                    .map(|_| std::array::from_fn(|_| rand::random()))
+                    .collect::<Vec<[u8; 16]>>();
+                unsafe {
+                    assert_eq!(scan_p7(&code, &lut), fallback(&code, &lut));
+                }
+            }
+        }
+    }
+
+    #[crate::multiversion(@"v4", @"v3", @"v2", @"a2", @"z13", #[cfg(target_endian = "little")] @"p7")]
     pub fn scan(code: &[[u8; 16]], lut: &[[u8; 16]]) -> [u16; 32] {
         assert_eq!(code.len(), lut.len());
         let n = code.len();
@@ -563,7 +661,9 @@ pub fn scan(code: &[[u8; 16]], lut: &[[u8; 16]]) -> [u16; 32] {
 }
 
 mod accu {
-    #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")]
+    #[crate::multiversion(
+        "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7"
+    )]
     pub fn accu(sum: &mut [u32; 32], delta: &[u16; 32]) {
         for i in 0..32 {
             sum[i] += delta[i] as u32;
diff --git a/crates/simd/src/fht.rs b/crates/simd/src/fht.rs
index b45b13c8..84238b48 100644
--- a/crates/simd/src/fht.rs
+++ b/crates/simd/src/fht.rs
@@ -33,7 +33,7 @@ mod step_1 {
     seq_macro::seq!(
         Q in 0..16 {
             mod dispatch_~Q {
-                #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")]
+                #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")]
                 pub fn f(x: &mut [f32]) {
                     crate::fht::basic_1::(x);
                 }
@@ -48,7 +48,7 @@ mod step_2 {
     seq_macro::seq!(
         Q in 0..16 {
             mod dispatch_~Q {
-                #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")]
+                #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")]
                 pub fn f(x: &mut [f32]) {
                     crate::fht::basic_2::(x);
                 }
@@ -62,7 +62,7 @@ mod step_2 {
 macro_rules! 
fht {
     ($p:literal, 0) => {
         {
-            #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")]
+            #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")]
             fn walk(x: &mut [f32]) {
                 assert!(x.len() == (1 << $p));
                 seq_macro::seq!(
diff --git a/crates/simd/src/lib.rs b/crates/simd/src/lib.rs
index 90f2db6c..47600f7a 100644
--- a/crates/simd/src/lib.rs
+++ b/crates/simd/src/lib.rs
@@ -16,6 +16,9 @@
 #![cfg_attr(target_arch = "s390x", feature(stdarch_s390x_feature_detection))]
 #![cfg_attr(target_arch = "s390x", feature(s390x_target_feature))]
 #![cfg_attr(target_arch = "s390x", feature(stdarch_s390x))]
+#![cfg_attr(target_arch = "powerpc64", feature(stdarch_powerpc_feature_detection))]
+#![cfg_attr(target_arch = "powerpc64", feature(powerpc_target_feature))]
+#![cfg_attr(target_arch = "powerpc64", feature(stdarch_powerpc))]
 
 mod aligned;
 mod emulate;
@@ -73,6 +76,9 @@ mod internal {
     #[cfg(target_arch = "s390x")]
     simd_macros::define_is_cpu_detected!("s390x");
 
+    #[cfg(target_arch = "powerpc64")]
+    simd_macros::define_is_cpu_detected!("powerpc64"); // presumably defines `is_powerpc64_cpu_detected!`, re-exported below - confirm in simd_macros
+
     #[cfg(target_arch = "x86_64")]
     #[allow(unused_imports)]
     pub use is_x86_64_cpu_detected;
@@ -85,6 +91,10 @@ mod internal {
     #[allow(unused_imports)]
     pub use is_s390x_cpu_detected;
 
+    #[cfg(target_arch = "powerpc64")]
+    #[allow(unused_imports)]
+    pub use is_powerpc64_cpu_detected;
+
     #[cfg(target_arch = "x86_64")]
     pub fn is_v4_detected() -> bool {
         std::arch::is_x86_feature_detected!("avx512bw")
@@ -213,6 +223,32 @@ mod internal {
     pub fn is_z13_detected() -> bool {
         std::arch::is_s390x_feature_detected!("vector")
     }
+
+    #[cfg(target_arch = "powerpc64")]
+    pub fn is_p9_detected() -> bool { // "p9" level: true only if every feature and-ed below is detected at runtime
+        std::arch::is_powerpc64_feature_detected!("altivec")
+            && std::arch::is_powerpc64_feature_detected!("vsx")
+            && std::arch::is_powerpc64_feature_detected!("power8-altivec")
+            && std::arch::is_powerpc64_feature_detected!("power8-crypto")
+            && std::arch::is_powerpc64_feature_detected!("power8-vector")
+            && std::arch::is_powerpc64_feature_detected!("power9-altivec")
+            && std::arch::is_powerpc64_feature_detected!("power9-vector")
+    }
+
+    #[cfg(target_arch = "powerpc64")]
+    pub fn is_p8_detected() -> bool { // "p8" level: the p9 list without the two power9-* features
+        std::arch::is_powerpc64_feature_detected!("altivec")
+            && std::arch::is_powerpc64_feature_detected!("vsx")
+            && std::arch::is_powerpc64_feature_detected!("power8-altivec")
+            && std::arch::is_powerpc64_feature_detected!("power8-crypto")
+            && std::arch::is_powerpc64_feature_detected!("power8-vector")
+    }
+
+    #[cfg(target_arch = "powerpc64")]
+    pub fn is_p7_detected() -> bool { // "p7" level: only AltiVec and VSX are required
+        std::arch::is_powerpc64_feature_detected!("altivec")
+            && std::arch::is_powerpc64_feature_detected!("vsx")
+    }
 }
 
 pub use simd_macros::{multiversion, target_cpu};
@@ -229,6 +265,10 @@ pub use std::arch::is_aarch64_feature_detected as is_feature_detected;
 #[allow(unused_imports)]
 pub use std::arch::is_s390x_feature_detected as is_feature_detected;
 
+#[cfg(target_arch = "powerpc64")]
+#[allow(unused_imports)]
+pub use std::arch::is_powerpc64_feature_detected as is_feature_detected;
+
 #[cfg(target_arch = "x86_64")]
 #[allow(unused_imports)]
 pub use internal::is_x86_64_cpu_detected as is_cpu_detected;
@@ -240,3 +280,7 @@ pub use internal::is_aarch64_cpu_detected as is_cpu_detected;
 #[cfg(target_arch = "s390x")]
 #[allow(unused_imports)]
 pub use internal::is_s390x_cpu_detected as is_cpu_detected;
+
+#[cfg(target_arch = "powerpc64")]
+#[allow(unused_imports)]
+pub use internal::is_powerpc64_cpu_detected as is_cpu_detected;
diff --git a/crates/simd/src/quantize.rs b/crates/simd/src/quantize.rs
index eb87baaf..24e99830 100644
--- a/crates/simd/src/quantize.rs
+++ b/crates/simd/src/quantize.rs
@@ -283,7 +283,7 @@ mod mul_add_round {
         }
     }
 
-    #[crate::multiversion(@"v4", @"v3", @"v2:fma", @"a2", "z17", "z16", "z15", "z14", "z13")]
+    #[crate::multiversion(@"v4", @"v3", @"v2:fma", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")]
     pub fn 
mul_add_round(this: &[f32], k: f32, b: f32) -> Vec { let n = this.len(); let mut r = Vec::::with_capacity(n); diff --git a/crates/simd/src/rotate.rs b/crates/simd/src/rotate.rs index 9bf99ca7..795ea28c 100644 --- a/crates/simd/src/rotate.rs +++ b/crates/simd/src/rotate.rs @@ -13,7 +13,9 @@ // Copyright (c) 2025 TensorChord Inc. pub mod givens { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn givens(lhs: &mut [f32], rhs: &mut [f32]) { assert!(lhs.len() == rhs.len()); let n = lhs.len(); @@ -29,7 +31,9 @@ pub fn givens(lhs: &mut [f32], rhs: &mut [f32]) { } pub mod flip { - #[crate::multiversion("v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion( + "v4", "v3", "v2", "a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7" + )] pub fn flip(bits: &[u64; 1024], result: &mut [f32]) { use std::hint::select_unpredictable; let result: &mut [u32] = zerocopy::transmute_mut!(result); diff --git a/crates/simd/src/u8.rs b/crates/simd/src/u8.rs index b21145bd..628bedf3 100644 --- a/crates/simd/src/u8.rs +++ b/crates/simd/src/u8.rs @@ -311,7 +311,7 @@ mod reduce_sum_of_x_as_u32_y_as_u32 { } } - #[crate::multiversion(@"v4", @"v3", @"v2", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_x_as_u32_y_as_u32(s: &[u8], t: &[u8]) -> u32 { assert_eq!(s.len(), t.len()); let n = s.len(); @@ -517,7 +517,7 @@ mod reduce_sum_of_x_as_u16 { } } - #[crate::multiversion(@"v4", @"v3", @"v2", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_x_as_u16(this: &[u8]) -> u16 { let n = this.len(); let mut sum = 0; @@ -722,7 +722,7 @@ mod reduce_sum_of_x_as_u32 { } } - 
#[crate::multiversion(@"v4", @"v3", @"v2", @"a2", "z17", "z16", "z15", "z14", "z13")] + #[crate::multiversion(@"v4", @"v3", @"v2", @"a2", "z17", "z16", "z15", "z14", "z13", "p9", "p8", "p7")] pub fn reduce_sum_of_x_as_u32(this: &[u8]) -> u32 { let n = this.len(); let mut sum = 0; diff --git a/crates/simd_macros/src/target.rs b/crates/simd_macros/src/target.rs index 4030c5f4..ba94cf60 100644 --- a/crates/simd_macros/src/target.rs +++ b/crates/simd_macros/src/target.rs @@ -124,4 +124,33 @@ pub const TARGET_CPUS: &[TargetCpu] = &[ target_arch: "s390x", target_features: &["vector"], }, + TargetCpu { + target_cpu: "p9", + target_arch: "powerpc64", + target_features: &[ + "altivec", + "vsx", + "power8-altivec", + "power8-crypto", + "power8-vector", + "power9-altivec", + "power9-vector", + ], + }, + TargetCpu { + target_cpu: "p8", + target_arch: "powerpc64", + target_features: &[ + "altivec", + "vsx", + "power8-altivec", + "power8-crypto", + "power8-vector", + ], + }, + TargetCpu { + target_cpu: "p7", + target_arch: "powerpc64", + target_features: &["altivec", "vsx"], + }, ];