Skip to content

Commit

Permalink
update to latest
Browse files Browse the repository at this point in the history
  • Loading branch information
toxieainc committed Mar 14, 2024
1 parent 35deb1f commit 07ea26e
Showing 1 changed file with 76 additions and 43 deletions.
119 changes: 76 additions & 43 deletions ext/sse2neon.h
@@ -1,6 +1,30 @@
#ifndef SSE2NEON_H
#define SSE2NEON_H

/*
* sse2neon is freely redistributable under the MIT License.
*
* Copyright (c) 2015-2024 SSE2NEON Contributors.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

// This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
//
Expand Down Expand Up @@ -28,28 +52,6 @@
// Aymen Qader <aymen.qader@arm.com>
// Anthony Roberts <anthony.roberts@linaro.org>

/*
* sse2neon is freely redistributable under the MIT License.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

/* Tunable configurations */

/* Enable precise implementation of math operations
Expand Down Expand Up @@ -145,19 +147,6 @@
(defined(_M_ARM64) || defined(__arm64__))
#define SSE2NEON_HAS_BITSCAN64
#endif

#ifndef ARM64_SYSREG
/* Pack a system-register designator (op0, op1, crn, crm, op2) into a single
 * integer value. Field placement, as fixed by the masks and shifts below:
 *   bit  14     : op0 & 1
 *   bits 13..11 : op1 & 7
 *   bits 10..7  : crn & 15
 *   bits  6..3  : crm & 15
 *   bits  2..0  : op2 & 7
 * Each argument is parenthesized before masking so that an expression
 * argument (for example `a | b`) is masked as a whole instead of being
 * split apart by operator precedence.
 */
#define ARM64_SYSREG(op0, op1, crn, crm, op2) \
    ((((op0) & 1) << 14) |                    \
     (((op1) & 7) << 11) |                    \
     (((crn) & 15) << 7) |                    \
     (((crm) & 15) << 3) |                    \
     (((op2) & 7) << 0))
#endif

#ifndef ARM64_FPCR
// Designator for the floating point control register, produced by
// ARM64_SYSREG with op0=3, op1=3, crn=4, crm=4, op2=0. Only defined here
// when no earlier definition of ARM64_FPCR is already visible.
#define ARM64_FPCR ARM64_SYSREG(3, 3, 4, 4, 0) // Floating point control register (EL0)
#endif
#endif

#if defined(__GNUC__) || defined(__clang__)
Expand Down Expand Up @@ -1733,7 +1722,7 @@ FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV
return vreinterpretq_m128_f32(
vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
Expand Down Expand Up @@ -2310,6 +2299,10 @@ FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#if SSE2NEON_PRECISE_DIV
// Additional Newton-Raphson iteration for accuracy
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#endif
return vreinterpretq_m128_f32(recip);
}

Expand Down Expand Up @@ -2342,6 +2335,11 @@ FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)

out = vmulq_f32(
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#if SSE2NEON_PRECISE_SQRT
// Additional Newton-Raphson iteration for accuracy
out = vmulq_f32(
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#endif

// Set output vector element to infinity/negative-infinity if
// the corresponding input vector element is 0.0f/-0.0f.
Expand Down Expand Up @@ -2657,7 +2655,7 @@ FORCE_INLINE void _mm_lfence(void)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
Expand Down Expand Up @@ -8502,12 +8500,47 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
crc = __crc32cb(crc, v);
#else
crc ^= v;
for (int bit = 0; bit < 8; bit++) {
if (crc & 1)
crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
else
crc = (crc >> 1);
}
#if defined(__ARM_FEATURE_CRYPTO)
// Adapted from: https://mary.rs/lab/crc32/
// Barrett reduction
uint64x2_t orig =
vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
uint64x2_t tmp = orig;

// Polynomial P(x) of CRC32C
uint64_t p = 0x105EC76F1;
// Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
// 2^{64} / P(x) \rfloor = 0x11f91caf6
uint64_t mu = 0x1dea713f1;

// Multiply by mu_{64}
tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
// Divide by 2^{64} (mask away the unnecessary bits)
tmp =
vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
// Multiply by P(x) (shifted left by 1 for alignment reasons)
tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
// Subtract original from result
tmp = veorq_u64(tmp, orig);

// Extract the 'lower' (in bit-reflected sense) 32 bits
crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
#else // Fall back to the generic table lookup approach
// Adapted from: https://create.stephan-brumme.com/crc32/
// Apply half-byte comparison algorithm for the best ratio between
// performance and lookup table.

// The lookup table just needs to store every 16th entry
// of the standard look-up table.
static const uint32_t crc32_half_byte_tbl[] = {
0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
};

crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
#endif
#endif
return crc;
}
Expand Down

0 comments on commit 07ea26e

Please sign in to comment.