Skip to content

Commit 166a846

Browse files
Merge pull request #2933 from SixLabors/js/webp-arm
Add ARM support to WEBP Utilities
2 parents c82cb24 + 221aa80 commit 166a846

File tree

13 files changed

+2451
-1407
lines changed

13 files changed

+2451
-1407
lines changed

src/ImageSharp/Common/Helpers/Numerics.cs

Lines changed: 0 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -884,23 +884,6 @@ public static void Accumulate(ref Vector<uint> accumulator, Vector<byte> values)
884884
accumulator += intHigh;
885885
}
886886

887-
/// <summary>
888-
/// Reduces elements of the vector into one sum.
889-
/// </summary>
890-
/// <param name="accumulator">The accumulator to reduce.</param>
891-
/// <returns>The sum of all elements.</returns>
892-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
893-
public static int ReduceSum(Vector128<int> accumulator)
894-
{
895-
// Add odd to even.
896-
Vector128<int> vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01));
897-
898-
// Add high to low.
899-
vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
900-
901-
return Sse2.ConvertToInt32(vsum);
902-
}
903-
904887
/// <summary>
905888
/// Reduces elements of the vector into one sum.
906889
/// </summary>

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 27 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ public static void Shuffle4Reduce(
6666
ref Span<float> destination,
6767
[ConstantExpected] byte control)
6868
{
69-
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
70-
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
71-
Vector128.IsHardwareAccelerated)
69+
if (Vector512.IsHardwareAccelerated ||
70+
Vector256.IsHardwareAccelerated ||
71+
Vector128.IsHardwareAccelerated)
7272
{
7373
int remainder = 0;
7474
if (Vector512.IsHardwareAccelerated)
@@ -112,9 +112,9 @@ public static void Shuffle4Reduce(
112112
ref Span<byte> destination,
113113
[ConstantExpected] byte control)
114114
{
115-
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
116-
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
117-
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
115+
if (Vector512.IsHardwareAccelerated ||
116+
Vector256.IsHardwareAccelerated ||
117+
Vector128.IsHardwareAccelerated)
118118
{
119119
int remainder = 0;
120120
if (Vector512.IsHardwareAccelerated)
@@ -158,7 +158,7 @@ public static void Shuffle3Reduce(
158158
ref Span<byte> destination,
159159
[ConstantExpected] byte control)
160160
{
161-
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
161+
if (Vector128.IsHardwareAccelerated)
162162
{
163163
int remainder = source.Length % (Vector128<byte>.Count * 3);
164164

@@ -190,7 +190,7 @@ public static void Pad3Shuffle4Reduce(
190190
ref Span<byte> destination,
191191
[ConstantExpected] byte control)
192192
{
193-
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
193+
if (Vector128.IsHardwareAccelerated)
194194
{
195195
int remainder = source.Length % (Vector128<byte>.Count * 3);
196196

@@ -223,7 +223,7 @@ public static void Shuffle4Slice3Reduce(
223223
ref Span<byte> destination,
224224
[ConstantExpected] byte control)
225225
{
226-
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
226+
if (Vector128.IsHardwareAccelerated)
227227
{
228228
int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
229229

@@ -249,7 +249,7 @@ private static void Shuffle4(
249249
Span<float> destination,
250250
[ConstantExpected] byte control)
251251
{
252-
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
252+
if (Vector512.IsHardwareAccelerated)
253253
{
254254
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
255255
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@@ -277,7 +277,7 @@ private static void Shuffle4(
277277
}
278278
}
279279
}
280-
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
280+
else if (Vector256.IsHardwareAccelerated)
281281
{
282282
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
283283
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -341,7 +341,7 @@ private static void Shuffle4(
341341
Span<byte> destination,
342342
[ConstantExpected] byte control)
343343
{
344-
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
344+
if (Vector512.IsHardwareAccelerated)
345345
{
346346
Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
347347
Shuffle.MMShuffleSpan(ref temp, control);
@@ -373,8 +373,13 @@ private static void Shuffle4(
373373
}
374374
}
375375
}
376-
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
376+
else if (Vector256.IsHardwareAccelerated)
377377
{
378+
// ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
379+
// MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
380+
// so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
381+
// for indexing within each lane, and ignores the upper bits unless bit 7 is set,
382+
// this usage is guaranteed to remain within-lane and non-zeroing.
378383
Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
379384
Shuffle.MMShuffleSpan(ref temp, control);
380385
Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@@ -391,21 +396,21 @@ private static void Shuffle4(
391396
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
392397
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
393398

394-
vd0 = Vector256_.ShuffleNative(vs0, mask);
395-
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
396-
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
397-
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
399+
vd0 = Vector256_.ShufflePerLane(vs0, mask);
400+
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
401+
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
402+
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
398403
}
399404

400405
if (m > 0)
401406
{
402407
for (nuint i = u; i < n; i++)
403408
{
404-
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
409+
Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
405410
}
406411
}
407412
}
408-
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
413+
else if (Vector128.IsHardwareAccelerated)
409414
{
410415
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
411416
Shuffle.MMShuffleSpan(ref temp, control);
@@ -445,9 +450,7 @@ private static void Shuffle3(
445450
Span<byte> destination,
446451
[ConstantExpected] byte control)
447452
{
448-
if (Vector128.IsHardwareAccelerated &&
449-
Vector128_.SupportsShuffleNativeByte &&
450-
Vector128_.SupportsAlignRight)
453+
if (Vector128.IsHardwareAccelerated)
451454
{
452455
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
453456
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@@ -507,10 +510,7 @@ private static void Pad3Shuffle4(
507510
Span<byte> destination,
508511
[ConstantExpected] byte control)
509512
{
510-
if (Vector128.IsHardwareAccelerated &&
511-
Vector128_.SupportsShuffleNativeByte &&
512-
Vector128_.SupportsShiftByte &&
513-
Vector128_.SupportsAlignRight)
513+
if (Vector128.IsHardwareAccelerated)
514514
{
515515
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
516516
Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@@ -553,10 +553,7 @@ private static void Shuffle4Slice3(
553553
Span<byte> destination,
554554
[ConstantExpected] byte control)
555555
{
556-
if (Vector128.IsHardwareAccelerated &&
557-
Vector128_.SupportsShuffleNativeByte &&
558-
Vector128_.SupportsShiftByte &&
559-
Vector128_.SupportsAlignRight)
556+
if (Vector128.IsHardwareAccelerated)
560557
{
561558
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
562559
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);

0 commit comments

Comments
 (0)