@@ -66,9 +66,9 @@ public static void Shuffle4Reduce(
         ref Span<float> destination,
         [ConstantExpected] byte control)
     {
-        if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
-            (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
-            Vector128.IsHardwareAccelerated)
+        if (Vector512.IsHardwareAccelerated ||
+            Vector256.IsHardwareAccelerated ||
+            Vector128.IsHardwareAccelerated)
         {
             int remainder = 0;
             if (Vector512.IsHardwareAccelerated)
@@ -112,9 +112,9 @@ public static void Shuffle4Reduce(
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
-            (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
-            (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
+        if (Vector512.IsHardwareAccelerated ||
+            Vector256.IsHardwareAccelerated ||
+            Vector128.IsHardwareAccelerated)
         {
             int remainder = 0;
             if (Vector512.IsHardwareAccelerated)
@@ -158,7 +158,7 @@ public static void Shuffle3Reduce(
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
+        if (Vector128.IsHardwareAccelerated)
         {
             int remainder = source.Length % (Vector128<byte>.Count * 3);
 
@@ -190,7 +190,7 @@ public static void Pad3Shuffle4Reduce(
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+        if (Vector128.IsHardwareAccelerated)
         {
             int remainder = source.Length % (Vector128<byte>.Count * 3);
 
@@ -223,7 +223,7 @@ public static void Shuffle4Slice3Reduce(
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+        if (Vector128.IsHardwareAccelerated)
         {
             int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
 
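The `// bit-hack for modulo` above relies on the identity `x % n == x & (n - 1)`, valid whenever `n` is a power of two; here `Vector128<byte>.Count * 4` is 64. A standalone illustration (not part of the patch):

```csharp
using System.Diagnostics;
using System.Runtime.Intrinsics;

// For a power-of-two n, n - 1 is a mask of the low bits, so x & (n - 1)
// keeps exactly the remainder of x / n, replacing a division with one AND.
int n = Vector128<byte>.Count * 4;                 // 16 bytes per Vector128 * 4 = 64
int length = 1000;
Debug.Assert(length % n == (length & (n - 1)));    // 1000 % 64 == 1000 & 63 == 40
```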
@@ -249,7 +249,7 @@ private static void Shuffle4(
         Span<float> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
+        if (Vector512.IsHardwareAccelerated)
         {
             ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
             ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@@ -277,7 +277,7 @@ private static void Shuffle4(
                 }
             }
         }
-        else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
+        else if (Vector256.IsHardwareAccelerated)
         {
             ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
             ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -341,7 +341,7 @@ private static void Shuffle4(
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
+        if (Vector512.IsHardwareAccelerated)
         {
             Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
             Shuffle.MMShuffleSpan(ref temp, control);
@@ -373,8 +373,13 @@ private static void Shuffle4(
                 }
             }
         }
-        else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
+        else if (Vector256.IsHardwareAccelerated)
         {
+            // ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
+            // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
+            // so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
+            // for indexing within each lane, and ignores the upper bits unless bit 7 is set,
+            // this usage is guaranteed to remain within-lane and non-zeroing.
             Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
             Shuffle.MMShuffleSpan(ref temp, control);
             Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@@ -391,21 +396,21 @@ private static void Shuffle4(
                 ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
                 ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
 
-                vd0 = Vector256_.ShuffleNative(vs0, mask);
-                Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
-                Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
-                Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
+                vd0 = Vector256_.ShufflePerLane(vs0, mask);
+                Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
+                Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
+                Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
             }
 
             if (m > 0)
             {
                 for (nuint i = u; i < n; i++)
                 {
-                    Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
+                    Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
                 }
             }
         }
-        else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
+        else if (Vector128.IsHardwareAccelerated)
        {
             Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
             Shuffle.MMShuffleSpan(ref temp, control);
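The comment introduced in the Vector256 branch above explains why `ShufflePerLane` is safe with the masks `MMShuffleSpan` produces. A minimal sketch of how such a helper could be shaped (a hypothetical stand-in, not ImageSharp's actual `Vector256_` source), assuming .NET 7+ intrinsics:

```csharp
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class Vector256Sketch
{
    // Shuffles bytes within each 128-bit lane, mirroring vpshufb semantics for
    // masks that never set bit 7 (so the zeroing path never triggers here).
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<byte> ShufflePerLane(Vector256<byte> source, Vector256<byte> indices)
    {
        if (Avx2.IsSupported)
        {
            // vpshufb reads only the low 4 bits of each index byte within its lane.
            return Avx2.Shuffle(source, indices);
        }

        // Software fallback: shuffle each 128-bit half independently.
        // Vector128.Shuffle zeroes out-of-range indices, so reduce them modulo 16
        // to reproduce vpshufb's in-lane indexing.
        Vector128<byte> low = Vector128.Shuffle(
            source.GetLower(), indices.GetLower() & Vector128.Create((byte)0x0F));
        Vector128<byte> high = Vector128.Shuffle(
            source.GetUpper(), indices.GetUpper() & Vector128.Create((byte)0x0F));
        return Vector256.Create(low, high);
    }
}
```

On AVX2 hardware this compiles down to a single vpshufb; the fallback masks each index modulo 16 so both paths stay within their own 128-bit lane, matching the guarantee described in the comment.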
@@ -445,9 +450,7 @@ private static void Shuffle3(
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated &&
-            Vector128_.SupportsShuffleNativeByte &&
-            Vector128_.SupportsAlignRight)
+        if (Vector128.IsHardwareAccelerated)
         {
             Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
             Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@@ -507,10 +510,7 @@ private static void Pad3Shuffle4(
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated &&
-            Vector128_.SupportsShuffleNativeByte &&
-            Vector128_.SupportsShiftByte &&
-            Vector128_.SupportsAlignRight)
+        if (Vector128.IsHardwareAccelerated)
         {
             Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
             Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@@ -553,10 +553,7 @@ private static void Shuffle4Slice3(
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated &&
-            Vector128_.SupportsShuffleNativeByte &&
-            Vector128_.SupportsShiftByte &&
-            Vector128_.SupportsAlignRight)
+        if (Vector128.IsHardwareAccelerated)
        {
             Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
             Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);