#include "simd_transform.h"
#if PG_ENABLE_ARM_NEON
// sse2neon.h is from here: https://github.com/DLTcollab/sse2neon
#include "include/sse2neon.h"
#endif /* PG_ENABLE_ARM_NEON */

/* This returns 1 when sse2 is available at runtime but support for it isn't
* compiled in, 0 in all other cases */
int
pg_sse2_at_runtime_but_uncompiled()
{
if (SDL_HasSSE2()) {
#ifdef __SSE2__
return 0;
#else
return 1;
#endif /* __SSE2__ */
}
return 0;
}

/* This returns 1 when neon is available at runtime but support for it isn't
* compiled in, 0 in all other cases */
int
pg_neon_at_runtime_but_uncompiled()
{
if (SDL_HasNEON()) {
#if PG_ENABLE_ARM_NEON
return 0;
#else
return 1;
#endif /* PG_ENABLE_ARM_NEON */
}
return 0;
}
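
/* Illustrative usage (not part of this file): a caller could combine the two
 * predicates above to warn when the CPU supports SIMD that this build cannot
 * use, for example:
 *
 *     if (pg_sse2_at_runtime_but_uncompiled() ||
 *         pg_neon_at_runtime_but_uncompiled()) {
 *         // suggest rebuilding with SIMD support for better performance
 *     }
 */
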
#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON))
// For some reason this is not defined on some non-Windows compilers
#define _pg_loadu_si32(p) _mm_cvtsi32_si128(*(unsigned int const *)(p))
#define _pg_loadu_si64(p) _mm_loadl_epi64((__m128i const *)(p))
#define _pg_storeu_si32(p, a) (void)(*(int *)(p) = _mm_cvtsi128_si32((a)))
#define _pg_storeu_si64(p, a) (_mm_storel_epi64((__m128i *)(p), (a)))
#if defined(ENV64BIT)
#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num)
#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(reg)
#else
#define LOAD_64_INTO_M128(num, reg) \
*reg = _mm_loadl_epi64((const __m128i *)num)
#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, reg)
#endif
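
/* Note on the helpers above: _mm_loadu_si32 and friends are missing from some
 * compilers (as the comment above says), so the _pg_ wrappers are built from
 * SSE2 intrinsics that are always available. LOAD_64_INTO_M128 and
 * STORE_M128_INTO_64 go through a general purpose register with
 * _mm_cvtsi64_si128/_mm_cvtsi128_si64 on 64-bit builds, where those
 * intrinsics exist, and fall back to 64-bit vector loads/stores on 32-bit
 * builds. */
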
void
filter_shrink_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
int dstpitch, int srcwidth, int dstwidth)
{
    // This filter can't run through multiple pixels in a row at once, since
    // it accumulates then writes -- the pixels in a row are not independent.
    // However, this accumulate/write cycle is the same for each row, so
    // multiple rows can be run at once, saving about 37% X-shrink runtime
    // in my testing.

    // srcdiff and dstdiff need to skip the remaining bytes in a row, plus
    // the entire next row, because of the alternating-row strategy. The
    // redundancy in the equations makes them clearer, so I left it in.
int srcdiff = srcpitch - (srcwidth * 4) + srcpitch;
int dstdiff = dstpitch - (dstwidth * 4) + dstpitch;
Uint8 *srcpix2 = srcpix + srcpitch;
Uint8 *dstpix2 = dstpix + dstpitch;
int x, y;
__m128i src, src2, dst, accumulate, mm_xcounter, mm_xfrac;
int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
__m128i xrecip = _mm_set1_epi16((Uint16)(0x40000000 / xspace));
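    // Fixed-point bookkeeping (a reading of the math, not the original
    // author's words): xspace is srcwidth/dstwidth scaled by 0x4000 (2^14),
    // i.e. how much "source" each destination pixel consumes. xrecip is
    // roughly 2^30/xspace, so an _mm_mulhi_epu16 by xrecip (a >>16 after the
    // multiply) divides an accumulated value by xspace/0x4000, turning the
    // weighted sum back into an average.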
for (y = 0; y < height; y += 2) {
accumulate = _mm_setzero_si128();
int xcounter = xspace;
        // Prevent overwriting past the final row of pixels when the surface
        // height is odd
if (y == height - 1) {
srcpix2 = srcpix;
dstpix2 = dstpix;
}
for (x = 0; x < srcwidth; x++) {
if (xcounter > 0x04000) {
// Load a pixel from two separate lines at once
// Unpack RGBA into 16 bit lanes
src = _mm_unpacklo_epi8(_pg_loadu_si32(srcpix),
_mm_setzero_si128());
src2 = _mm_unpacklo_epi8(_pg_loadu_si32(srcpix2),
_mm_setzero_si128());
// Combine the two expanded pixels
src = _mm_unpacklo_epi64(src, src2);
// Accumulate[127:64] tracks the srcpix2 pixel line,
// [63:0] tracks the srcpix pixel line
accumulate = _mm_add_epi16(accumulate, src);
srcpix += 4;
srcpix2 += 4;
xcounter -= 0x04000;
}
/* write out a destination pixel */
else {
int xfrac = 0x04000 - xcounter;
// Broadcast variables into intrinsics
mm_xcounter = _mm_set1_epi16(xcounter);
mm_xfrac = _mm_set1_epi16(xfrac);
// Load a pixel from two separate lines at once
// Unpack RGBA into 16 bit lanes
src = _mm_unpacklo_epi8(_pg_loadu_si32(srcpix),
_mm_setzero_si128());
src2 = _mm_unpacklo_epi8(_pg_loadu_si32(srcpix2),
_mm_setzero_si128());
// Combine the two expanded pixels
src = _mm_unpacklo_epi64(src, src2);
                // The core operation, translated from the old
                // filter_shrink_X_SSE assembly. The equivalence between
                // these intrinsics and the C version of the operation isn't
                // obvious, but it works.
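                // A best-effort reading of the math (not the original
                // author's): with the <<2 below, _mm_mulhi_epu16(src,
                // mm_xcounter) works out to src*xcounter/0x4000, the portion
                // of this source pixel that still fits in the destination
                // pixel being finished, while src*xfrac/0x4000 is the
                // leftover carried in `accumulate` into the next destination
                // pixel. Multiplying the finished sum by xrecip then turns it
                // into an average (see the note on xrecip above).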
src = _mm_slli_epi16(src, 2);
dst = _mm_mulhi_epu16(src, mm_xcounter);
dst = _mm_add_epi16(dst, accumulate);
accumulate = _mm_mulhi_epu16(src, mm_xfrac);
dst = _mm_mulhi_epu16(dst, xrecip);
// Pack and store results. Once packed, both pixel results are
// in 64 bits. First 32 is stored at dstpix, next 32 is stored
// at dstpix2.
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
_pg_storeu_si32(dstpix, dst);
_pg_storeu_si32(dstpix2, _mm_srli_si128(dst, 4));
dstpix += 4;
dstpix2 += 4;
srcpix += 4;
srcpix2 += 4;
xcounter = xspace - xfrac;
}
}
srcpix += srcdiff;
srcpix2 += srcdiff;
dstpix += dstdiff;
dstpix2 += dstdiff;
}
}

void
filter_shrink_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
int dstpitch, int srcheight, int dstheight)
{
// This filter also iterates over Y and then over X, but unlike
// filter_shrink_X_SSE2 it can be parallelized in the X direction because
// each pixel in a row is independent from the algorithm's perspective.
    // Going from one pixel at a time to two pixels at a time with SSE2 here
    // saved about 25% Y-shrink runtime in my testing.
int srcdiff = srcpitch - (width * 4);
int dstdiff = dstpitch - (width * 4);
int x, y;
__m128i src, dst, mm_acc, mm_yfrac, mm_ycounter;
int during_2_width = width / 2;
int post_2_width = width % 2;
int yspace = 0x04000 * srcheight / dstheight; /* must be > 1 */
__m128i yrecip = _mm_set1_epi16(0x40000000 / yspace);
int ycounter = yspace;
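    // Same 2^14 fixed-point scheme as filter_shrink_X_SSE2, applied to rows:
    // yspace is srcheight/dstheight scaled by 0x4000, and yrecip is its
    // reciprocal, used to average the accumulated rows.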
Uint16 *templine;
/* allocate a clear memory area for storing the accumulator line */
// Future: when we support SDL 2.0.10 and up, we can use SDL_SIMDAlloc
// here so accumulate load/stores can be aligned, for a small perf
// improvement.
templine = (Uint16 *)calloc(dstpitch, 2);
if (templine == NULL) {
return;
}
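    // templine holds one row of 16-bit per-channel accumulators: dstpitch * 2
    // bytes gives dstpitch Uint16s, which covers the width * 4 channel values
    // needed per row.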
for (y = 0; y < srcheight; y++) {
Uint16 *accumulate = templine;
if (ycounter > 0x04000) {
// Loads up the whole pixel row in 16 bit lanes and adds to
// existing data in accumulate/templine.
for (x = 0; x < during_2_width; x++) {
src = _mm_unpacklo_epi8(_pg_loadu_si64(srcpix),
_mm_setzero_si128());
_mm_storeu_si128(
(__m128i *)accumulate,
_mm_add_epi16(_mm_loadu_si128((const __m128i *)accumulate),
src));
accumulate += 8; // 8 Uint16s, so 16 bytes
srcpix += 8; // 8 Uint8s, so 8 bytes (2 pixels)
}
if (post_2_width) { // either 0 or 1, no need for second for loop
src = _mm_unpacklo_epi8(_pg_loadu_si32(srcpix),
_mm_setzero_si128());
_pg_storeu_si64(
accumulate,
_mm_add_epi16(_pg_loadu_si64(accumulate), src));
accumulate += 4; // 4 Uint16s, so 8 bytes
srcpix += 4; // 4 bytes (1 pixel)
}
ycounter -= 0x04000;
}
else {
// Calculates and tracks variables in C then broadcasts them
// to intrinsics when needed for calculations.
int yfrac = 0x04000 - ycounter;
mm_yfrac = _mm_set1_epi16(yfrac);
mm_ycounter = _mm_set1_epi16(ycounter);
/* write out a destination line */
for (x = 0; x < during_2_width; x++) {
src = _mm_unpacklo_epi8(_pg_loadu_si64(srcpix),
_mm_setzero_si128());
srcpix += 8; // 8 bytes
mm_acc = _mm_loadu_si128((const __m128i *)accumulate);
src = _mm_slli_epi16(src, 2);
dst = _mm_mulhi_epu16(src, mm_yfrac);
src = _mm_mulhi_epu16(src, mm_ycounter);
_mm_storeu_si128((__m128i *)accumulate, dst);
                accumulate += 8; // 8 Uint16s, so 16 bytes
dst = _mm_add_epi16(src, mm_acc);
dst = _mm_mulhi_epu16(dst, yrecip);
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
_pg_storeu_si64(dstpix, dst);
dstpix += 8; // 8 bytes
}
if (post_2_width) { // either 0 or 1, no need for second for loop
src = _mm_unpacklo_epi8(_pg_loadu_si32(srcpix),
_mm_setzero_si128());
srcpix += 4;
mm_acc = _pg_loadu_si64(accumulate);
src = _mm_slli_epi16(src, 2);
dst = _mm_mulhi_epu16(src, mm_yfrac);
src = _mm_mulhi_epu16(src, mm_ycounter);
_pg_storeu_si64(accumulate, dst);
// accumulate doesn't need to be incremented here because
// it is reassigned at the top of the loop.
dst = _mm_add_epi16(src, mm_acc);
dst = _mm_mulhi_epu16(dst, yrecip);
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
_pg_storeu_si32(dstpix, dst);
dstpix += 4;
}
dstpix += dstdiff;
ycounter = yspace - yfrac;
}
srcpix += srcdiff;
}
/* free the temporary memory */
free(templine);
}

void
filter_expand_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
int dstpitch, int srcwidth, int dstwidth)
{
int dstdiff = dstpitch - (dstwidth * 4);
int *xidx0, *xmult_combined;
int x, y;
const int factorwidth = 8;
    // Inherited this from the ONLYC variant; maybe it can be removed.
#ifdef _MSC_VER
/* Make MSVC static analyzer happy by assuring dstwidth >= 2 to suppress
* a false analyzer report */
__analysis_assume(dstwidth >= 2);
#endif
/* Allocate memory for factors */
xidx0 = malloc(dstwidth * 4);
if (xidx0 == 0)
return;
    // This algorithm uses two multipliers, xm0 and xm1. Each multiplier
    // gets 32 bits of space, so this gives 64 bits per destination pixel.
xmult_combined = (int *)malloc(dstwidth * factorwidth);
if (xmult_combined == 0) {
free(xidx0);
return;
}
/* Create multiplier factors and starting indices and put them in arrays */
for (x = 0; x < dstwidth; x++) {
// Could it be worth it to reduce the fixed point there to fit
// inside 16 bits (0xFF), and then pack xidx0 in with mult factors?
int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
int xm0 = 0x100 - xm1;
xidx0[x] = x * (srcwidth - 1) / dstwidth;
// packs xm0 and xm1 scaling factors into a combined array, for easy
// loading
xmult_combined[x * 2] = xm0 | (xm0 << 16);
xmult_combined[x * 2 + 1] = xm1 | (xm1 << 16);
}
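    // In other words: xidx0[x] is the left source column for destination
    // column x, xm1 is the fractional distance to the next source column in
    // 8.8 fixed point, and xm0 = 0x100 - xm1, so each output pixel below is a
    // linear blend of source columns xidx0[x] and xidx0[x] + 1.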
__m128i src, multcombined, dst;
/* Do the scaling in raster order so we don't trash the cache */
for (y = 0; y < height; y++) {
Uint8 *srcrow0 = srcpix + y * srcpitch;
for (x = 0; x < dstwidth; x++) {
Uint8 *src_p =
srcrow0 + xidx0[x] * 4; // *8 now because of factorwidth?
src =
_mm_unpacklo_epi8(_pg_loadu_si64(src_p), _mm_setzero_si128());
// uses combined multipliers against 2 src pixels
// xm0 against src[0-3] (1 px), and xm1 against src[4-7] (1 px)
multcombined = _mm_shuffle_epi32(
_pg_loadu_si64(xmult_combined + x * 2), 0b01010000);
src = _mm_mullo_epi16(src, multcombined);
// shift over pixel 2 results and add with pixel 1 results
dst = _mm_add_epi16(src, _mm_bsrli_si128(src, 8));
// pack results and store destination pixel.
dst =
_mm_packus_epi16(_mm_srli_epi16(dst, 8), _mm_setzero_si128());
_pg_storeu_si32(dstpix, dst);
dstpix += 4;
}
dstpix += dstdiff;
}
/* free memory */
free(xidx0);
free(xmult_combined);
}

void
filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
int dstpitch, int srcheight, int dstheight)
{
// This filter parallelizes math operations using SSE2, but it also runs
// through each row 2 pixels at a time. The 2x pixels at a time strategy
// was a 23% performance improvement for Y-expand over 1x at a time.
int x, y;
__m128i src0, src1, dst, ymult0_mm, ymult1_mm;
    // For some reason the C implementation does not have this; is that OK?
int dstdiff = dstpitch - (width * 4);
int during_2_width = width / 2;
int post_2_width = width % 2;
for (y = 0; y < dstheight; y++) {
int yidx0 = y * (srcheight - 1) / dstheight;
Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
Uint8 *srcrow1 = srcrow0 + srcpitch;
int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
int ymult0 = 0x0100 - ymult1;
ymult0_mm = _mm_set1_epi16(ymult0);
ymult1_mm = _mm_set1_epi16(ymult1);
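        // ymult0 and ymult1 sum to 0x100 and act as 8.8 fixed-point blend
        // weights for the two bracketing source rows, so each destination
        // pixel below is (srcrow0 * ymult0 + srcrow1 * ymult1) >> 8 per
        // channel.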
for (x = 0; x < during_2_width; x++) {
// Load from srcrow0 and srcrow1 two pixels each, swizzled out
// into 16 bit lanes.
src0 = _mm_unpacklo_epi8(_pg_loadu_si64(srcrow0),
_mm_setzero_si128());
src1 = _mm_unpacklo_epi8(_pg_loadu_si64(srcrow1),
_mm_setzero_si128());
src0 = _mm_mullo_epi16(src0, ymult0_mm);
src1 = _mm_mullo_epi16(src1, ymult1_mm);
dst = _mm_add_epi16(src0, src1);
dst = _mm_srli_epi16(dst, 8);
// Pack down and store 2 destination pixels.
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
_pg_storeu_si64(dstpix, dst);
srcrow0 += 8; // 8 bytes (2 pixels)
srcrow1 += 8;
dstpix += 8;
}
if (post_2_width) {
// Load from srcrow0 and srcrow1 one pixel each, swizzled out
// into 16 bit lanes.
src0 = _mm_unpacklo_epi8(_pg_loadu_si32(srcrow0),
_mm_setzero_si128());
src1 = _mm_unpacklo_epi8(_pg_loadu_si32(srcrow1),
_mm_setzero_si128());
src0 = _mm_mullo_epi16(src0, ymult0_mm);
src1 = _mm_mullo_epi16(src1, ymult1_mm);
dst = _mm_add_epi16(src0, src1);
dst = _mm_srli_epi16(dst, 8);
// Pack down and store 1 destination pixel.
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
_pg_storeu_si32(dstpix, dst);
srcrow0 += 4; // 4 bytes (1 pixel)
srcrow1 += 4;
dstpix += 4;
}
dstpix += dstdiff;
}
}

void
grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
{
    /* For the SSE2 SIMD version of grayscale we do two pixels at a time.
     * Thus we can calculate the number of pixels by multiplying the width
     * of the surface to be grayscaled by the height of that surface, and
     * then work through them in pairs.
     *
     * We also need to calculate a 'skip value' in case our surface's rows
     * are not contiguous in memory. For surfaces, a single row's worth of
     * pixel data is always contiguous (i.e. each pixel is next to each
     * other). However, a surface's rows may be separated from one another
     * in memory; most commonly this happens with subsurfaces.
     * The vast majority of surfaces used in applications will probably
     * have contiguous rows, as that is what happens when you create a
     * standard 32-bit surface with pygame.Surface. SIMD transform
     * algorithms should treat this 'most normal' case as the critical path
     * to maximise performance.
     */
int s_row_skip = (src->pitch - src->w * 4) / 4;
// generate number of batches of pixels we need to loop through
int pixel_batch_length = src->w * src->h;
int num_batches = 1;
if (s_row_skip > 0) {
pixel_batch_length = src->w;
num_batches = src->h;
}
int remaining_pixels = pixel_batch_length % 2;
int perfect_2_pixels = pixel_batch_length / 2;
int perfect_2_pixels_batch_counter = perfect_2_pixels;
int remaining_pixels_batch_counter = remaining_pixels;
Uint32 *srcp = (Uint32 *)src->pixels;
Uint32 *dstp = (Uint32 *)newsurf->pixels;
    Uint64 amask64 = ((Uint64)src->format->Amask << 32) | src->format->Amask;
Uint64 rgbmask64 = ~amask64;
Uint64 rgb_weights =
((Uint64)((0x4C << src->format->Rshift) |
(0x96 << src->format->Gshift) |
(0x1D << src->format->Bshift))
<< 32) |
((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
(0x1D << src->format->Bshift));
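    // The 0x4C/0x96/0x1D weights approximate the classic 0.299/0.587/0.114
    // luma coefficients in 8-bit fixed point (76 + 150 + 29 = 255), shifted
    // into each channel's position and duplicated into both 32-bit pixel
    // slots of the 64-bit value.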
Uint64 *srcp64 = (Uint64 *)src->pixels;
Uint64 *dstp64 = (Uint64 *)newsurf->pixels;
__m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives,
mm_rgb_weights, mm_alpha_mask, mm_rgb_mask;
mm_zero = _mm_setzero_si128();
LOAD_64_INTO_M128(&amask64, &mm_alpha_mask);
LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask);
mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF);
LOAD_64_INTO_M128(&rgb_weights, &mm_rgb_weights);
mm_rgb_weights = _mm_unpacklo_epi8(mm_rgb_weights, mm_zero);
while (num_batches--) {
perfect_2_pixels_batch_counter = perfect_2_pixels;
remaining_pixels_batch_counter = remaining_pixels;
while (perfect_2_pixels_batch_counter--) {
LOAD_64_INTO_M128(srcp64, &mm_src);
/*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/
/* First we strip out the alpha so we have one of our 4 channels
empty for the rest of the calculation */
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
/*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/
/* This is where we do the efficient 8bit 'floating point multiply'
operation of each channel by the weights - using a 16bit integer
multiply, an add and a bitshift. We use this trick repeatedly
for multiplication by a 0 to 1 value in SIMD code.
*/
mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
/*mm_src = 0x000000RR00GG00BB000000RR00GG00BB*/
mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
/*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/
mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
/*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/
mm_dst = _mm_srli_epi16(mm_dst, 8);
/*mm_dst = 0x000000RR00GG00BB000000RR00GG00BB*/
/* now we have the multiplied channels we 'shuffle them out' one
* at a time so there are four copies of red, four copies of green,
* four copies of blue etc. Then we add all these together
* so each of channels contains R+G+B.
*/
mm_dst = _mm_adds_epu8(
_mm_adds_epu8(_mm_shufflehi_epi16(
_mm_shufflelo_epi16(
mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
_PG_SIMD_SHUFFLE(0, 0, 0, 0)),
_mm_shufflehi_epi16(
_mm_shufflelo_epi16(
mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1)),
_PG_SIMD_SHUFFLE(1, 1, 1, 1))),
_mm_adds_epu8(_mm_shufflehi_epi16(
_mm_shufflelo_epi16(
mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
_PG_SIMD_SHUFFLE(2, 2, 2, 2)),
_mm_shufflehi_epi16(
_mm_shufflelo_epi16(
mm_dst, _PG_SIMD_SHUFFLE(3, 3, 3, 3)),
_PG_SIMD_SHUFFLE(3, 3, 3, 3))));
/* Gr here stands for 'Gray' as we've now added all the channels
* back together after multiplying them above.
* mm_dst = 0x0000GrGr00GrGr00GrGr00GrGr0000GrGr00GrGr00GrGr00GrGr
*/
/* The rest is just packing the grayscale back to the original
* 8bit pixel layout and adding the alpha we removed earlier back
* in again
*/
mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
/*mm_dst = 0x000000000000000000GrGrGrGrGrGr00GrGrGrGrGrGr*/
mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask);
mm_dst = _mm_or_si128(mm_dst, mm_alpha);
/*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/
STORE_M128_INTO_64(mm_dst, dstp64);
/*dstp = 0xAARRGGBB*/
srcp64++;
dstp64++;
}
srcp = (Uint32 *)srcp64;
dstp = (Uint32 *)dstp64;
if (remaining_pixels_batch_counter > 0) {
mm_src = _mm_cvtsi32_si128(*srcp);
/*mm_src = 0x000000000000000000000000AARRGGBB*/
/* First we strip out the alpha so we have one of our 4 channels
empty for the rest of the calculation */
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
/*mm_src = 0x00000000000000000000000000RRGGBB*/
/* This is where we do the efficient 8bit 'floating point multiply'
operation of each channel by the weights - using a 16bit integer
multiply, an add and a bitshift. We use this trick repeatedly
for multiplication by a 0 to 1 value in SIMD code.
*/
mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
/*mm_src = 0x0000000000000000000000RR00GG00BB*/
mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
/*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
/*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
mm_dst = _mm_srli_epi16(mm_dst, 8);
/*mm_dst = 0x0000000000000000000000RR00GG00BB*/
/* now we have the multiplied channels we 'shuffle them out' one
* at a time so there are four copies of red, four copies of green,
* four copies of blue etc. Then we add all these together
* so each of channels contains R+G+B.
*/
mm_dst = _mm_adds_epu8(
_mm_adds_epu8(
_mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
_mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1))),
_mm_adds_epu8(
_mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
_mm_shufflelo_epi16(mm_dst,
_PG_SIMD_SHUFFLE(3, 3, 3, 3))));
/* Gr here stands for 'Gray' as we've now added all the channels
* back together after multiplying them above.
* mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr
*/
/* The rest is just packing the grayscale back to the original
* 8bit pixel layout and adding the alpha we removed earlier back
* in again
*/
mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
/*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/
mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask);
mm_dst = _mm_or_si128(mm_dst, mm_alpha);
/*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/
*dstp = _mm_cvtsi128_si32(mm_dst);
/*dstp = 0xAARRGGBB*/
srcp++;
dstp++;
}
srcp += s_row_skip;
srcp64 = (Uint64 *)srcp;
}
}

void
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf)
{
int s_row_skip = (src->pitch - src->w * 4) / 4;
// generate number of batches of pixels we need to loop through
int pixel_batch_length = src->w * src->h;
int num_batches = 1;
if (s_row_skip > 0) {
pixel_batch_length = src->w;
num_batches = src->h;
}
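    // Same batching strategy as grayscale_sse2: when rows are contiguous the
    // whole surface is treated as one long run of pixels, otherwise each row
    // is its own batch and s_row_skip pitch padding is skipped between rows.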
int remaining_pixels = pixel_batch_length % 4;
int perfect_4_pixels = pixel_batch_length / 4;
int perfect_4_pixels_batch_counter = perfect_4_pixels;
int remaining_pixels_batch_counter = remaining_pixels;
Uint32 *srcp = (Uint32 *)src->pixels;
Uint32 *dstp = (Uint32 *)newsurf->pixels;
__m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask;
__m128i *srcp128 = (__m128i *)src->pixels;
__m128i *dstp128 = (__m128i *)newsurf->pixels;
mm_rgb_invert_mask = _mm_set1_epi32(~src->format->Amask);
mm_alpha_mask = _mm_set1_epi32(src->format->Amask);
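    // _mm_andnot_si128(a, b) computes (~a) & b, so ANDNOT-ing the source
    // against ~Amask below inverts the RGB channels while zeroing the alpha
    // bits; the saved alpha is then OR-ed back in.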
while (num_batches--) {
perfect_4_pixels_batch_counter = perfect_4_pixels;
remaining_pixels_batch_counter = remaining_pixels;
while (perfect_4_pixels_batch_counter--) {
mm_src = _mm_loadu_si128(srcp128);
/*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
/* pull out the alpha */
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
/* do the invert */
mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);
/* put the alpha back in*/
mm_dst = _mm_or_si128(mm_dst, mm_alpha);
_mm_storeu_si128(dstp128, mm_dst);
/*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
srcp128++;
dstp128++;
}
srcp = (Uint32 *)srcp128;
dstp = (Uint32 *)dstp128;
while (remaining_pixels_batch_counter--) {
mm_src = _mm_cvtsi32_si128(*srcp);
/*mm_src = 0x000000000000000000000000AARRGGBB*/
/* pull out the alpha */
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
/* do the invert */
mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);
/* put the alpha back in*/
mm_dst = _mm_or_si128(mm_dst, mm_alpha);
*dstp = _mm_cvtsi128_si32(mm_dst);
/*dstp = 0xAARRGGBB*/
srcp++;
dstp++;
}
srcp += s_row_skip;
srcp128 = (__m128i *)srcp;
}
}

#endif /* __SSE2__ || PG_ENABLE_ARM_NEON */