@@ -23,6 +23,28 @@ Imaging
 ImagingAlphaComposite(Imaging imDst, Imaging imSrc) {
     Imaging imOut;
     int x, y;
+    int xsize = imDst->xsize;
+    __m128i mm_max_alpha = _mm_set1_epi32(255);
+    __m128i mm_max_alpha2 = _mm_set1_epi32(255 * 255);
+    __m128i mm_zero = _mm_setzero_si128();
+    __m128i mm_half = _mm_set1_epi16(128);
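+    // Shuffle masks: broadcast each pixel's 16-bit blend coefficient to its
+    // R, G and B words and leave the A word zeroed.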
+    __m128i mm_get_lo = _mm_set_epi8(
+        -1,-1, 5,4, 5,4, 5,4, -1,-1, 1,0, 1,0, 1,0);
+    __m128i mm_get_hi = _mm_set_epi8(
+        -1,-1, 13,12, 13,12, 13,12, -1,-1, 9,8, 9,8, 9,8);
+#if defined(__AVX2__)
+    __m256i vmm_max_alpha = _mm256_set1_epi32(255);
+    __m256i vmm_max_alpha2 = _mm256_set1_epi32(255 * 255);
+    __m256i vmm_zero = _mm256_setzero_si256();
+    __m256i vmm_half = _mm256_set1_epi16(128);
+    __m256i vmm_get_lo = _mm256_set_epi8(
+        -1,-1, 5,4, 5,4, 5,4, -1,-1, 1,0, 1,0, 1,0,
+        -1,-1, 5,4, 5,4, 5,4, -1,-1, 1,0, 1,0, 1,0);
+    __m256i vmm_get_hi = _mm256_set_epi8(
+        -1,-1, 13,12, 13,12, 13,12, -1,-1, 9,8, 9,8, 9,8,
+        -1,-1, 13,12, 13,12, 13,12, -1,-1, 9,8, 9,8, 9,8);
+#endif
+

     /* Check arguments */
     if (!imDst || !imSrc || strcmp(imDst->mode, "RGBA") ||
@@ -46,38 +68,155 @@ ImagingAlphaComposite(Imaging imDst, Imaging imSrc) {
         rgba8 *src = (rgba8 *)imSrc->image[y];
         rgba8 *out = (rgba8 *)imOut->image[y];

-        for (x = 0; x < imDst->xsize; x++) {
-            if (src->a == 0) {
+        x = 0;
+
+#if defined(__AVX2__)
+
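+// Approximate division by 255 of 16-bit values: (v + (v >> 8)) >> 8.
+// The rounding bias (+128, vmm_half) is added by the caller beforehand.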
+#define MM_SHIFTDIV255_epi16(src)\
+    _mm256_srli_epi16(_mm256_add_epi16(src, _mm256_srli_epi16(src, 8)), 8)
+
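+        // AVX2 path: composite 8 RGBA pixels (32 bytes) per iteration.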
+        for (; x < xsize - 7; x += 8) {
+            __m256i mm_dst, mm_dst_lo, mm_dst_hi;
+            __m256i mm_src, mm_src_lo, mm_src_hi;
+            __m256i mm_dst_a, mm_src_a, mm_out_a, mm_blend;
+            __m256i mm_coef1, mm_coef2, mm_out_lo, mm_out_hi;
+
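+            // [8] a7 b7 g7 r7 ... a0 b0 g0 r0 (8 RGBA pixels).
+            // The 256-bit unpack/pack instructions work per 128-bit lane, so
+            // the lo/hi halves carry pixels {0,1,4,5} and {2,3,6,7}; the final
+            // packus puts them back in order.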
+            mm_dst = _mm256_loadu_si256((__m256i *) &dst[x]);
+            mm_dst_lo = _mm256_unpacklo_epi8(mm_dst, vmm_zero);
+            mm_dst_hi = _mm256_unpackhi_epi8(mm_dst, vmm_zero);
+            mm_src = _mm256_loadu_si256((__m256i *) &src[x]);
+            mm_src_lo = _mm256_unpacklo_epi8(mm_src, vmm_zero);
+            mm_src_hi = _mm256_unpackhi_epi8(mm_src, vmm_zero);
+
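+            // [32] a7 a6 a5 a4 a3 a2 a1 a0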
+            mm_dst_a = _mm256_srli_epi32(mm_dst, 24);
+            mm_src_a = _mm256_srli_epi32(mm_src, 24);
+
+            // Compute coefficients
+            // blend = dst->a * (255 - src->a); 16 bits
+            mm_blend = _mm256_mullo_epi16(mm_dst_a, _mm256_sub_epi32(vmm_max_alpha, mm_src_a));
+            // outa = src->a * 255 + dst->a * (255 - src->a); 16 bits
+            mm_out_a = _mm256_add_epi32(_mm256_mullo_epi16(mm_src_a, vmm_max_alpha), mm_blend);
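+            // coef1 = src->a * 255 * 255 / outa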
+            mm_coef1 = _mm256_mullo_epi32(mm_src_a, vmm_max_alpha2);
+            // 8 bits
+            mm_coef1 = _mm256_cvtps_epi32(
+                _mm256_mul_ps(_mm256_cvtepi32_ps(mm_coef1),
+                              _mm256_rcp_ps(_mm256_cvtepi32_ps(mm_out_a)))
+            );
+            // 8 bits
+            mm_coef2 = _mm256_sub_epi32(vmm_max_alpha, mm_coef1);
+
+            mm_out_lo = _mm256_add_epi16(
+                _mm256_mullo_epi16(mm_src_lo, _mm256_shuffle_epi8(mm_coef1, vmm_get_lo)),
+                _mm256_mullo_epi16(mm_dst_lo, _mm256_shuffle_epi8(mm_coef2, vmm_get_lo)));
+            mm_out_lo = _mm256_or_si256(mm_out_lo, _mm256_slli_epi64(
+                _mm256_unpacklo_epi32(mm_out_a, vmm_zero), 48));
+            mm_out_lo = _mm256_add_epi16(mm_out_lo, vmm_half);
+            mm_out_lo = MM_SHIFTDIV255_epi16(mm_out_lo);
+
+            mm_out_hi = _mm256_add_epi16(
+                _mm256_mullo_epi16(mm_src_hi, _mm256_shuffle_epi8(mm_coef1, vmm_get_hi)),
+                _mm256_mullo_epi16(mm_dst_hi, _mm256_shuffle_epi8(mm_coef2, vmm_get_hi)));
+            mm_out_hi = _mm256_or_si256(mm_out_hi, _mm256_slli_epi64(
+                _mm256_unpackhi_epi32(mm_out_a, vmm_zero), 48));
+            mm_out_hi = _mm256_add_epi16(mm_out_hi, vmm_half);
+            mm_out_hi = MM_SHIFTDIV255_epi16(mm_out_hi);
+
+            _mm256_storeu_si256((__m256i *) &out[x],
+                                _mm256_packus_epi16(mm_out_lo, mm_out_hi));
+        }
+
+#undef MM_SHIFTDIV255_epi16
+
+#endif
+
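+        // SSE loop: 4 pixels per iteration; with AVX2 enabled it picks up
+        // the remainder left by the 8-pixel loop above.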
+#define MM_SHIFTDIV255_epi16(src)\
+    _mm_srli_epi16(_mm_add_epi16(src, _mm_srli_epi16(src, 8)), 8)
+
+        for (; x < xsize - 3; x += 4) {
+            __m128i mm_dst, mm_dst_lo, mm_dst_hi;
+            __m128i mm_src, mm_src_hi, mm_src_lo;
+            __m128i mm_dst_a, mm_src_a, mm_out_a, mm_blend;
+            __m128i mm_coef1, mm_coef2, mm_out_lo, mm_out_hi;
+
+            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+            mm_dst = _mm_loadu_si128((__m128i *) &dst[x]);
+            // [16] a1 b1 g1 r1 a0 b0 g0 r0
+            mm_dst_lo = _mm_unpacklo_epi8(mm_dst, mm_zero);
+            // [16] a3 b3 g3 r3 a2 b2 g2 r2
+            mm_dst_hi = _mm_unpackhi_epi8(mm_dst, mm_zero);
+            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+            mm_src = _mm_loadu_si128((__m128i *) &src[x]);
+            mm_src_lo = _mm_unpacklo_epi8(mm_src, mm_zero);
+            mm_src_hi = _mm_unpackhi_epi8(mm_src, mm_zero);
+
+            // [32] a3 a2 a1 a0
+            mm_dst_a = _mm_srli_epi32(mm_dst, 24);
+            mm_src_a = _mm_srli_epi32(mm_src, 24);
+
+            // Compute coefficients
+            // blend = dst->a * (255 - src->a)
+            // [16] xx b3 xx b2 xx b1 xx b0
+            mm_blend = _mm_mullo_epi16(mm_dst_a, _mm_sub_epi32(mm_max_alpha, mm_src_a));
+            // outa = src->a * 255 + blend
+            // [16] xx a3 xx a2 xx a1 xx a0
+            mm_out_a = _mm_add_epi32(_mm_mullo_epi16(mm_src_a, mm_max_alpha), mm_blend);
+            // coef1 = src->a * 255 * 255 / outa
+            mm_coef1 = _mm_mullo_epi32(mm_src_a, mm_max_alpha2);
+            // [8] xx xx xx c3 xx xx xx c2 xx xx xx c1 xx xx xx c0
+            mm_coef1 = _mm_cvtps_epi32(
+                _mm_mul_ps(_mm_cvtepi32_ps(mm_coef1),
+                           _mm_rcp_ps(_mm_cvtepi32_ps(mm_out_a)))
+            );
+            // [8] xx xx xx c3 xx xx xx c2 xx xx xx c1 xx xx xx c0
+            mm_coef2 = _mm_sub_epi32(mm_max_alpha, mm_coef1);
+
+            mm_out_lo = _mm_add_epi16(
+                _mm_mullo_epi16(mm_src_lo, _mm_shuffle_epi8(mm_coef1, mm_get_lo)),
+                _mm_mullo_epi16(mm_dst_lo, _mm_shuffle_epi8(mm_coef2, mm_get_lo)));
+            mm_out_lo = _mm_or_si128(mm_out_lo, _mm_slli_epi64(
+                _mm_unpacklo_epi32(mm_out_a, mm_zero), 48));
+            mm_out_lo = _mm_add_epi16(mm_out_lo, mm_half);
+            mm_out_lo = MM_SHIFTDIV255_epi16(mm_out_lo);
+
+            mm_out_hi = _mm_add_epi16(
+                _mm_mullo_epi16(mm_src_hi, _mm_shuffle_epi8(mm_coef1, mm_get_hi)),
+                _mm_mullo_epi16(mm_dst_hi, _mm_shuffle_epi8(mm_coef2, mm_get_hi)));
+            mm_out_hi = _mm_or_si128(mm_out_hi, _mm_slli_epi64(
+                _mm_unpackhi_epi32(mm_out_a, mm_zero), 48));
+            mm_out_hi = _mm_add_epi16(mm_out_hi, mm_half);
+            mm_out_hi = MM_SHIFTDIV255_epi16(mm_out_hi);
+
+            _mm_storeu_si128((__m128i *) &out[x],
+                             _mm_packus_epi16(mm_out_lo, mm_out_hi));
+        }
+
+#undef MM_SHIFTDIV255_epi16
+
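+        // Scalar tail: composite the remaining (at most 3) pixels.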
+        for (; x < xsize; x += 1) {
+            if (src[x].a == 0) {
                 // Copy 4 bytes at once.
-                *out = *dst;
+                out[x] = dst[x];
             } else {
                 // Integer implementation with increased precision.
                 // Each variable has extra meaningful bits.
                 // Divisions are rounded.

                 UINT32 tmpr, tmpg, tmpb;
-                UINT32 blend = dst->a * (255 - src->a);
-                UINT32 outa255 = src->a * 255 + blend;
+                UINT32 blend = dst[x].a * (255 - src[x].a);
+                UINT32 outa255 = src[x].a * 255 + blend;
                 // There we use 7 bits for precision.
                 // We could use more, but we go beyond 32 bits.
-                UINT32 coef1 = src->a * 255 * 255 * (1 << PRECISION_BITS) / outa255;
-                UINT32 coef2 = 255 * (1 << PRECISION_BITS) - coef1;
-
-                tmpr = src->r * coef1 + dst->r * coef2;
-                tmpg = src->g * coef1 + dst->g * coef2;
-                tmpb = src->b * coef1 + dst->b * coef2;
-                out->r =
-                    SHIFTFORDIV255(tmpr + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
-                out->g =
-                    SHIFTFORDIV255(tmpg + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
-                out->b =
-                    SHIFTFORDIV255(tmpb + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
-                out->a = SHIFTFORDIV255(outa255 + 0x80);
-            }
+                UINT32 coef1 = src[x].a * 255 * 255 * (1 << PRECISION_BITS) / outa255;
+                UINT32 coef2 = 255 * (1 << PRECISION_BITS) - coef1;

-            dst++;
-            src++;
-            out++;
+                tmpr = src[x].r * coef1 + dst[x].r * coef2;
+                tmpg = src[x].g * coef1 + dst[x].g * coef2;
+                tmpb = src[x].b * coef1 + dst[x].b * coef2;
+                out[x].r = SHIFTFORDIV255(tmpr + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
+                out[x].g = SHIFTFORDIV255(tmpg + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
+                out[x].b = SHIFTFORDIV255(tmpb + (0x80 << PRECISION_BITS)) >> PRECISION_BITS;
+                out[x].a = SHIFTFORDIV255(outa255 + 0x80);
+            }
         }
     }
