#include <tmmintrin.h> // ssse3

//--------------------------------------------------------------------
// Blends one solid colour over four consecutive 4-byte pixels using SSE2.
//
//   p      - 16 writable bytes: four pixels, channel 3 of each is alpha.
//   clr    - packed source colour; byte i (little-endian) blends into channel i
//            of every pixel, and bits 24..31 are the source alpha.
//   covers - four coverage bytes, one per pixel. Exactly four bytes are read.
//
// Per pixel (identical to the formulas the scalar path documents):
//   SA' = (SA * (cover + 1)) >> 8
//   DC' = ((SC - DC) * SA' + (DC << 8)) >> 8      (channels 0..2)
//   DA' = (SA' + DA) - ((SA' * DA + 255) >> 8)    (channel 3)
static inline void nsvg__blend_pix4(unsigned char* p, unsigned clr, unsigned char* covers)
{
	const __m128i zero = _mm_setzero_si128();
	// 16-bit lanes 3 and 7 of an unpacked pixel pair carry the alpha channel.
	const __m128i alphaLanes = _mm_set_epi16(-1, 0, 0, 0, -1, 0, 0, 0);
	const __m128i roundUp = _mm_set1_epi16(255);

	const __m128i dst = _mm_loadu_si128((const __m128i*)p);

	// Assemble the four cover bytes by hand so nothing past covers[3] is touched.
	const unsigned cov4 = (unsigned)covers[0]
						| ((unsigned)covers[1] << 8)
						| ((unsigned)covers[2] << 16)
						| ((unsigned)covers[3] << 24);
	__m128i cov = _mm_cvtsi32_si128((int)cov4);
	cov = _mm_unpacklo_epi8(cov, zero);				// c0 c1 c2 c3 0 0 0 0
	cov = _mm_unpacklo_epi16(cov, cov);				// c0 c0 c1 c1 c2 c2 c3 c3
	cov = _mm_add_epi16(cov, _mm_set1_epi16(1));	// cover + 1, range 1..256

	// SA' = (SA * (cover + 1)) >> 8, taken as the high half of (SA << 8) * (cover + 1).
	const unsigned srcAlpha = (clr >> 24) & 0xFF;
	const __m128i srcA = _mm_set1_epi16((short)(srcAlpha << 8));
	const __m128i sa = _mm_mulhi_epu16(srcA, cov);		// s0 s0 s1 s1 s2 s2 s3 s3
	const __m128i sa10 = _mm_unpacklo_epi32(sa, sa);	// s0 x4, s1 x4
	const __m128i sa32 = _mm_unpackhi_epi32(sa, sa);	// s2 x4, s3 x4

	// Source colour widened to 16 bits per channel, repeated for both pixels of a pair.
	const __m128i src16 = _mm_unpacklo_epi8(_mm_set1_epi32((int)clr), zero);

	// Pixels 0 and 1. All sums stay below 65536, so wrapping 16-bit math is exact.
	const __m128i d10 = _mm_unpacklo_epi8(dst, zero);
	const __m128i c10 = _mm_srli_epi16(
		_mm_add_epi16(_mm_slli_epi16(d10, 8), _mm_mullo_epi16(_mm_sub_epi16(src16, d10), sa10)), 8);
	const __m128i a10 = _mm_sub_epi16(
		_mm_add_epi16(sa10, d10),
		_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(sa10, d10), roundUp), 8));
	const __m128i r10 = _mm_or_si128(_mm_andnot_si128(alphaLanes, c10), _mm_and_si128(alphaLanes, a10));

	// Pixels 2 and 3.
	const __m128i d32 = _mm_unpackhi_epi8(dst, zero);
	const __m128i c32 = _mm_srli_epi16(
		_mm_add_epi16(_mm_slli_epi16(d32, 8), _mm_mullo_epi16(_mm_sub_epi16(src16, d32), sa32)), 8);
	const __m128i a32 = _mm_sub_epi16(
		_mm_add_epi16(sa32, d32),
		_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(sa32, d32), roundUp), 8));
	const __m128i r32 = _mm_or_si128(_mm_andnot_si128(alphaLanes, c32), _mm_and_si128(alphaLanes, a32));

	// Every lane is already within 0..255, so the saturating pack is lossless.
	_mm_storeu_si128((__m128i*)p, _mm_packus_epi16(r10, r32));
}
//-------------------------------------------------------------------- static inline void nsvg__blend_pix4(unsigned char* p, unsigned* clrs, unsigned char* covers) { __m128i _Z = _mm_setzero_si128(); __m128i DC10 = _mm_loadu_si128((__m128i*)p); // 'A3R3G3B3'A2R2G2B2'A1R1G1B1'A0R0G0B0' __m128i SA10 = _mm_loadu_si128((__m128i*)covers); // 'Ax'Ax'Ax'Ax'Ax'Ax'Ax'Ax'Ax'Ax'Ax'Ax'A3'A2'A1'A0' SA10 = _mm_unpacklo_epi8(SA10, _Z); // 'Axoo'Axoo'Axoo'Axoo'A3oo'A2oo'A1oo'A0oo' SA10 = _mm_unpacklo_epi16(SA10, SA10); // 'A3oo'A3oo'A2oo'A2oo'A1oo'A1oo'A0oo'A0oo' SA10 = _mm_add_epi16(SA10, _mm_set1_epi16(1)); __m128i CA = _mm_and_si128(_mm_loadu_si128((__m128i*)clrs), _mm_set1_epi32(0xFF000000)); // 'A3oooooo'A2oooooo'A1oooooo'A0oooooo' CA = _mm_add_epi32(CA, _mm_srli_epi32(CA, 16)); // 'A3ooA3oo'A2ooA2oo'A1ooA1oo'A0ooA0oo' // SA' = (SA * (cover + 1)) >> 8; SA10 = _mm_mulhi_epu16(CA, SA10); __m128i SA32 = _mm_unpackhi_epi32(SA10, SA10); // 'A3'A3'A3'A3'A2'A2'A2'A2' SA10 = _mm_unpacklo_epi32(SA10, SA10); // 'A1'A1'A1'A1'A0'A0'A0'A0' __m128i SC10 = _mm_loadu_si128((__m128i*)clrs); // 'A3R3G3B3'A2R2G2B2'A1R1G1B1'A0R0G0B0' __m128i SC32 = _mm_unpackhi_epi8(SC10, _Z); // 'ooA3'ooR3'ooG3'ooB3'ooA2'ooR2'ooG2'ooB2' // DC' = ((SC - DC) * SA' + (DC << 8)) >> 8 __m128i DC32 = _mm_unpackhi_epi8(DC10, _Z); // 'ooA3'ooR3'ooG3'ooB3'ooA2'ooR2'ooG2'ooB2' __m128i DA = _mm_and_si128(DC10, _mm_set1_epi32(0xFF000000)); // 'A3oo'oooo'A2oo'oooo'A1oo'oooo'A0oo'oooo' DC32 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(DC32, 8), _mm_mullo_epi16(_mm_sub_epi16(SC32, DC32), SA32)), 8); SC10 = _mm_unpacklo_epi8(SC10, _Z); // 'ooA1'ooR1'ooG1'ooB1'ooA0'ooR0'ooG0'ooB0' DC10 = _mm_unpacklo_epi8(DC10, _Z); // 'ooA1'ooR1'ooG1'ooB1'ooA0'ooR0'ooG0'ooB0' DC10 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(DC10, 8), _mm_mullo_epi16(_mm_sub_epi16(SC10, DC10), SA10)), 8); __m128i SA = _mm_slli_epi16(_mm_packs_epi32(SA10, SA32), 8); // 'A3oo'A3oo'A2oo'A2oo'A1oo'A1oo'A0oo'A0oo' // DA' = (SA' + DA) - ((SA' * DA + 255) >> 8) DA 
= _mm_sub_epi16(_mm_add_epi16(DA, SA), _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(DA, SA), _mm_set1_epi16(255)), 8)); DA = _mm_and_si128(_mm_slli_epi16(DA, 8), _mm_set1_epi32(0xFF000000)); _mm_storeu_si128((__m128i*)p, _mm_add_epi8(_mm_packus_epi16(DC10, DC32), DA)); } //-------------------------------------------------------------------- static inline void nsvg__blend_pix(unsigned char* p, unsigned clr, unsigned cover = 0) { // SA' = (SA * (cover + 1)) >> 8; // DC' = ((SC - DC) * SA' + (DC << 8)) >> 8 // DA' = (SA' + DA) - ((SA' * DA + 255) >> 8) int sr = clr & 0xff; int sg = (clr >> 8) & 0xff; int sb = (clr >> 16) & 0xff; int sa = (((clr >> 24) & 0xff) * (cover + 1)) >> 8; int r = p[0]; int g = p[1]; int b = p[2]; int a = p[3]; p[0] = (unsigned char)(((sr - r) * sa + (r << 8)) >> 8); p[1] = (unsigned char)(((sg - g) * sa + (g << 8)) >> 8); p[2] = (unsigned char)(((sb - b) * sa + (b << 8)) >> 8); p[3] = (unsigned char)((sa + a) - ((sa * a + 255) >> 8)); } //-------------------------------------------------------------------- static void nsvg__scanlineSolid(unsigned char* dst, int count, unsigned char* cover, int x, int y, float tx, float ty, float scale, NSVGcachedPaint* cache) { if (cache->type == NSVG_PAINT_COLOR) { int len = (count >> 2); while(len--) { nsvg__blend_pix4(dst, cache->colors[0], cover); dst += 16; cover += 4; } len = count - ((count >> 2) << 2); while(len--) { nsvg__blend_pix(dst, cache->colors[0], *cover); dst += 4; ++cover; } } else if (cache->type == NSVG_PAINT_LINEAR_GRADIENT) { // TODO: spread modes. 
float* t = cache->xform; float fx = ((float)x - tx) / scale; const float fy = ((float)y - ty) / scale * t[3] + t[5]; const float dx = 1.0f / scale; unsigned int clrs[4]; int len = (count >> 2); while(len--) { for(int i = 0; i < 4; ++i) { const float gy = fx * t[1] + fy; clrs[i] = cache->colors[(int)nsvg__clampf(gy*255.0f, 0, 255.0f)]; fx += dx; } nsvg__blend_pix4(dst, clrs, cover); dst += 16; cover += 4; } len = count - ((count >> 2) << 2); while(len--) { const float gy = fx * t[1] + fy; nsvg__blend_pix(dst, cache->colors[(int)nsvg__clampf(gy*255.0f, 0, 255.0f)], *cover); dst += 4; ++cover; fx += dx; } } else if (cache->type == NSVG_PAINT_RADIAL_GRADIENT) { // TODO: spread modes. // TODO: focus (fx,fy) float* t = cache->xform; float fx = ((float)x - tx) / scale; const float fy1 = ((float)y - ty) / scale * t[2] + t[4]; const float fy2 = ((float)y - ty) / scale * t[3] + t[5]; const float dx = 1.0f / scale; unsigned int clrs[4]; int len = (count >> 2); while(len--) { for(int i = 0; i < 4; ++i) { const float gx = fx * t[0] + fy1; const float gy = fx * t[1] + fy2; const float gd = sqrtf(gx*gx + gy*gy); clrs[i] = cache->colors[(int)nsvg__clampf(gd*255.0f, 0, 255.0f)]; fx += dx; } nsvg__blend_pix4(dst, clrs, cover); dst += 16; cover += 4; } len = count - ((count >> 2) << 2); while(len--) { const float gx = fx * t[0] + fy1; const float gy = fx * t[1] + fy2; const float gd = sqrtf(gx*gx + gy*gy); nsvg__blend_pix(dst, cache->colors[(int)nsvg__clampf(gd*255.0f, 0, 255.0f)], *cover); dst += 4; ++cover; fx += dx; } return; } }