#include <tmmintrin.h>  // ssse3

//--------------------------------------------------------------------
// Blend one solid colour into four consecutive 4-byte pixels.
//
//   p      - 16 writable bytes: four pixels, bytes 0..2 of each pixel are the
//            colour channels and byte 3 is alpha.
//   clr    - packed source colour; bits 0..23 are the three colour channels in
//            the same byte order as a pixel, bits 24..31 are the source alpha.
//   covers - four coverage bytes, one per pixel (only these four are read).
//
// Per pixel (identical to the scalar nsvg__blend_pix):
//   SA' = (SA * (cover + 1)) >> 8
//   DC' = ((SC - DC) * SA' + (DC << 8)) >> 8
//   DA' = (SA' + DA) - ((SA' * DA + 255) >> 8)
static inline void nsvg__blend_pix4(unsigned char* p,
                                    unsigned clr,
                                    unsigned char* covers)
{
	const __m128i zero  = _mm_setzero_si128();
	// 16-bit lane 3 of every widened pixel is the alpha lane (it comes from byte 3).
	const __m128i lane3 = _mm_setr_epi16(0, 0, 0, -1, 0, 0, 0, -1);
	const __m128i c255  = _mm_set1_epi16(255);

	const __m128i dst = _mm_loadu_si128((const __m128i*)p);

	// Assemble exactly four cover bytes; a 16-byte vector load here would read
	// past the coverage values that belong to this call.
	const unsigned cov_bits = (unsigned)covers[0]
	                        | ((unsigned)covers[1] << 8)
	                        | ((unsigned)covers[2] << 16)
	                        | ((unsigned)covers[3] << 24);
	__m128i cov = _mm_cvtsi32_si128((int)cov_bits);
	cov = _mm_unpacklo_epi8(cov, zero);            // c0 c1 c2 c3 0 0 0 0
	cov = _mm_unpacklo_epi16(cov, cov);            // c0 c0 c1 c1 c2 c2 c3 c3
	cov = _mm_add_epi16(cov, _mm_set1_epi16(1));   // cover + 1 (at most 256, fits in 16 bits)

	// SA' = (SA * (cover + 1)) >> 8, evaluated as the high half of (SA << 8) * (cover + 1).
	const __m128i src_a = _mm_set1_epi16((short)(((clr >> 24) & 0xFF) << 8));
	const __m128i sa    = _mm_mulhi_epu16(src_a, cov);   // s0 s0 s1 s1 s2 s2 s3 s3
	const __m128i sa01  = _mm_unpacklo_epi32(sa, sa);    // s0 x4, s1 x4
	const __m128i sa23  = _mm_unpackhi_epi32(sa, sa);    // s2 x4, s3 x4

	// Source colour and destination pixels widened to one 16-bit lane per byte.
	const __m128i src = _mm_unpacklo_epi8(_mm_set1_epi32((int)clr), zero);
	const __m128i d01 = _mm_unpacklo_epi8(dst, zero);
	const __m128i d23 = _mm_unpackhi_epi8(dst, zero);

	// DC' = ((SC - DC) * SA' + (DC << 8)) >> 8.
	// The true value lies in 0..65280, so wrap-around 16-bit arithmetic is exact.
	const __m128i c01 = _mm_srli_epi16(
		_mm_add_epi16(_mm_slli_epi16(d01, 8),
		              _mm_mullo_epi16(_mm_sub_epi16(src, d01), sa01)), 8);
	const __m128i c23 = _mm_srli_epi16(
		_mm_add_epi16(_mm_slli_epi16(d23, 8),
		              _mm_mullo_epi16(_mm_sub_epi16(src, d23), sa23)), 8);

	// DA' = (SA' + DA) - ((SA' * DA + 255) >> 8).
	// Computed for every lane; only the alpha lanes are kept below.
	const __m128i a01 = _mm_sub_epi16(
		_mm_add_epi16(sa01, d01),
		_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(sa01, d01), c255), 8));
	const __m128i a23 = _mm_sub_epi16(
		_mm_add_epi16(sa23, d23),
		_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(sa23, d23), c255), 8));

	// Colour lanes come from the lerp, the alpha lane from the coverage formula.
	const __m128i out01 = _mm_or_si128(_mm_andnot_si128(lane3, c01),
	                                   _mm_and_si128(lane3, a01));
	const __m128i out23 = _mm_or_si128(_mm_andnot_si128(lane3, c23),
	                                   _mm_and_si128(lane3, a23));

	_mm_storeu_si128((__m128i*)p, _mm_packus_epi16(out01, out23));
}
//--------------------------------------------------------------------
// Blend four independent source colours into four consecutive 4-byte pixels.
//
//   p      - 16 writable bytes: four pixels, bytes 0..2 of each pixel are the
//            colour channels and byte 3 is alpha.
//   clrs   - four packed source colours, one per pixel; bits 0..23 are the
//            colour channels in pixel byte order, bits 24..31 the source alpha.
//   covers - four coverage bytes, one per pixel (only these four are read).
//
// Per pixel (identical to the scalar nsvg__blend_pix):
//   SA' = (SA * (cover + 1)) >> 8
//   DC' = ((SC - DC) * SA' + (DC << 8)) >> 8
//   DA' = (SA' + DA) - ((SA' * DA + 255) >> 8)
static inline void nsvg__blend_pix4(unsigned char* p,
                                    unsigned* clrs,
                                    unsigned char* covers)
{
	const __m128i zero  = _mm_setzero_si128();
	// 16-bit lane 3 of every widened pixel is the alpha lane (it comes from byte 3).
	const __m128i lane3 = _mm_setr_epi16(0, 0, 0, -1, 0, 0, 0, -1);
	const __m128i c255  = _mm_set1_epi16(255);

	const __m128i dst = _mm_loadu_si128((const __m128i*)p);
	const __m128i src = _mm_loadu_si128((const __m128i*)clrs);

	// Assemble exactly four cover bytes; a 16-byte vector load here would read
	// past the coverage values that belong to this call.
	const unsigned cov_bits = (unsigned)covers[0]
	                        | ((unsigned)covers[1] << 8)
	                        | ((unsigned)covers[2] << 16)
	                        | ((unsigned)covers[3] << 24);
	__m128i cov = _mm_cvtsi32_si128((int)cov_bits);
	cov = _mm_unpacklo_epi8(cov, zero);            // c0 c1 c2 c3 0 0 0 0
	cov = _mm_unpacklo_epi16(cov, cov);            // c0 c0 c1 c1 c2 c2 c3 c3
	cov = _mm_add_epi16(cov, _mm_set1_epi16(1));   // cover + 1 (at most 256, fits in 16 bits)

	// Put (SA << 8) into both 16-bit halves of each pixel's 32-bit slot so it
	// lines up with the duplicated cover lanes.
	__m128i src_a = _mm_and_si128(src, _mm_set1_epi32((int)0xFF000000u));
	src_a = _mm_or_si128(src_a, _mm_srli_epi32(src_a, 16));

	// SA' = (SA * (cover + 1)) >> 8, evaluated as the high half of (SA << 8) * (cover + 1).
	const __m128i sa   = _mm_mulhi_epu16(src_a, cov);   // s0 s0 s1 s1 s2 s2 s3 s3
	const __m128i sa01 = _mm_unpacklo_epi32(sa, sa);    // s0 x4, s1 x4
	const __m128i sa23 = _mm_unpackhi_epi32(sa, sa);    // s2 x4, s3 x4

	// Source and destination pixels widened to one 16-bit lane per byte.
	const __m128i s01 = _mm_unpacklo_epi8(src, zero);
	const __m128i s23 = _mm_unpackhi_epi8(src, zero);
	const __m128i d01 = _mm_unpacklo_epi8(dst, zero);
	const __m128i d23 = _mm_unpackhi_epi8(dst, zero);

	// DC' = ((SC - DC) * SA' + (DC << 8)) >> 8.
	// The true value lies in 0..65280, so wrap-around 16-bit arithmetic is exact.
	const __m128i c01 = _mm_srli_epi16(
		_mm_add_epi16(_mm_slli_epi16(d01, 8),
		              _mm_mullo_epi16(_mm_sub_epi16(s01, d01), sa01)), 8);
	const __m128i c23 = _mm_srli_epi16(
		_mm_add_epi16(_mm_slli_epi16(d23, 8),
		              _mm_mullo_epi16(_mm_sub_epi16(s23, d23), sa23)), 8);

	// DA' = (SA' + DA) - ((SA' * DA + 255) >> 8).
	// Computed for every lane; only the alpha lanes are kept below.
	const __m128i a01 = _mm_sub_epi16(
		_mm_add_epi16(sa01, d01),
		_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(sa01, d01), c255), 8));
	const __m128i a23 = _mm_sub_epi16(
		_mm_add_epi16(sa23, d23),
		_mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(sa23, d23), c255), 8));

	// Colour lanes come from the lerp, the alpha lane from the coverage formula.
	const __m128i out01 = _mm_or_si128(_mm_andnot_si128(lane3, c01),
	                                   _mm_and_si128(lane3, a01));
	const __m128i out23 = _mm_or_si128(_mm_andnot_si128(lane3, c23),
	                                   _mm_and_si128(lane3, a23));

	_mm_storeu_si128((__m128i*)p, _mm_packus_epi16(out01, out23));
}
//--------------------------------------------------------------------
// Blend one packed source colour into a single 4-byte pixel.
//
//   p     - the pixel: bytes 0..2 are colour channels, byte 3 is alpha.
//   clr   - packed source colour; byte k of clr is blended into p[k] for
//           k = 0..2, and bits 24..31 hold the source alpha.
//   cover - coverage factor; the source alpha is scaled by (cover + 1) / 256.
//
//   SA' = (SA * (cover + 1)) >> 8
//   DC' = ((SC - DC) * SA' + (DC << 8)) >> 8
//   DA' = (SA' + DA) - ((SA' * DA + 255) >> 8)
static inline void nsvg__blend_pix(unsigned char* p,
                                   unsigned clr,
                                   unsigned cover = 0)
{
	const int srcAlpha = (((clr >> 24) & 0xff) * (cover + 1)) >> 8;

	// The three colour channels all follow the same lerp towards the source.
	for (int ch = 0; ch < 3; ++ch) {
		const int srcVal = (clr >> (ch * 8)) & 0xff;
		const int dstVal = p[ch];
		p[ch] = (unsigned char)(((srcVal - dstVal) * srcAlpha + (dstVal << 8)) >> 8);
	}

	// Alpha accumulates instead of being interpolated.
	const int dstAlpha = p[3];
	p[3] = (unsigned char)((srcAlpha + dstAlpha) - ((srcAlpha * dstAlpha + 255) >> 8));
}

//--------------------------------------------------------------------
// Fill `count` pixels of one scanline, starting at `dst`, using the paint in `cache`.
//
//   dst    - first destination pixel (4 bytes per pixel).
//   count  - number of pixels to fill.
//   cover  - one coverage byte per pixel.
//   x, y   - position of the first pixel; with tx, ty and scale it is mapped to
//            ((x - tx) / scale, (y - ty) / scale) before the gradient transform.
//   cache  - paint description: type, colour table and the 6-element xform.
//
// Pixels are handled in groups of four through nsvg__blend_pix4; the remaining
// zero to three pixels go through nsvg__blend_pix. Gradient colours are looked
// up in cache->colors with an index derived from nsvg__clampf(v * 255, 0, 255).
// A paint type other than the three handled here leaves dst untouched.
static void nsvg__scanlineSolid(unsigned char* dst, int count, unsigned char* cover, int x, int y,
                                float tx, float ty, float scale, NSVGcachedPaint* cache)
{
	int quads = count >> 2;                       // full groups of four pixels
	int rest  = count - ((count >> 2) << 2);      // leftover pixels after the groups

	if (cache->type == NSVG_PAINT_COLOR) {
		for (; quads != 0; --quads) {
			nsvg__blend_pix4(dst, cache->colors[0], cover);
			dst += 16;
			cover += 4;
		}
		for (; rest != 0; --rest) {
			nsvg__blend_pix(dst, cache->colors[0], *cover);
			dst += 4;
			++cover;
		}
		return;
	}

	if (cache->type == NSVG_PAINT_LINEAR_GRADIENT) {
		// TODO: spread modes.
		float* xf = cache->xform;
		float gradX = ((float)x - tx) / scale;
		const float rowTerm = ((float)y - ty) / scale * xf[3] + xf[5];
		const float step = 1.0f / scale;
		unsigned int quadColors[4];

		for (; quads != 0; --quads) {
			// Look up the four gradient colours first, then blend them in one call.
			for (int k = 0; k < 4; ++k) {
				const float pos = gradX * xf[1] + rowTerm;
				quadColors[k] = cache->colors[(int)nsvg__clampf(pos*255.0f, 0, 255.0f)];
				gradX += step;
			}
			nsvg__blend_pix4(dst, quadColors, cover);
			dst += 16;
			cover += 4;
		}
		for (; rest != 0; --rest) {
			const float pos = gradX * xf[1] + rowTerm;
			nsvg__blend_pix(dst, cache->colors[(int)nsvg__clampf(pos*255.0f, 0, 255.0f)], *cover);
			dst += 4;
			++cover;
			gradX += step;
		}
		return;
	}

	if (cache->type == NSVG_PAINT_RADIAL_GRADIENT) {
		// TODO: spread modes.
		// TODO: focus (fx,fy)
		float* xf = cache->xform;
		float gradX = ((float)x - tx) / scale;
		const float rowTermX = ((float)y - ty) / scale * xf[2] + xf[4];
		const float rowTermY = ((float)y - ty) / scale * xf[3] + xf[5];
		const float step = 1.0f / scale;
		unsigned int quadColors[4];

		for (; quads != 0; --quads) {
			// The table index is the distance of the transformed point from the origin.
			for (int k = 0; k < 4; ++k) {
				const float gx = gradX * xf[0] + rowTermX;
				const float gy = gradX * xf[1] + rowTermY;
				const float dist = sqrtf(gx*gx + gy*gy);
				quadColors[k] = cache->colors[(int)nsvg__clampf(dist*255.0f, 0, 255.0f)];
				gradX += step;
			}
			nsvg__blend_pix4(dst, quadColors, cover);
			dst += 16;
			cover += 4;
		}
		for (; rest != 0; --rest) {
			const float gx = gradX * xf[0] + rowTermX;
			const float gy = gradX * xf[1] + rowTermY;
			const float dist = sqrtf(gx*gx + gy*gy);
			nsvg__blend_pix(dst, cache->colors[(int)nsvg__clampf(dist*255.0f, 0, 255.0f)], *cover);
			dst += 4;
			++cover;
			gradX += step;
		}
	}
}