@@ -12,7 +12,12 @@
#define SIMD_SSE
#endif
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#include <intrin.h> // __cpuid
@@ -22,10 +27,19 @@
#define SIMD_NEON
#endif
// WebAssembly SIMD implementation requires a few bleeding edge intrinsics that are only available in Chrome Canary
#if defined(__wasm_simd128__) && defined(__wasm_unimplemented_simd128__)
#define SIMD_WASM
#endif
#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif
#ifdef SIMD_AVX
#include <immintrin.h>
#endif
#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
@@ -34,6 +48,10 @@
#endif
#endif
#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif
#ifndef TRACE
#define TRACE 0
#endif
@@ -42,6 +60,14 @@
#include <stdio.h>
#endif
#ifdef SIMD_WASM
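// Helper shuffles: wasm_simd128.h doesn't expose a 32-bit lane splat or SSE-style byte/word unpacks
// directly, so they are built out of generic v8x16 shuffles. wasm_v32x4_splat broadcasts 32-bit lane i
// to all four lanes; the unpacklo/unpackhi macros interleave the low/high halves of a and b, matching
// the semantics of _mm_unpacklo_epi8/_mm_unpackhi_epi8 and their 16-bit counterparts.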
#define wasm_v32x4_splat(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3)
#define wasm_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasm_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasm_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
#define wasm_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
#endif
namespace meshopt
{
@@ -267,7 +293,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
return data;
}
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
@@ -384,7 +410,7 @@ static const unsigned char* decodeVertexBlock(const unsigned char* data, const u
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];
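// For each possible 8-bit sentinel mask, kDecodeBytesGroupShuffle stores the shuffle indices that
// scatter the packed full-size ("exception") bytes back to the lanes whose encoded field hit the
// sentinel value, and kDecodeBytesGroupCount stores the popcount of the mask, i.e. how many such
// bytes the group consumed from the data stream.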
@@ -424,27 +450,6 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
return _mm_unpacklo_epi64(sm0, sm1r);
}
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
switch (bitslog2)
@@ -511,9 +516,66 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
case 3:
{
__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
return data + 16;
}
default:
assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
return data;
}
}
#endif
#ifdef SIMD_AVX
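// decodeBytesGroupConfig[bitslog2 - 1] is the per-byte sentinel mask (3 for 2-bit groups, 15 for
// 4-bit groups); decodeBytesGroupConfig[bitslog2 + 1] is the control vector for
// _mm_multishift_epi64_epi8, giving the bit offset within the 64-bit source lane at which each
// destination byte's field starts.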
static const __m128i decodeBytesGroupConfig[] = {
_mm_set1_epi8(3),
_mm_set1_epi8(15),
_mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
_mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
switch (bitslog2)
{
case 0:
{
__m128i result = _mm_setzero_si128();
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
return data;
}
case 1:
case 2:
{
const unsigned char* skip = data + (bitslog2 << 2);
__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
// spread each 2/4-bit field into the low bits of its own byte and isolate it with the sentinel mask
__m128i selw = _mm_shuffle_epi32(selb, 0x44);
__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
// lanes whose field equals the sentinel pull their byte from the rest stream via mask expansion
__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
return skip + _mm_popcnt_u32(mask16);
}
case 3:
{
__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
@@ -561,28 +623,6 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m
#endif
}
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
switch (bitslog2)
@@ -639,9 +679,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
case 3:
{
uint8x16_t result = vld1q_u8(data);
vst1q_u8(buffer, result);
@@ -655,7 +693,205 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
}
#endif
#ifdef SIMD_WASM
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
// TODO: 8b buffer overrun - should we use splat or extend buffers?
v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
// TODO: we should use v8x16_load_splat
v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
return wasm_v8x16_shuffle(sm0, sm1r, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
}
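// Emulates SSE movemask: each byte of mask is 0x00 or 0xFF, so ANDing with 0x8040201008040201
// leaves one distinct bit per byte in each 64-bit half, and the shift/OR cascade folds those
// eight bits down into a single byte per half.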
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
uint64_t mbits = 0x8040201008040201ull;
uint64_t m0_8 = wasm_i64x2_extract_lane(mask, 0) & mbits;
uint64_t m1_8 = wasm_i64x2_extract_lane(mask, 1) & mbits;
uint32_t m0_4 = m0_8 | (m0_8 >> 32);
uint32_t m1_4 = m1_8 | (m1_8 >> 32);
uint16_t m0_2 = m0_4 | (m0_4 >> 16);
uint16_t m1_2 = m1_4 | (m1_4 >> 16);
mask0 = m0_2 | (m0_2 >> 8);
mask1 = m1_2 | (m1_2 >> 8);
}
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
switch (bitslog2)
{
case 0:
{
v128_t result = wasm_i8x16_splat(0);
wasm_v128_store(buffer, result);
return data;
}
case 1:
{
// TODO: test 4b load splat
v128_t sel2 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 4);
v128_t sel22 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
v128_t sel2222 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
// fast path: no field in the group hit the sentinel, so sel already holds all 16 decoded bytes
if (!wasm_i8x16_any_true(mask))
{
wasm_v128_store(buffer, sel);
return data + 4;
}
unsigned char mask0, mask1;
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
// TODO: test or/andnot
v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
}
case 2:
{
// TODO: test 8b load splat
v128_t sel4 = wasm_v128_load(data);
v128_t rest = wasm_v128_load(data + 8);
v128_t sel44 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
// fast path: no field in the group hit the sentinel
if (!wasm_i8x16_any_true(mask))
{
wasm_v128_store(buffer, sel);
return data + 8;
}
unsigned char mask0, mask1;
wasmMoveMask(mask, mask0, mask1);
v128_t shuf = decodeShuffleMask(mask0, mask1);
// TODO: test or/andnot
v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
wasm_v128_store(buffer, result);
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
}
case 3:
{
v128_t result = wasm_v128_load(data);
wasm_v128_store(buffer, result);
return data + 16;
}
default:
assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
return data;
}
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX)
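// Transposes a 4x16 byte matrix held in x0..x3: byte j of each input register ends up in one
// contiguous 4-byte group, so each output register holds four transposed 4-byte columns.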
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
__m128i t0 = _mm_unpacklo_epi8(x0, x1);
__m128i t1 = _mm_unpackhi_epi8(x0, x1);
__m128i t2 = _mm_unpacklo_epi8(x2, x3);
__m128i t3 = _mm_unpackhi_epi8(x2, x3);
x0 = _mm_unpacklo_epi16(t0, t2);
x1 = _mm_unpackhi_epi16(t0, t2);
x2 = _mm_unpacklo_epi16(t1, t3);
x3 = _mm_unpackhi_epi16(t1, t3);
}
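// Decodes zigzag encoding bytewise: x = (v >> 1) ^ -(v & 1). SSE has no 8-bit shift, so a 16-bit
// shift is used and bits that crossed byte boundaries are masked off with 127.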
static __m128i unzigzag8(__m128i v)
{
__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
return _mm_xor_si128(xl, xr);
}
#endif
#ifdef SIMD_NEON
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
uint8x16x2_t t01 = vzipq_u8(x0, x1);
uint8x16x2_t t23 = vzipq_u8(x2, x3);
uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
x0 = vreinterpretq_u8_u16(x01.val[0]);
x1 = vreinterpretq_u8_u16(x01.val[1]);
x2 = vreinterpretq_u8_u16(x23.val[0]);
x3 = vreinterpretq_u8_u16(x23.val[1]);
}
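// Same bytewise zigzag decode as the SSE version; NEON has a native 8-bit shift, so no masking is needed.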
static uint8x16_t unzigzag8(uint8x16_t v)
{
uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
uint8x16_t xr = vshrq_n_u8(v, 1);
return veorq_u8(xl, xr);
}
#endif
#ifdef SIMD_WASM
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
v128_t t0 = wasm_unpacklo_v8x16(x0, x1);
v128_t t1 = wasm_unpackhi_v8x16(x0, x1);
v128_t t2 = wasm_unpacklo_v8x16(x2, x3);
v128_t t3 = wasm_unpackhi_v8x16(x2, x3);
x0 = wasm_unpacklo_v16x8(t0, t2);
x1 = wasm_unpackhi_v16x8(t0, t2);
x2 = wasm_unpacklo_v16x8(t1, t3);
x3 = wasm_unpackhi_v16x8(t1, t3);
}
static v128_t unzigzag8(v128_t v)
{
v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
v128_t xr = wasm_u8x16_shr(v, 1);
return wasm_v128_xor(xl, xr);
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
assert(buffer_size % kByteGroupSize == 0);
@@ -719,7 +955,7 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
return 0;
}
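// The per-ISA macros below abstract one step of the inner delta-decoding loop: TEMP is the vector
// type, PREP loads the running last-vertex prefix for attribute offset k, LOAD fetches a transposed
// row, GRP4 splats its four 32-bit groups, FIXD accumulates the bytewise prefix sum of deltas, and
// SAVE stores each decoded 4-byte chunk into the interleaved vertex output at vertex_size stride.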
#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
@@ -735,6 +971,15 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif
#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasm_v32x4_splat(r##i, 0), t1 = wasm_v32x4_splat(r##i, 1), t2 = wasm_v32x4_splat(r##i, 2), t3 = wasm_v32x4_splat(r##i, 3)
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#endif
PREP();
@@ -906,13 +1151,19 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
int cpuinfo[4] = {};
__cpuid(cpuinfo, 1);
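// ECX bit 9 of CPUID leaf 1 reports SSSE3, which the SSE decoder requires; otherwise fall back to scalar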
decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
decode = decodeVertexBlockSimd;
#else
decode = decodeVertexBlock;
#endif
#if defined(SIMD_WASM)
// TODO: workaround for https://github.com/emscripten-core/emscripten/issues/9767
if (!gDecodeBytesGroupInitialized)
gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
assert(gDecodeBytesGroupInitialized);
(void)gDecodeBytesGroupInitialized;
#endif