Skip to content
Permalink
Browse files

Ice Lake 向けの 8SymPack のテスト実装を追加した。

  • Loading branch information
umezawatakeshi committed Jan 27, 2020
1 parent 815e6cd commit 9821315a23e4b569dcbc72859aaadf87a92a387d
@@ -105,3 +105,31 @@ static inline __m256i _mm256_set16_epi8(char e15, char e14, char e13, char e12,
}

#endif

#if defined(__AVX512F__)

static inline __m512i _mm512_set8_epi8(char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
{
return _mm512_set_epi8(
e7, e6, e5, e4, e3, e2, e1, e0,
e7, e6, e5, e4, e3, e2, e1, e0,
e7, e6, e5, e4, e3, e2, e1, e0,
e7, e6, e5, e4, e3, e2, e1, e0,
e7, e6, e5, e4, e3, e2, e1, e0,
e7, e6, e5, e4, e3, e2, e1, e0,
e7, e6, e5, e4, e3, e2, e1, e0,
e7, e6, e5, e4, e3, e2, e1, e0
);
}

static inline __m512i _mm512_set16_epi8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
{
return _mm512_set_epi8(
e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
);
}

#endif
@@ -1,6 +1,8 @@
/* 文字コードはSJIS 改行コードはCRLF */
/* $Id$ */

#pragma once

/*
* std::array や std::pair や std::tuple を使うと、
* 一部のメンバ関数がインライン展開されずに普通の関数として実体化される。
@@ -2,11 +2,14 @@
/* $Id$ */

#include <myintrin_x86x64.h>
#include "POD.h"

#if !defined(GENERATE_SSE2) && !defined(GENERATE_SSSE3) && !defined(GENERATE_SSE41) && !defined(GENERATE_AVX1) && !defined(GENERATE_AVX2)
#if !defined(GENERATE_SSE2) && !defined(GENERATE_SSSE3) && !defined(GENERATE_SSE41) && !defined(GENERATE_AVX1) && !defined(GENERATE_AVX2) && !defined(GENERATE_AVX512_ICL)
#error
#endif

extern void* enabler;

template<int F>
static inline void IncrementCounters8(__m128i xmm, uint32_t* pCountTable)
{
@@ -119,6 +122,23 @@ static inline FORCEINLINE __m256i tuned_PredictLeft8Element(__m256i prev, __m256
return residual;
}

template<int F, typename std::enable_if<F == CODEFEATURE_AVX512_ICL>::type*& = enabler>
static inline FORCEINLINE __m512i tuned_PredictLeft8Element(__m512i prev, __m512i value)
{
__m512i left = _mm512_permutex2var_epi8(prev, _mm512_set_epi8(
126, 125, 124, 123, 122, 121, 120, 119,
118, 117, 116, 115, 114, 113, 112, 111,
110, 109, 108, 107, 106, 105, 104, 103,
102, 101, 100, 99, 98, 97, 96, 95,
94, 93, 92, 91, 90, 89, 88, 87,
86, 85, 84, 83, 82, 81, 80, 79,
78, 77, 76, 75, 74, 73, 72, 71,
70, 69, 68, 67, 66, 65, 64, 63
), value); // prev はこの後は使われないので、VPERMT2B で prev の方が dst になって上書きされることを期待している。
__m512i residual = _mm512_sub_epi8(value, left);
return residual;
}

template<int F, bool DoCount = true, typename std::enable_if<F < CODEFEATURE_AVX2>::type*& = enabler>
static inline FORCEINLINE __m128i tuned_PredictLeftAndCount8Element(__m128i prev, __m128i value, uint32_t* pCountTable)
{
@@ -188,6 +208,33 @@ static inline FORCEINLINE VECTOR2<__m256i> /* value0, nextprev */ tuned_RestoreL
return { s0, prev };
}

template<int F, typename std::enable_if<F == CODEFEATURE_AVX512_ICL>::type*& = enabler>
static inline FORCEINLINE VECTOR2<__m512i> /* value0, nextprev */ tuned_RestoreLeft8Element(__m512i prev, __m512i s0)
{
s0 = _mm512_add_epi8(s0, _mm512_bslli_epi128(s0, 1));
s0 = _mm512_add_epi8(s0, _mm512_bslli_epi128(s0, 2));
s0 = _mm512_add_epi8(s0, _mm512_bslli_epi128(s0, 4));
s0 = _mm512_add_epi8(s0, _mm512_bslli_epi128(s0, 8));
__m512i stmp;
stmp = _mm512_add_epi8(s0, _mm512_permutexvar_epi8(_mm512_set_epi8(
47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
), s0));
s0 = _mm512_mask_mov_epi64(s0, 0xcc, stmp);
stmp = _mm512_add_epi8(s0, _mm512_permutexvar_epi8(_mm512_set_epi8(
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
), stmp));
s0 = _mm512_mask_mov_epi64(s0, 0xf0, stmp);
s0 = _mm512_add_epi8(s0, prev);
prev = _mm512_permutexvar_epi8(_mm512_set1_epi8(63), s0);
return { s0, prev };
}

template<int F>
static inline FORCEINLINE VECTOR3<__m128i> /* value0, value1, nextprev */ tuned_RestoreLeft8Element(__m128i prev, __m128i s0, __m128i s1)
{
@@ -0,0 +1,164 @@
/* 文字コードはSJIS 改行コードはCRLF */
/* $Id$ */

#include <myintrin_x86x64.h>
#include "SymPack_x86x64.h"

#if !defined(GENERATE_AVX512_ICL)
#error
#endif

// 正しく動くことは SDE で確認してある。(実機がないので速度は不明)

template<int F>
static inline FORCEINLINE void PackForIntra(uint8_t*& q, uint8_t*& r, __m512i w)
{
__mmask8 knotzero = _mm512_cmpneq_epi64_mask(w, _mm512_set1_epi8(0));
__mmask64 knegative = _mm512_cmplt_epi8_mask(w, _mm512_set1_epi8(0));
__m512i notw = _mm512_xor_si512(w, _mm512_set1_epi8(-1));
__m512i z = _mm512_mask_mov_epi8(w, knegative, notw);

z = _mm512_or_si512(z, _mm512_slli_epi64(z, 32));
z = _mm512_or_si512(z, _mm512_slli_epi64(z, 16));
z = _mm512_or_si512(_mm512_or_si512(z, _mm512_set1_epi64(1ULL << 56)), _mm512_slli_epi64(z, 8));
__m512i lz = _mm512_lzcnt_epi64(z);

__m512i rembits = _mm512_sub_epi64(lz, _mm512_set1_epi64(1));
rembits = _mm512_mask_mov_epi64(_mm512_set1_epi64(8), knotzero, rembits);

__m128i rembits64 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_set8_epi8(56, 48, 40, 32, 24, 16, 8, 0), rembits));
__m128i vmodes = _mm_subs_epu8(_mm_set1_epi8(7), rembits64);
uint32_t modes = (uint32_t)_pext_u64(_mm_cvtsi128_si64(vmodes), 0x0707070707070707ULL);

__m128i vmask = _mm_shuffle_epi8(_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, (char)0xff), rembits64);
uint64_t mask = _mm_cvtsi128_si64(vmask);
__mmask64 kmask = mask;
// ↑VPSHUFBITQMB を使うと1命令少なくなるが、その場合でも後で POPCNT に渡すために KMOV が現れるので意味がない

w = _mm512_add_epi8(w, _mm512_srlv_epi64(_mm512_set1_epi8((char)0x80), rembits));
w = _mm512_or_si512(_mm512_and_si512(w, _mm512_set1_epi16(0x00ff)), _mm512_srlv_epi64(_mm512_andnot_si512(_mm512_set1_epi16(0x00ff), w), rembits));
rembits = _mm512_slli_epi64(rembits, 1);
w = _mm512_or_si512(_mm512_and_si512(w, _mm512_set1_epi32(0x0000ffff)), _mm512_srlv_epi64(_mm512_andnot_si512(_mm512_set1_epi32(0x0000ffff), w), rembits));
rembits = _mm512_slli_epi64(rembits, 1);
w = _mm512_or_si512(_mm512_and_si512(w, _mm512_set1_epi64(0x00000000ffffffffULL)), _mm512_srlv_epi64(_mm512_andnot_si512(_mm512_set1_epi64(0x00000000ffffffffULL), w), rembits));

_mm512_mask_compressstoreu_epi8(q, kmask, w);
q += _mm_popcnt_u64(mask);
*(uint32_t*)r = modes;
r += 3;
}

template<>
void tuned_Pack8SymAfterPredictPlanarGradient8<CODEFEATURE_AVX512_ICL>(uint8_t *pPacked, size_t *cbPacked, uint8_t *pControl, const uint8_t *pSrcBegin, const uint8_t *pSrcEnd, size_t cbStride)
{
constexpr int F = CODEFEATURE_AVX512_ICL;

auto q = pPacked;
auto r = pControl;
memset(pControl, 0, (pSrcEnd - pSrcBegin) / 64 * 3);

{
__m512i prev = _mm512_set1_epi8((char)0x80);

for (auto p = pSrcBegin; p != pSrcBegin + cbStride; p += 64)
{
__m512i value = _mm512_loadu_si512((const __m512i *)p);
__m512i residual = tuned_PredictLeft8Element<F>(prev, value);
prev = value;

PackForIntra<CODEFEATURE_AVX512_ICL>(q, r, residual);
}
}

for (auto pp = pSrcBegin + cbStride; pp != pSrcEnd; pp += cbStride)
{
__m512i prev = _mm512_setzero_si512();

for (auto p = pp; p != pp + cbStride; p += 64)
{
__m512i value = _mm512_sub_epi8(_mm512_loadu_si512((const __m512i *)p), _mm512_loadu_si512((const __m512i *)(p - cbStride)));
__m512i residual = tuned_PredictLeft8Element<F>(prev, value);
prev = value;

PackForIntra<CODEFEATURE_AVX512_ICL>(q, r, residual);
}
}

*cbPacked = q - pPacked;
}


template<int F>
static inline FORCEINLINE __m512i UnpackForIntra(const uint8_t*& q, const uint8_t *& r)
{
uint32_t modes = *(uint32_t*)r;
r += 3;

__m128i vmodes = _mm_cvtsi64_si128(_pdep_u64(modes, 0x0707070707070707ULL));
__m128i vmask = _mm_shuffle_epi8(_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, (char)0xff, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x00), vmodes);
__mmask64 kmask = _mm_cvtsi128_si64(vmask);
__m512i w = _mm512_maskz_expandloadu_epi8(kmask, q);
q += _mm_popcnt_u64(kmask);

// VPMULTISHIFTQB を使う場合
// AVX2 までと同様に VPSLL(V)Q を使った場合より速いかどうかは不明
__m512i vmodes512 = _mm512_permutexvar_epi8(_mm512_set_epi8(
7, 7, 7, 7, 7, 7, 7, 7,
6, 6, 6, 6, 6, 6, 6, 6,
5, 5, 5, 5, 5, 5, 5, 5,
4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0
), _mm512_castsi128_si512(vmodes)); // VPERMQ (_mm512_permutevar_epi64) はインデックスの上位にゴミがあっても動作は変わらないので、下のpshufbにも直接渡せるように同じバイトを複製したものを使ってよい
__m512i vshifts = _mm512_permutexvar_epi64(vmodes512, _mm512_set_epi8(
56, 48, 40, 32, 24, 16, 8, 0,
49, 42, 35, 28, 21, 14, 7, 0,
42, 36, 30, 24, 18, 12, 6, 0,
35, 30, 25, 20, 15, 10, 5, 0,
28, 24, 20, 16, 12, 8, 4, 0,
21, 18, 15, 12, 9, 6, 3, 0,
14, 12, 10, 8, 6, 4, 2, 0,
7, 6, 5, 4, 3, 2, 1, 0
));
w = _mm512_multishift_epi64_epi8(vshifts, w);
w = _mm512_and_si512(w, _mm512_shuffle_epi8(_mm512_set16_epi8(0, 0, 0, 0, 0, 0, 0, 0, (char)0xff, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x00), vmodes512));
w = _mm512_sub_epi8(w, _mm512_shuffle_epi8(_mm512_set16_epi8(0, 0, 0, 0, 0, 0, 0, 0, (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x00), vmodes512));

return w;
}

template<>
void tuned_Unpack8SymAndRestorePlanarGradient8<CODEFEATURE_AVX512_ICL>(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8_t *pPacked, const uint8_t *pControl, size_t cbStride)
{
constexpr int F = CODEFEATURE_AVX512_ICL;

auto q = pPacked;
auto r = pControl;

{
__m512i prev = _mm512_set1_epi8((char)0x80);

for (auto p = pDstBegin; p != pDstBegin + cbStride; p += 64)
{
__m512i s0 = UnpackForIntra<F>(q, r);
auto result = tuned_RestoreLeft8Element<F>(prev, s0);
_mm512_storeu_si512((__m256i *)p, result.v0);
prev = result.v1;
}
}

for (auto pp = pDstBegin + cbStride; pp != pDstEnd; pp += cbStride)
{
__m512i prev = _mm512_set1_epi8((char)0);

for (auto p = pp; p != pp + cbStride; p += 64)
{
__m512i s0 = UnpackForIntra<F>(q, r);
auto result = tuned_RestoreLeft8Element<F>(prev, s0);
_mm512_storeu_si512((__m512i*)p, _mm512_add_epi8(result.v0, _mm512_loadu_si512((const __m512i*)(p - cbStride))));
prev = result.v1;
}
}
}
@@ -518,6 +518,15 @@ extern const TUNEDFUNC_SYMPACK tfnSymPackAVX2 = {
tuned_Unpack8SymWithDiff8<CODEFEATURE_AVX2>,
};

extern const TUNEDFUNC_SYMPACK tfnSymPackAVX512ICL = {
&tfnSymPackAVX2,
{ FEATURE0_AVX512F | FEATURE0_AVX512_VBMI2, FEATURE1_BMI2 },
tuned_Pack8SymAfterPredictPlanarGradient8<CODEFEATURE_AVX512_ICL>,
tuned_Unpack8SymAndRestorePlanarGradient8<CODEFEATURE_AVX512_ICL>,
tuned_Pack8SymWithDiff8<CODEFEATURE_AVX2>,
tuned_Unpack8SymWithDiff8<CODEFEATURE_AVX2>,
};


const TUNEDFUNC_CONVERT_PREDICT tfnConvertPredictSSE41 = {
&tfnConvertPredictCPP,
@@ -694,7 +703,7 @@ const TUNEDFUNC tfnRoot = {
#endif
&tfnConvertYUVRGBAVX2,
&tfnConvertShuffleAVX2,
&tfnSymPackAVX2,
&tfnSymPackAVX512ICL,
&tfnConvertPredictAVX1,
&tfnConvertSymPackAVX2,
};
@@ -0,0 +1,16 @@
/* 文字コードはSJIS 改行コードはCRLF */
/* $Id$ */

#include "stdafx.h"
#include "utvideo.h"
#include "TunedFunc.h"
#include "TunedFunc_x86x64.h"
#include "Predict.h"
#include "HuffmanCode.h"
#include "Convert.h"
#include "ColorOrder.h"
#include "Coefficient.h"

#define GENERATE_AVX512_ICL 1
#include "Predict_x86x64.cpp"
#include "SymPack_x86x64_zmm.cpp"
@@ -279,6 +279,12 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="SymPack_x86x64_zmm.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="Thread.cpp" />
<ClCompile Include="TunedFunc.cpp" />
<ClCompile Include="TunedFunc_x86x64.cpp" />
@@ -318,6 +324,12 @@
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
<PrecompiledHeaderOutputFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)$(TargetName)-avx2.pch</PrecompiledHeaderOutputFile>
</ClCompile>
<ClCompile Include="TunedFunc_x86x64_avx512_icl.cpp">
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
</ClCompile>
<ClCompile Include="TunedFunc_x86x64_sse2.cpp">
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">StreamingSIMDExtensions2</EnableEnhancedInstructionSet>

0 comments on commit 9821315

Please sign in to comment.
You can’t perform that action at this time.