Eliminate buffer and copy for sifting codable coefficients
For large transforms we reorder the coefficients from our forward transform
so that the codable coefficients come first. Before this patch, we did this
by buffering the output of the forward transform and then writing it out in
the correct order. A sketch of the new index mapping is included below.

I'm a little surprised by the lack of impact in the microbenchmark
results. I expected the copy to be inefficient enough that removing it
would provide a performance boost. It seems that a copy within the L1
cache just doesn't have a noticeable impact.

In the real world, performance does seem to improve on AWCY by about
half a percent.

https://beta.arewecompressedyet.com/?job=master-redo-70dc401df%402020-01-19T14%3A02%3A47.242Z&job=embed_fwd_reorder%402020-01-19T14%3A25%3A07.898Z
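
A minimal sketch of the index mapping that the new store loops implement, assuming the column-major `rows` x `cols` output layout used by `fwd_txfm2d`; the helper name `sifted_index` is hypothetical and only for illustration:

/// Hypothetical helper: where the coefficient at (row `r`, column `c`) of a
/// column-major `rows` x `cols` forward transform is written once the output
/// is grouped into 32x32 chunks with the codable chunk first.
fn sifted_index(r: usize, c: usize, rows: usize, cols: usize) -> usize {
  // Each 32x32 chunk keeps the column-major layout with a stride of at most 32.
  let stride = rows.min(32);
  // The last 32 rows of a column group are stored after its first 32 rows.
  let row_group = (r >= 32) as usize * stride * cols.min(32);
  // Column groups of 32 are separated by a full column group of coefficients.
  let col_group = rows * (c & !31);
  row_group + col_group + (c & 31) * stride + (r & 31)
}

fn main() {
  // For a 64x64 transform the codable 32x32 block occupies indices 0..1024,
  // followed by the remaining chunks in the order listed in the patch:
  // last rows/first cols, first rows/last cols, last rows/last cols.
  assert_eq!(sifted_index(0, 0, 64, 64), 0);
  assert_eq!(sifted_index(31, 31, 64, 64), 1023);
  assert_eq!(sifted_index(32, 0, 64, 64), 1024);
  assert_eq!(sifted_index(0, 32, 64, 64), 2048);
  assert_eq!(sifted_index(32, 32, 64, 64), 3072);
}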
KyleSiefring committed Jan 21, 2020
1 parent 9110e82 commit 37509b2
Showing 2 changed files with 63 additions and 70 deletions.
48 changes: 33 additions & 15 deletions src/asm/x86/transform/forward.rs
@@ -462,26 +462,44 @@ unsafe fn fwd_txfm2d_avx2<T: Coefficient>(
// this size.
match row_class {
SizeClass1D::X8UP => {
for c in 0..txfm_size_col {
match T::Pixel::type_enum() {
PixelType::U8 => {
let lo = _mm256_castsi256_si128(row_coeffs[c].vec());
let hi = _mm256_extracti128_si256(row_coeffs[c].vec(), 1);
_mm_storeu_si128(
output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _,
_mm_packs_epi32(lo, hi),
);
}
PixelType::U16 => {
_mm256_storeu_si256(
output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _,
row_coeffs[c].vec(),
);
// Store output in at most 32x32 chunks. See native code for details.

// Output is grouped into 32x32 chunks so a stride of at most 32 is
// used for each chunk
let output_stride = txfm_size_row.min(32);

// Split the first 32 rows from the last 32 rows and offset by rg % 32
let output = &mut output[(rg & 31)
+ (rg >= 32) as usize * output_stride * txfm_size_col.min(32)..];

for cg in (0..txfm_size_col).step_by(32) {
// Offset by zero or half of output
let output = &mut output[txfm_size_row * cg..];

for c in 0..txfm_size_col.min(32) {
match T::Pixel::type_enum() {
PixelType::U8 => {
let vec = row_coeffs[c + cg].vec();
let lo = _mm256_castsi256_si128(vec);
let hi = _mm256_extracti128_si256(vec, 1);
_mm_storeu_si128(
output[c * output_stride..].as_mut_ptr() as *mut _,
_mm_packs_epi32(lo, hi),
);
}
PixelType::U16 => {
_mm256_storeu_si256(
output[c * output_stride..].as_mut_ptr() as *mut _,
row_coeffs[c + cg].vec(),
);
}
}
}
}
}
SizeClass1D::X4 => {
// Write out coefficients in normal order - it isn't possible to have
// more than 32 rows.
for c in 0..txfm_size_col {
match T::Pixel::type_enum() {
PixelType::U8 => {
85 changes: 30 additions & 55 deletions src/transform/forward.rs
@@ -153,8 +153,31 @@ pub mod native {
let row_coeffs = &mut buf[r * txfm_size_col..];
txfm_func_row(row_coeffs);
av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]);
for c in 0..txfm_size_col {
output[c * txfm_size_row + r] = T::cast_from(row_coeffs[c]);

// Store output in at most 32x32 chunks so that the first 32x32
// coefficients are stored first. When we don't have 64 rows, there is no
// change in order. With 64 rows, the chunks are in this order
// - First 32 rows and first 32 cols
// - Last 32 rows and first 32 cols
// - First 32 rows and last 32 cols
// - Last 32 rows and last 32 cols

// Output is grouped into 32x32 chunks so a stride of at most 32 is
// used for each chunk.
let output_stride = txfm_size_row.min(32);

// Split the first 32 rows from the last 32 rows
let output = &mut output
[(r >= 32) as usize * output_stride * txfm_size_col.min(32)..];

for cg in (0..txfm_size_col).step_by(32) {
// Split the first 32 cols from the last 32 cols
let output = &mut output[txfm_size_row * cg..];

for c in 0..txfm_size_col.min(32) {
output[c * output_stride + (r & 31)] =
T::cast_from(row_coeffs[c + cg]);
}
}
}
}
@@ -211,19 +234,7 @@ pub fn fht64x64<T: Coefficient>(
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 4096]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

//Block64x64::fwd_txfm2d(input, &mut tmp, stride, tx_type, bit_depth, cpu);
Block64x64::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for i in 0..2 {
for (row_out, row_in) in
output[2048 * i..].chunks_mut(32).zip(tmp[32 * i..].chunks(64)).take(64)
{
row_out.copy_from_slice(&row_in[..32]);
}
}
Block64x64::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht4x8<T: Coefficient>(
@@ -275,33 +286,15 @@ pub fn fht32x64<T: Coefficient>(
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 2048]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block32x64::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for i in 0..2 {
for (row_out, row_in) in
output[1024 * i..].chunks_mut(32).zip(tmp[32 * i..].chunks(64)).take(32)
{
row_out.copy_from_slice(&row_in[..32]);
}
}
Block32x64::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht64x32<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_type: TxType,
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 2048]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block64x32::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for (row_out, row_in) in output.chunks_mut(32).zip(tmp.chunks(32)).take(64) {
row_out.copy_from_slice(&row_in[..32]);
}
Block64x32::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht4x16<T: Coefficient>(
@@ -339,31 +332,13 @@ pub fn fht16x64<T: Coefficient>(
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 1024]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block16x64::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for i in 0..2 {
for (row_out, row_in) in
output[512 * i..].chunks_mut(32).zip(tmp[32 * i..].chunks(64)).take(16)
{
row_out.copy_from_slice(&row_in[..32]);
}
}
Block16x64::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht64x16<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_type: TxType,
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 1024]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block64x16::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for (row_out, row_in) in output.chunks_mut(16).zip(tmp.chunks(16)).take(64) {
row_out.copy_from_slice(&row_in[..16]);
}
Block64x16::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}
