Eliminate buffer and copy for sifting codable coefficients
For large transforms we reorder the coefficients from our forward transform
so that the codable coefficients come first. Before this patch, we did this
by buffering the output of the forward transform and then writing it out in
the correct order. A sketch of the new index mapping is included below.

I'm a little surprised by the lack of impact in the microbenchmark
results. I expected the copy to be inefficient enough that removing it
would provide a performance boost. It seems that a copy within the L1
cache just doesn't have a noticeable impact.

In the real world, performance does seem to improve on AWCY by about
half a percent.

https://beta.arewecompressedyet.com/?job=master-redo-70dc401df%402020-01-19T14%3A02%3A47.242Z&job=embed_fwd_reorder%402020-01-19T14%3A25%3A07.898Z
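
A minimal sketch of the index mapping that the new store loops implement, assuming the column-major `rows` x `cols` output layout used by `fwd_txfm2d`; the helper name `sifted_index` is hypothetical and only for illustration:

/// Hypothetical helper: where the coefficient at (row `r`, column `c`) of a
/// column-major `rows` x `cols` forward transform is written once the output
/// is grouped into 32x32 chunks with the codable chunk first.
fn sifted_index(r: usize, c: usize, rows: usize, cols: usize) -> usize {
  // Each 32x32 chunk keeps the column-major layout with a stride of at most 32.
  let stride = rows.min(32);
  // The last 32 rows of a column group are stored after its first 32 rows.
  let row_group = (r >= 32) as usize * stride * cols.min(32);
  // Column groups of 32 are separated by a full column group of coefficients.
  let col_group = rows * (c & !31);
  row_group + col_group + (c & 31) * stride + (r & 31)
}

fn main() {
  // For a 64x64 transform the codable 32x32 block occupies indices 0..1024,
  // followed by the remaining chunks in the order listed in the patch:
  // last rows/first cols, first rows/last cols, last rows/last cols.
  assert_eq!(sifted_index(0, 0, 64, 64), 0);
  assert_eq!(sifted_index(31, 31, 64, 64), 1023);
  assert_eq!(sifted_index(32, 0, 64, 64), 1024);
  assert_eq!(sifted_index(0, 32, 64, 64), 2048);
  assert_eq!(sifted_index(32, 32, 64, 64), 3072);
}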
KyleSiefring committed Jan 21, 2020
1 parent 9110e82 commit 37509b2
Showing 2 changed files with 63 additions and 70 deletions.
48 changes: 33 additions & 15 deletions src/asm/x86/transform/forward.rs
@@ -462,26 +462,44 @@ unsafe fn fwd_txfm2d_avx2<T: Coefficient>(
// this size.
match row_class {
SizeClass1D::X8UP => {
for c in 0..txfm_size_col {
match T::Pixel::type_enum() {
PixelType::U8 => {
let lo = _mm256_castsi256_si128(row_coeffs[c].vec());
let hi = _mm256_extracti128_si256(row_coeffs[c].vec(), 1);
_mm_storeu_si128(
output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _,
_mm_packs_epi32(lo, hi),
);
}
PixelType::U16 => {
_mm256_storeu_si256(
output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _,
row_coeffs[c].vec(),
);
// Store output in at most 32x32 chunks. See native code for details.

// Output is grouped into 32x32 chunks so a stride of at most 32 is
// used for each chunk
let output_stride = txfm_size_row.min(32);

// Split the first 32 rows from the last 32 rows and offset by rg % 32
let output = &mut output[(rg & 31)
+ (rg >= 32) as usize * output_stride * txfm_size_col.min(32)..];

for cg in (0..txfm_size_col).step_by(32) {
// Offset by zero or half of output
let output = &mut output[txfm_size_row * cg..];

for c in 0..txfm_size_col.min(32) {
match T::Pixel::type_enum() {
PixelType::U8 => {
let vec = row_coeffs[c + cg].vec();
let lo = _mm256_castsi256_si128(vec);
let hi = _mm256_extracti128_si256(vec, 1);
_mm_storeu_si128(
output[c * output_stride..].as_mut_ptr() as *mut _,
_mm_packs_epi32(lo, hi),
);
}
PixelType::U16 => {
_mm256_storeu_si256(
output[c * output_stride..].as_mut_ptr() as *mut _,
row_coeffs[c + cg].vec(),
);
}
}
}
}
}
SizeClass1D::X4 => {
// Write out coefficients in normal order - it isn't possible to have
// more than 32 rows.
for c in 0..txfm_size_col {
match T::Pixel::type_enum() {
PixelType::U8 => {
85 changes: 30 additions & 55 deletions src/transform/forward.rs
@@ -153,8 +153,31 @@ pub mod native {
let row_coeffs = &mut buf[r * txfm_size_col..];
txfm_func_row(row_coeffs);
av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]);
for c in 0..txfm_size_col {
output[c * txfm_size_row + r] = T::cast_from(row_coeffs[c]);

// Store output in at most 32x32 chunks so that the first 32x32
// coefficients are stored first. When we don't have 64 rows, there is no
// change in order. With 64 rows, the chunks are in this order
// - First 32 rows and first 32 cols
// - Last 32 rows and first 32 cols
// - First 32 rows and last 32 cols
// - Last 32 rows and last 32 cols

// Output is grouped into 32x32 chunks so a stride of at most 32 is
// used for each chunk.
let output_stride = txfm_size_row.min(32);

// Split the first 32 rows from the last 32 rows
let output = &mut output
[(r >= 32) as usize * output_stride * txfm_size_col.min(32)..];

for cg in (0..txfm_size_col).step_by(32) {
// Split the first 32 cols from the last 32 cols
let output = &mut output[txfm_size_row * cg..];

for c in 0..txfm_size_col.min(32) {
output[c * output_stride + (r & 31)] =
T::cast_from(row_coeffs[c + cg]);
}
}
}
}
@@ -211,19 +234,7 @@ pub fn fht64x64<T: Coefficient>(
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 4096]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

//Block64x64::fwd_txfm2d(input, &mut tmp, stride, tx_type, bit_depth, cpu);
Block64x64::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for i in 0..2 {
for (row_out, row_in) in
output[2048 * i..].chunks_mut(32).zip(tmp[32 * i..].chunks(64)).take(64)
{
row_out.copy_from_slice(&row_in[..32]);
}
}
Block64x64::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht4x8<T: Coefficient>(
@@ -275,33 +286,15 @@ pub fn fht32x64<T: Coefficient>(
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 2048]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block32x64::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for i in 0..2 {
for (row_out, row_in) in
output[1024 * i..].chunks_mut(32).zip(tmp[32 * i..].chunks(64)).take(32)
{
row_out.copy_from_slice(&row_in[..32]);
}
}
Block32x64::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht64x32<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_type: TxType,
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 2048]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block64x32::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for (row_out, row_in) in output.chunks_mut(32).zip(tmp.chunks(32)).take(64) {
row_out.copy_from_slice(&row_in[..32]);
}
Block64x32::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht4x16<T: Coefficient>(
@@ -339,31 +332,13 @@ pub fn fht16x64<T: Coefficient>(
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 1024]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block16x64::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for i in 0..2 {
for (row_out, row_in) in
output[512 * i..].chunks_mut(32).zip(tmp[32 * i..].chunks(64)).take(16)
{
row_out.copy_from_slice(&row_in[..32]);
}
}
Block16x64::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}

pub fn fht64x16<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_type: TxType,
bit_depth: usize, cpu: CpuFeatureLevel,
) {
assert!(tx_type == TxType::DCT_DCT);
let mut aligned: AlignedArray<[T; 1024]> = AlignedArray::uninitialized();
let tmp = &mut aligned.array;

Block64x16::fwd_txfm2d_daala(input, tmp, stride, tx_type, bit_depth, cpu);

for (row_out, row_in) in output.chunks_mut(16).zip(tmp.chunks(16)).take(64) {
row_out.copy_from_slice(&row_in[..16]);
}
Block64x16::fwd_txfm2d_daala(input, output, stride, tx_type, bit_depth, cpu);
}
