Add x86 assembly for cdef distortion

About 7% faster at speed 2. The gain should be roughly the same for most speed levels. It would have been larger, but some recent patches seem to have allowed autovectorization to happen again.
xiph · Jan 6, 2022 · 5da2a86 · 5da2a86
1 parent 45cc617
commit 5da2a86
Show file tree

Hide file tree

Showing 8 changed files with 754 additions and 249 deletions.
diff --git a/benches/rdo.rs b/benches/rdo.rs
@@ -16,6 +16,7 @@ fn init_plane_u8(width: usize, height: usize, seed: u8) -> Plane<u8> {
 }
 
 pub fn cdef_dist_wxh_8x8(c: &mut Criterion) {
+  let cpu = CpuFeatureLevel::default();
   let src1 = init_plane_u8(8, 8, 1);
   let src2 = init_plane_u8(8, 8, 2);
 
@@ -28,6 +29,7 @@ pub fn cdef_dist_wxh_8x8(c: &mut Criterion) {
         8,
         8,
         |_, _| DistortionScale::default(),
+        cpu,
       )
     })
   });

diff --git a/build.rs b/build.rs
@@ -111,6 +111,7 @@ fn build_nasm_files() {
     "src/x86/sad_sse2.asm",
     "src/x86/sad_avx.asm",
     "src/x86/satd.asm",
+    "src/x86/cdef_dist.asm",
     "src/x86/sse.asm",
     "src/x86/cdef_rav1e.asm",
     "src/x86/cdef_sse.asm",

diff --git a/src/activity.rs b/src/activity.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved
+// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
 //
 // This source code is subject to the terms of the BSD 2 Clause License and
 // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -8,7 +8,7 @@
 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 
 use crate::frame::*;
-use crate::rdo::{ssim_boost, DistortionScale};
+use crate::rdo::DistortionScale;
 use crate::tiling::*;
 use crate::util::*;
 use itertools::izip;
@@ -98,3 +98,150 @@ fn variance_8x8<T: Pixel>(src: &PlaneRegion<'_, T>) -> u32 {
   // Use sums to calculate variance
   sum_s2 - ((sum_s * sum_s + 32) >> 6)
 }
+
+/// Reciprocal square root held in fixed point as a mantissa/shift pair:
+///   rsqrt = output.norm / (1 << output.shift)
+struct RsqrtOutput {
+  /// Mantissa of the result; `ssim_boost_rsqrt` produces it in Q14.
+  norm: u16,
+  /// Right shift to apply after multiplying by `norm`.
+  shift: u8,
+}
+
+/// Fixed point reciprocal square root used by the ssim boost.
+///
+/// The input is normalized into [0.25, 1) in Q16, a quadratic polynomial
+/// approximates 1/sqrt on that interval, and the shift needed to undo the
+/// normalization is returned next to the Q14 mantissa.
+fn ssim_boost_rsqrt(x: u64) -> RsqrtOutput {
+  const INSHIFT: u8 = 16;
+  const OUTSHIFT: u8 = 14;
+
+  /// Multiplies two fixed point values and drops 15 fractional bits.
+  #[inline(always)]
+  fn mult16_16_q15(a: i32, b: i32) -> i32 {
+    (a * b) >> 15
+  }
+
+  // Half of (ilog(x) - 1): keeps the normalization shift below even.
+  let half_log = ((x.ilog() - 1) >> 1) as i16;
+
+  // Shift by log2(x) - log2(0.25 * (1 << INSHIFT)) so that the normalized
+  // value x * 2^(-norm_shift) lands in [0.25, 1) in Q(INSHIFT).
+  let norm_shift: i16 = 2 * half_log - (INSHIFT as i16 - 2);
+  let scaled: u64 =
+    if norm_shift <= 0 { x << -norm_shift } else { x >> norm_shift };
+  let mantissa = scaled as u16;
+
+  // The polynomial yields (2^OUTSHIFT) / sqrt(mantissa * 2^-INSHIFT) with
+  // mantissa = x * 2^-norm_shift. That equals
+  // 2^(OUTSHIFT + INSHIFT/2 + norm_shift/2) / sqrt(x), so the caller has to
+  // shift right by OUTSHIFT + INSHIFT/2 + norm_shift/2.
+  let rsqrt_shift =
+    (OUTSHIFT as i16 + ((norm_shift + INSHIFT as i16) >> 1)) as u8;
+
+  // Re-centre the mantissa: n spans [-16384, 32767], i.e. [-0.5, 1) in Q15.
+  let n: i32 = i32::from(mantissa) - 32768;
+  debug_assert!(n >= -16384);
+
+  // Rough guess for the root from the minimax quadratic (relative error):
+  //   r = 1.437799046117536 + n*(-0.823394375837328 + n*0.4096419668459485)
+  // The coefficients and the result r are Q14.
+  let linear_term = -13490 + mult16_16_q15(n, 6711);
+  let rsqrt: i32 = 23557 + mult16_16_q15(n, linear_term);
+
+  debug_assert!((16384..32768).contains(&rsqrt));
+  RsqrtOutput { norm: rsqrt as u16, shift: rsqrt_shift }
+}
+
+/// Builds the ssim boost factor for the given source and distorted
+/// variances by boosting the default `DistortionScale` value.
+#[inline(always)]
+pub fn ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> DistortionScale {
+  let unit_scale = DistortionScale::default().0;
+  let boosted = apply_ssim_boost(unit_scale, svar, dvar, bit_depth);
+  DistortionScale(boosted)
+}
+
+/// Scales `input` by the ssim boost derived from the two variances.
+///
+/// Evaluates, in fixed point:
+///
+///          C1        (svar + dvar + C2)
+/// input * ---- * --------------------------
+///          C2     sqrt(C1^2 + svar * dvar)
+#[inline(always)]
+pub fn apply_ssim_boost(
+  input: u32, svar: u32, dvar: u32, bit_depth: usize,
+) -> u32 {
+  // The two constants were tuned for CDEF, but can probably be better tuned
+  //   for use in general RDO
+  const C1: u64 = 4033;
+  const C2: u64 = 16384;
+  const RATIO_SHIFT: u8 = 14;
+  // C1 / C2 in Q(RATIO_SHIFT), rounded to nearest.
+  const RATIO: u64 = (((C1 << (RATIO_SHIFT + 1)) / C2) + 1) >> 1;
+
+  // Bring both variances down to the 8-bit range to prevent overflows.
+  let var_shift = 2 * (bit_depth - 8);
+  let svar = u64::from(svar >> var_shift);
+  let dvar = u64::from(dvar >> var_shift);
+
+  let rsqrt = ssim_boost_rsqrt(C1 * C1 + svar * dvar);
+
+  // Ratio times numerator times the rsqrt mantissa, with the ratio's
+  // fractional bits removed before multiplying by the input.
+  let gain =
+    (RATIO * (svar + dvar + C2) * u64::from(rsqrt.norm)) >> RATIO_SHIFT;
+
+  ((u64::from(input) * gain) >> rsqrt.shift) as u32
+}
+
+#[cfg(test)]
+mod ssim_boost_tests {
+  use super::*;
+  use rand::Rng;
+
+  /// Extreme inputs to the ssim boost must not trigger an overflow.
+  #[test]
+  fn overflow_test() {
+    // Test variance for 8x8 region with a bit depth of 12
+    let max_pix_diff = (1 << 12) - 1;
+    let max_pix_sse = max_pix_diff * max_pix_diff;
+    let max_variance = max_pix_diff * 8 * 8 / 4;
+    let max_input = max_pix_sse * 8 * 8;
+    apply_ssim_boost(max_input, max_variance, max_variance, 12);
+  }
+
+  /// Floating point model of the ssim boost, used as ground truth.
+  fn reference_ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> f64 {
+    // These constants mirror the ones in the ssim boost and must be updated
+    //  whenever those change.
+    const C1: f64 = 4033f64;
+    const C2: f64 = 16384f64;
+    const RATIO: f64 = C1 / C2;
+
+    let coeff_shift = bit_depth - 8;
+    let var_scale = 1f64 / f64::from(1u32 << (2 * coeff_shift));
+    let svar = f64::from(svar) * var_scale;
+    let dvar = f64::from(dvar) * var_scale;
+
+    RATIO * (svar + dvar + C2) / (C1 * C1 + svar * dvar).sqrt()
+  }
+
+  /// The fixed point ssim boost must stay close to the float reference.
+  #[test]
+  fn accuracy_test() {
+    const BIT_DEPTH: usize = 12;
+    // Input scale fed to the fixed point version and divided back out.
+    const INPUT: u32 = 1 << 23;
+
+    let mut rng = rand::thread_rng();
+    let mut max_relative_error = 0f64;
+
+    // Walk the variance range on a log scale; every scale is sampled
+    // several times with randomized variances.
+    for scale in 0..(BIT_DEPTH + 3 * 2 - 2) {
+      let bound: u32 = 1 << scale;
+      for _ in 0..40 {
+        let svar = rng.gen_range(0..bound);
+        let dvar = rng.gen_range(0..bound);
+
+        let float = reference_ssim_boost(svar, dvar, BIT_DEPTH);
+        let fixed = f64::from(apply_ssim_boost(INPUT, svar, dvar, BIT_DEPTH))
+          / f64::from(INPUT);
+
+        // Track the worst deviation between the two versions.
+        let relative_error = (1f64 - fixed / float).abs();
+        max_relative_error = max_relative_error.max(relative_error);
+      }
+    }
+
+    assert!(
+      max_relative_error < 0.05,
+      "SSIM boost error too high. Measured max relative error: {}.",
+      max_relative_error
+    );
+  }
+}