Skip to content

Commit

Permalink
Add x86 assembly for cdef distortion
Browse files Browse the repository at this point in the history
About 7% faster at speed 2. It should be roughly the same for most
speed levels.

It would have been more, but some recent patches seem to have allowed
autovectorization to happen again.
  • Loading branch information
KyleSiefring committed Jan 6, 2022
1 parent 45cc617 commit 5da2a86
Show file tree
Hide file tree
Showing 8 changed files with 754 additions and 249 deletions.
2 changes: 2 additions & 0 deletions benches/rdo.rs
Expand Up @@ -16,6 +16,7 @@ fn init_plane_u8(width: usize, height: usize, seed: u8) -> Plane<u8> {
}

pub fn cdef_dist_wxh_8x8(c: &mut Criterion) {
let cpu = CpuFeatureLevel::default();
let src1 = init_plane_u8(8, 8, 1);
let src2 = init_plane_u8(8, 8, 2);

Expand All @@ -28,6 +29,7 @@ pub fn cdef_dist_wxh_8x8(c: &mut Criterion) {
8,
8,
|_, _| DistortionScale::default(),
cpu,
)
})
});
Expand Down
1 change: 1 addition & 0 deletions build.rs
Expand Up @@ -111,6 +111,7 @@ fn build_nasm_files() {
"src/x86/sad_sse2.asm",
"src/x86/sad_avx.asm",
"src/x86/satd.asm",
"src/x86/cdef_dist.asm",
"src/x86/sse.asm",
"src/x86/cdef_rav1e.asm",
"src/x86/cdef_sse.asm",
Expand Down
151 changes: 149 additions & 2 deletions src/activity.rs
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved
// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
Expand All @@ -8,7 +8,7 @@
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

use crate::frame::*;
use crate::rdo::{ssim_boost, DistortionScale};
use crate::rdo::DistortionScale;
use crate::tiling::*;
use crate::util::*;
use itertools::izip;
Expand Down Expand Up @@ -98,3 +98,150 @@ fn variance_8x8<T: Pixel>(src: &PlaneRegion<'_, T>) -> u32 {
// Use sums to calculate variance
sum_s2 - ((sum_s * sum_s + 32) >> 6)
}

/// Reciprocal square root stored in fixed point, scaled such that:
///   rsqrt = output.norm / (1 << output.shift)
struct RsqrtOutput {
  /// Normalized result in Q14; always within `16384..32768`.
  norm: u16,
  /// Right shift the caller applies after multiplying by `norm`.
  shift: u8,
}

/// Fixed point rsqrt for ssim_boost.
///
/// Approximates `1 / sqrt(x)` with a quadratic minimax polynomial. The input
/// is first normalized to the range [0.25, 1) in Q16, the polynomial is
/// evaluated on that normalized value, and the normalization is folded back
/// into the returned shift.
///
/// `x` must be non-zero.
fn ssim_boost_rsqrt(x: u64) -> RsqrtOutput {
  const INSHIFT: u8 = 16;
  const OUTSHIFT: u8 = 14;

  /// Fixed point multiply: the product of `a` and `b` shifted right by 15.
  #[inline(always)]
  fn mult16_16_q15(a: i32, b: i32) -> i32 {
    (a * b) >> 15
  }

  debug_assert!(x > 0, "ssim_boost_rsqrt requires a non-zero input");

  // Number of significant bits in x, i.e. floor(log2(x)) + 1. Computed with
  // `leading_zeros` rather than a trait-provided `ilog()`, because the
  // inherent `u64::ilog(base)` method takes precedence during method
  // resolution and turns a zero-argument call into a compile error.
  let bit_length = 64 - x.leading_zeros();
  let k = ((bit_length - 1) >> 1) as i16;

  // t is x scaled into the range [0.25, 1) in Q(INSHIFT), i.e. x * 2^(-s).
  // Shifting by log2(x) - log2(0.25 * (1 << INSHIFT)) guarantees the 0.25
  // lower bound; s is always even so that s / 2 below is exact.
  let s: i16 = 2 * k - (INSHIFT as i16 - 2);
  let t = (if s > 0 { x >> s } else { x << -s }) as u16;

  // The normalized rsqrt is (2^OUTSHIFT) / sqrt(t * 2^(-INSHIFT)) with
  // t = x * 2^(-s). That simplifies to
  // 2^(OUTSHIFT + INSHIFT / 2 + s / 2) / sqrt(x), so the caller needs to
  // shift right by OUTSHIFT + INSHIFT / 2 + s / 2.
  let rsqrt_shift = (OUTSHIFT as i16 + ((s + INSHIFT as i16) >> 1)) as u8;

  // Reciprocal sqrt approximation where the input is in the range [0.25, 1)
  // in Q16 and the output is in the range (1.0, 2.0] in Q14.

  // Range of n is [-16384, 32767] ([-0.5, 1) in Q15).
  let n: i32 = t as i32 - 32768;
  debug_assert!(n >= -16384);

  // Get a rough guess for the root.
  // The optimal minimax quadratic approximation (using relative error) is
  //   r = 1.437799046117536 + n * (-0.823394375837328 + n * 0.4096419668459485).
  // Coefficients here, and the final result r, are Q14.
  let rsqrt: i32 = 23557 + mult16_16_q15(n, -13490 + mult16_16_q15(n, 6711));

  debug_assert!((16384..32768).contains(&rsqrt));
  RsqrtOutput { norm: rsqrt as u16, shift: rsqrt_shift }
}

/// Computes the SSIM boost for a pair of variances as a `DistortionScale`.
///
/// The boost is applied to the raw value of the default distortion scale, so
/// the result is the boost factor itself expressed in that scale's fixed
/// point representation.
#[inline(always)]
pub fn ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> DistortionScale {
  let unit_scale = DistortionScale::default().0;
  let boosted = apply_ssim_boost(unit_scale, svar, dvar, bit_depth);
  DistortionScale(boosted)
}

/// Scales `input` by the SSIM boost derived from the two variances.
///
/// Evaluates, entirely in integer arithmetic:
///
/// ```text
///          C1         (svar + dvar + C2)
/// input * ---- * --------------------------
///          C2     sqrt(C1^2 + svar * dvar)
/// ```
///
/// `svar` and `dvar` are first shifted down by `2 * (bit_depth - 8)` bits so
/// that they are in the 8-bit range, which keeps the intermediate products
/// from overflowing.
#[inline(always)]
pub fn apply_ssim_boost(
  input: u32, svar: u32, dvar: u32, bit_depth: usize,
) -> u32 {
  // The two constants were tuned for CDEF, but can probably be better tuned
  // for use in general RDO
  const C1: u64 = 4033;
  const C2: u64 = 16384;
  const RATIO_SHIFT: u8 = 14;
  // C1 / C2 in Q(RATIO_SHIFT), rounded to nearest.
  const RATIO: u64 = (((C1 << (RATIO_SHIFT + 1)) / C2) + 1) >> 1;

  // Variances scale with the square of the sample range, hence the doubled
  // shift when bringing them down to the 8-bit range.
  let variance_shift = 2 * (bit_depth - 8);
  let src_var = u64::from(svar >> variance_shift);
  let dst_var = u64::from(dvar >> variance_shift);

  // 1 / sqrt(C1^2 + svar * dvar) as a normalized mantissa plus a shift.
  let rsqrt = ssim_boost_rsqrt(C1 * C1 + src_var * dst_var);

  // (C1 / C2) * (svar + dvar + C2) * rsqrt.norm, with the Q(RATIO_SHIFT)
  // scaling of RATIO removed again.
  let sum_term = RATIO * (src_var + dst_var + C2);
  let boost = (sum_term * u64::from(rsqrt.norm)) >> RATIO_SHIFT;

  ((u64::from(input) * boost) >> rsqrt.shift) as u32
}

#[cfg(test)]
mod ssim_boost_tests {
  use super::*;
  use rand::Rng;

  /// Test to make sure extreme values of ssim boost don't overflow.
  #[test]
  fn overflow_test() {
    // Test variance for 8x8 region with a bit depth of 12
    let max_pix_diff = (1 << 12) - 1;
    let max_pix_sse = max_pix_diff * max_pix_diff;
    let max_variance = max_pix_diff * 8 * 8 / 4;
    apply_ssim_boost(max_pix_sse * 8 * 8, max_variance, max_variance, 12);
  }

  /// Floating point reference version of ssim_boost
  fn reference_ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> f64 {
    let coeff_shift = bit_depth - 8;
    let var_scale = 1f64 / (1 << (2 * coeff_shift)) as f64;
    let svar = svar as f64 * var_scale;
    let dvar = dvar as f64 * var_scale;
    // These constants are from ssim boost and need to be updated if the
    // constants in ssim boost change.
    const C1: f64 = 4033f64;
    const C2: f64 = 16384f64;
    const RATIO: f64 = C1 / C2;

    RATIO * (svar + dvar + C2) / f64::sqrt(C1 * C1 + svar * dvar)
  }

  /// Test that ssim_boost has sufficient accuracy.
  #[test]
  fn accuracy_test() {
    let mut rng = rand::thread_rng();

    let mut max_relative_error = 0f64;
    // Bit depth under test. It drives both the range of sampled variances
    // and the depth passed to the two implementations, so they cannot drift
    // apart if this value is changed.
    let bd: usize = 12;

    // Test different log scale ranges for the variance.
    // Each scale is tested multiple times with randomized variances.
    for scale in 0..(bd + 3 * 2 - 2) {
      for _ in 0..40 {
        let svar = rng.gen_range(0..(1 << scale));
        let dvar = rng.gen_range(0..(1 << scale));

        let float = reference_ssim_boost(svar, dvar, bd);
        // Feed a Q23 "1.0" through the fixed point path and convert back.
        let fixed =
          apply_ssim_boost(1 << 23, svar, dvar, bd) as f64 / (1 << 23) as f64;

        // Compare the two versions
        max_relative_error =
          max_relative_error.max(f64::abs(1f64 - fixed / float));
      }
    }

    assert!(
      max_relative_error < 0.05,
      "SSIM boost error too high. Measured max relative error: {}.",
      max_relative_error
    );
  }
}

0 comments on commit 5da2a86

Please sign in to comment.