Ease auto vectorization of tx distortion computation

The compiler was not autovectorizing this section of code. Switch the code from using repeat and chain to doing two separate iterations over the input coefficients. The first iteration processes the input coefficients together with the stored reconstructed coefficients, and the second processes the remaining input coefficients, whose reconstructed counterparts can be assumed to be zero. This isn't used by default, but Thomas is working on a project that uses it.
xiph · Jan 15, 2020 · ea608b4 · ea608b4
1 parent bdef061
commit ea608b4
Showing 1 changed file with 16 additions and 8 deletions.
diff --git a/src/encoder.rs b/src/encoder.rs
@@ -1279,18 +1279,26 @@ pub fn encode_tx_block<T: Pixel>(
 
   let tx_dist = if rdo_type.needs_tx_dist() {
     // Store tx-domain distortion of this block
+    // rcoeffs above 32 rows/cols aren't held in the array, because they are
+    // always 0. The first 32x32 is stored first in coeffs so we can iterate
+    // over coeffs and rcoeffs for the first 32 rows/cols. For the
+    // coefficients above 32 rows/cols, we iterate over the rest of coeffs
+    // with the assumption that rcoeff coefficients are zero.
     let mut raw_tx_dist = coeffs
       .iter()
-      .zip(
-        // rcoeffs above 32 rows/cols are always 0. The first 32x32 is stored
-        // first in coeffs, so just chain repeating zeroes to rcoeff.
-        rcoeffs.iter().chain(std::iter::repeat(&T::Coeff::cast_from(0))),
-      )
-      .map(|(a, b)| {
-        let c = i32::cast_from(*a) - i32::cast_from(*b);
+      .zip(rcoeffs.iter())
+      .map(|(&a, &b)| {
+        let c = i32::cast_from(a) - i32::cast_from(b);
         (c * c) as u64
       })
-      .sum::<u64>();
+      .sum::<u64>()
+      + coeffs[rcoeffs.len()..]
+        .iter()
+        .map(|&a| {
+          let c = i32::cast_from(a);
+          (c * c) as u64
+        })
+        .sum::<u64>();
 
     let tx_dist_scale_bits = 2 * (3 - get_log_tx_scale(tx_size));
     let tx_dist_scale_rounding_offset = 1 << (tx_dist_scale_bits - 1);