Reland^2 "[bigint] Karatsuba multiplication"

This is a reland of 81dd3f4, which was a reland of 59eff3b Original change's description: > [bigint] Karatsuba multiplication > > The Karatsuba algorithm is used for BigInts with 34 or more internal > digits, and thanks to better asymptotic complexity provides greater > speedups the bigger the inputs. > > Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2782283 > Reviewed-by: Michael Achenbach <machenbach@chromium.org> > Reviewed-by: Thibaud Michaud <thibaudm@chromium.org> > Cr-Commit-Position: refs/heads/master@{#74916} Bug: v8:11515 Change-Id: I08f7d59dfa39fb3b532684685afd9fa750e0e84e Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2933666 Reviewed-by: Clemens Backes <clemensb@chromium.org> Reviewed-by: Michael Achenbach <machenbach@chromium.org> Commit-Queue: Jakob Kummerow <jkummerow@chromium.org> Cr-Commit-Position: refs/heads/master@{#74969}
v8 · Jun 7, 2021 · df7f886 · df7f886
1 parent d1a0896
commit df7f886
Show file tree

Hide file tree

Showing 16 changed files with 716 additions and 6 deletions.
diff --git a/BUILD.gn b/BUILD.gn
@@ -4899,7 +4899,9 @@ v8_source_set("v8_bigint") {
     "src/bigint/bigint-internal.h",
     "src/bigint/bigint.h",
     "src/bigint/digit-arithmetic.h",
+    "src/bigint/mul-karatsuba.cc",
     "src/bigint/mul-schoolbook.cc",
+    "src/bigint/util.h",
     "src/bigint/vector-arithmetic.cc",
     "src/bigint/vector-arithmetic.h",
   ]

diff --git a/src/bigint/bigint-internal.cc b/src/bigint/bigint-internal.cc
@@ -30,7 +30,8 @@ void ProcessorImpl::Multiply(RWDigits Z, Digits X, Digits Y) {
   if (X.len() == 0 || Y.len() == 0) return Z.Clear();
   if (X.len() < Y.len()) std::swap(X, Y);
   if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]);
-  return MultiplySchoolbook(Z, X, Y);
+  if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y);
+  return MultiplyKaratsuba(Z, X, Y);
 }
 
 Status Processor::Multiply(RWDigits Z, Digits X, Digits Y) {

diff --git a/src/bigint/bigint-internal.h b/src/bigint/bigint-internal.h
@@ -10,6 +10,8 @@
 namespace v8 {
 namespace bigint {
 
+constexpr int kKaratsubaThreshold = 34;
+
 class ProcessorImpl : public Processor {
  public:
   explicit ProcessorImpl(Platform* platform);
@@ -21,6 +23,11 @@ class ProcessorImpl : public Processor {
   void MultiplySingle(RWDigits Z, Digits X, digit_t y);
   void MultiplySchoolbook(RWDigits Z, Digits X, Digits Y);
 
+  void MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y);
+  void KaratsubaStart(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int k);
+  void KaratsubaChunk(RWDigits Z, Digits X, Digits Y, RWDigits scratch);
+  void KaratsubaMain(RWDigits Z, Digits X, Digits Y, RWDigits scratch, int n);
+
  private:
   // Each unit is supposed to represent approximately one CPU {mul} instruction.
   // Doesn't need to be accurate; we just want to make sure to check for
@@ -59,6 +66,28 @@ class ProcessorImpl : public Processor {
 #define DCHECK(cond) (void(0))
 #endif
 
+// RAII memory for a Digits array.
+class Storage {
+ public:
+  explicit Storage(int count) : ptr_(new digit_t[count]) {}
+
+  digit_t* get() { return ptr_.get(); }
+
+ private:
+  std::unique_ptr<digit_t[]> ptr_;
+};
+
+// A writable Digits array with attached storage.
+class ScratchDigits : public RWDigits {
+ public:
+  explicit ScratchDigits(int len) : RWDigits(nullptr, len), storage_(len) {
+    digits_ = storage_.get();
+  }
+
+ private:
+  Storage storage_;
+};
+
 }  // namespace bigint
 }  // namespace v8
 

diff --git a/src/bigint/digit-arithmetic.h b/src/bigint/digit-arithmetic.h
@@ -45,6 +45,36 @@ inline digit_t digit_add3(digit_t a, digit_t b, digit_t c, digit_t* carry) {
 #endif
 }
 
+// {borrow} will be set to 0 or 1.
+inline digit_t digit_sub(digit_t a, digit_t b, digit_t* borrow) {
+#if HAVE_TWODIGIT_T
+  twodigit_t result = twodigit_t{a} - b;
+  *borrow = (result >> kDigitBits) & 1;
+  return static_cast<digit_t>(result);
+#else
+  digit_t result = a - b;
+  *borrow = (result > a) ? 1 : 0;
+  return result;
+#endif
+}
+
+// {borrow_out} will be set to 0 or 1.
+inline digit_t digit_sub2(digit_t a, digit_t b, digit_t borrow_in,
+                          digit_t* borrow_out) {
+#if HAVE_TWODIGIT_T
+  twodigit_t subtrahend = twodigit_t{b} + borrow_in;
+  twodigit_t result = twodigit_t{a} - subtrahend;
+  *borrow_out = (result >> kDigitBits) & 1;
+  return static_cast<digit_t>(result);
+#else
+  digit_t result = a - b;
+  *borrow_out = (result > a) ? 1 : 0;
+  if (result < borrow_in) *borrow_out += 1;
+  result -= borrow_in;
+  return result;
+#endif
+}
+
 // Returns the low half of the result. High half is in {high}.
 inline digit_t digit_mul(digit_t a, digit_t b, digit_t* high) {
 #if HAVE_TWODIGIT_T

diff --git a/src/bigint/mul-karatsuba.cc b/src/bigint/mul-karatsuba.cc
@@ -0,0 +1,189 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Karatsuba multiplication. This is loosely based on Go's implementation
+// found at https://golang.org/src/math/big/nat.go, licensed as follows:
+//
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file [1].
+//
+// [1] https://golang.org/LICENSE
+
+#include <algorithm>
+#include <utility>
+
+#include "src/bigint/bigint-internal.h"
+#include "src/bigint/digit-arithmetic.h"
+#include "src/bigint/util.h"
+#include "src/bigint/vector-arithmetic.h"
+
+namespace v8 {
+namespace bigint {
+
+namespace {
+
+// The Karatsuba algorithm sometimes finishes more quickly when the
+// input length is rounded up a bit. This method encodes some heuristics
+// to accomplish this. The details have been determined experimentally.
+int RoundUpLen(int len) {
+  if (len <= 36) return RoundUp(len, 2);
+  // Keep the 4 or 5 most significant non-zero bits.
+  int shift = BitLength(len) - 5;
+  if ((len >> shift) >= 0x18) {
+    shift++;
+  }
+  // Round up, unless we're only just above the threshold. This smoothes
+  // the steps by which time goes up as input size increases.
+  int additive = ((1 << shift) - 1);
+  if (shift >= 2 && (len & additive) < (1 << (shift - 2))) {
+    return len;
+  }
+  return ((len + additive) >> shift) << shift;
+}
+
+// This method makes the final decision how much to bump up the input size.
+int KaratsubaLength(int n) {
+  n = RoundUpLen(n);
+  int i = 0;
+  while (n > kKaratsubaThreshold) {
+    n >>= 1;
+    i++;
+  }
+  return n << i;
+}
+
+// Performs the specific subtraction required by {KaratsubaMain} below.
+void KaratsubaSubtractionHelper(RWDigits result, Digits X, Digits Y,
+                                int* sign) {
+  X.Normalize();
+  Y.Normalize();
+  digit_t borrow = 0;
+  int i = 0;
+  if (!GreaterThanOrEqual(X, Y)) {
+    *sign = -(*sign);
+    std::swap(X, Y);
+  }
+  for (; i < Y.len(); i++) {
+    result[i] = digit_sub2(X[i], Y[i], borrow, &borrow);
+  }
+  for (; i < X.len(); i++) {
+    result[i] = digit_sub(X[i], borrow, &borrow);
+  }
+  DCHECK(borrow == 0);  // NOLINT(readability/check)
+  for (; i < result.len(); i++) result[i] = 0;
+}
+
+}  // namespace
+
+void ProcessorImpl::MultiplyKaratsuba(RWDigits Z, Digits X, Digits Y) {
+  DCHECK(X.len() >= Y.len());
+  DCHECK(Y.len() >= kKaratsubaThreshold);
+  DCHECK(Z.len() >= X.len() + Y.len());
+  int k = KaratsubaLength(Y.len());
+  int scratch_len = 4 * k;
+  ScratchDigits scratch(scratch_len);
+  KaratsubaStart(Z, X, Y, scratch, k);
+}
+
+// Entry point for Karatsuba-based multiplication, takes care of inputs
+// with unequal lengths by chopping the larger into chunks.
+void ProcessorImpl::KaratsubaStart(RWDigits Z, Digits X, Digits Y,
+                                   RWDigits scratch, int k) {
+  KaratsubaMain(Z, X, Y, scratch, k);
+  if (should_terminate()) return;
+  for (int i = 2 * k; i < Z.len(); i++) Z[i] = 0;
+  if (k < Y.len() || X.len() != Y.len()) {
+    ScratchDigits T(2 * k);
+    // Add X0 * Y1 * b.
+    Digits X0(X, 0, k);
+    Digits Y1 = Y + std::min(k, Y.len());
+    if (Y1.len() > 0) {
+      KaratsubaChunk(T, X0, Y1, scratch);
+      if (should_terminate()) return;
+      AddAt(Z + k, T);
+    }
+
+    // Add Xi * Y0 << i and Xi * Y1 * b << (i + k).
+    Digits Y0(Y, 0, k);
+    for (int i = k; i < X.len(); i += k) {
+      Digits Xi(X, i, k);
+      KaratsubaChunk(T, Xi, Y0, scratch);
+      if (should_terminate()) return;
+      AddAt(Z + i, T);
+      if (Y1.len() > 0) {
+        KaratsubaChunk(T, Xi, Y1, scratch);
+        if (should_terminate()) return;
+        AddAt(Z + (i + k), T);
+      }
+    }
+  }
+}
+
+// Entry point for chunk-wise multiplications, selects an appropriate
+// algorithm for the inputs based on their sizes.
+void ProcessorImpl::KaratsubaChunk(RWDigits Z, Digits X, Digits Y,
+                                   RWDigits scratch) {
+  X.Normalize();
+  Y.Normalize();
+  if (X.len() == 0 || Y.len() == 0) return Z.Clear();
+  if (X.len() < Y.len()) std::swap(X, Y);
+  if (Y.len() == 1) return MultiplySingle(Z, X, Y[0]);
+  if (Y.len() < kKaratsubaThreshold) return MultiplySchoolbook(Z, X, Y);
+  int k = KaratsubaLength(Y.len());
+  DCHECK(scratch.len() >= 4 * k);
+  return KaratsubaStart(Z, X, Y, scratch, k);
+}
+
+// The main recursive Karatsuba method.
+void ProcessorImpl::KaratsubaMain(RWDigits Z, Digits X, Digits Y,
+                                  RWDigits scratch, int n) {
+  if (n < kKaratsubaThreshold) {
+    X.Normalize();
+    Y.Normalize();
+    if (X.len() >= Y.len()) {
+      return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), X, Y);
+    } else {
+      return MultiplySchoolbook(RWDigits(Z, 0, 2 * n), Y, X);
+    }
+  }
+  DCHECK(scratch.len() >= 4 * n);
+  DCHECK((n & 1) == 0);  // NOLINT(readability/check)
+  int n2 = n >> 1;
+  Digits X0(X, 0, n2);
+  Digits X1(X, n2, n2);
+  Digits Y0(Y, 0, n2);
+  Digits Y1(Y, n2, n2);
+  RWDigits scratch_for_recursion(scratch, 2 * n, 2 * n);
+  RWDigits P0(scratch, 0, n);
+  KaratsubaMain(P0, X0, Y0, scratch_for_recursion, n2);
+  if (should_terminate()) return;
+  for (int i = 0; i < n; i++) Z[i] = P0[i];
+  RWDigits P2(scratch, n, n);
+  KaratsubaMain(P2, X1, Y1, scratch_for_recursion, n2);
+  if (should_terminate()) return;
+  RWDigits Z1 = Z + n;
+  int end = std::min(Z1.len(), P2.len());
+  for (int i = 0; i < end; i++) Z1[i] = P2[i];
+  for (int i = end; i < n; i++) {
+    DCHECK(P2[i] == 0);  // NOLINT(readability/check)
+  }
+  AddAt(Z + n2, P0);
+  AddAt(Z + n2, P2);
+  RWDigits X_diff(scratch, 0, n2);
+  RWDigits Y_diff(scratch, n2, n2);
+  int sign = 1;
+  KaratsubaSubtractionHelper(X_diff, X1, X0, &sign);
+  KaratsubaSubtractionHelper(Y_diff, Y0, Y1, &sign);
+  RWDigits P1(scratch, n, n);
+  KaratsubaMain(P1, X_diff, Y_diff, scratch_for_recursion, n2);
+  if (sign > 0) {
+    AddAt(Z + n2, P1);
+  } else {
+    SubAt(Z + n2, P1);
+  }
+}
+
+}  // namespace bigint
+}  // namespace v8
diff --git a/src/bigint/mul-schoolbook.cc b/src/bigint/mul-schoolbook.cc
@@ -71,7 +71,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) {
     next_carry = 0;
     BODY(0, i);
     AddWorkEstimate(i);
-    if (should_terminate()) return;
   }
   // Last part: i exceeds Y now, we have to be careful about bounds.
   int loop_end = X.len() + Y.len() - 2;
@@ -85,7 +84,6 @@ void ProcessorImpl::MultiplySchoolbook(RWDigits Z, Digits X, Digits Y) {
     next_carry = 0;
     BODY(min_x_index, max_x_index);
     AddWorkEstimate(max_x_index - min_x_index);
-    if (should_terminate()) return;
   }
   // Write the last digit, and zero out any extra space in Z.
   Z[i++] = digit_add2(next, carry, &carry);

diff --git a/src/bigint/util.h b/src/bigint/util.h
@@ -0,0 +1,60 @@
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// "Generic" helper functions (not specific to BigInts).
+
+#include <stdint.h>
+
+#include <type_traits>
+
+#ifdef _MSC_VER
+#include <intrin.h>  // For _BitScanReverse.
+#endif
+
+#ifndef V8_BIGINT_UTIL_H_
+#define V8_BIGINT_UTIL_H_
+
+// Integer division, rounding up.
+#define DIV_CEIL(x, y) (((x)-1) / (y) + 1)
+
+namespace v8 {
+namespace bigint {
+
+// Rounds up x to a multiple of y.
+inline constexpr int RoundUp(int x, int y) { return (x + y - 1) & -y; }
+
+// Different environments disagree on how 64-bit uintptr_t and uint64_t are
+// defined, so we have to use templates to be generic.
+template <typename T, typename = typename std::enable_if<
+                          std::is_unsigned<T>::value && sizeof(T) == 8>::type>
+constexpr int CountLeadingZeros(T value) {
+#if __GNUC__ || __clang__
+  return value == 0 ? 64 : __builtin_clzll(value);
+#elif _MSC_VER
+  unsigned long index = 0;  // NOLINT(runtime/int). MSVC insists.
+  return _BitScanReverse64(&index, value) ? 63 - index : 64;
+#else
+#error Unsupported compiler.
+#endif
+}
+
+constexpr int CountLeadingZeros(uint32_t value) {
+#if __GNUC__ || __clang__
+  return value == 0 ? 32 : __builtin_clz(value);
+#elif _MSC_VER
+  unsigned long index = 0;  // NOLINT(runtime/int). MSVC insists.
+  return _BitScanReverse(&index, value) ? 31 - index : 32;
+#else
+#error Unsupported compiler.
+#endif
+}
+
+inline constexpr int BitLength(int n) {
+  return 32 - CountLeadingZeros(static_cast<uint32_t>(n));
+}
+
+}  // namespace bigint
+}  // namespace v8
+
+#endif  // V8_BIGINT_UTIL_H_