use NAF for mul_add()

tomato42 · tomato42 · commit 015d158b412f · 2020-11-25T23:59:18.000+01:00
while we were already using a more clever algorithm, adding two
points at a time when possible, it's possible to do it slightly
faster by performing something similar but with 2-ary NAF

this speeds up single-shot verify by about 5% with python int() and by
about 4% with gmpy's mpz()
diff --git a/README.md b/README.md
@@ -72,32 +72,35 @@ pip install ecdsa[gmpy]
 
 The following table shows how long this library takes to generate keypairs
 (`keygen`), to sign data (`sign`), to verify those signatures (`verify`),
-and to derive a shared secret (`ecdh`).
+to derive a shared secret (`ecdh`), and
+to verify the signatures with no key-specific precomputation (`no PC verify`).
 All those values are in seconds.
 For convenience, the inverses of those values are also provided:
 how many keys per second can be generated (`keygen/s`), how many signatures
 can be made per second (`sign/s`), how many signatures can be verified
-per second (`verify/s`), and how many shared secrets can be derived per second
-(`ecdh/s`). The size in bytes of a raw signature (generally the smallest
+per second (`verify/s`), how many shared secrets can be derived per second
+(`ecdh/s`), and how many signatures with no key-specific
+precomputation can be verified per second (`no PC verify/s`). The size in bytes
+of a raw signature (generally the smallest
 way a signature can be encoded) is also provided in the `siglen` column.
 Use `tox -e speed` to generate this table on your own computer.
 On an Intel Core i7 4790K @ 4.0GHz I'm getting the following performance:
 
 ```
-                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s
-        NIST192p:     48   0.00035s   2893.02   0.00038s   2620.53   0.00069s   1458.92
-        NIST224p:     56   0.00043s   2307.11   0.00048s   2092.00   0.00088s   1131.33
-        NIST256p:     64   0.00056s   1793.70   0.00061s   1639.87   0.00113s    883.79
-        NIST384p:     96   0.00116s    864.33   0.00124s    806.29   0.00233s    429.87
-        NIST521p:    132   0.00221s    452.16   0.00234s    427.31   0.00460s    217.19
-       SECP256k1:     64   0.00056s   1772.65   0.00061s   1628.73   0.00110s    912.13
- BRAINPOOLP160r1:     40   0.00026s   3801.86   0.00029s   3401.11   0.00052s   1930.47
- BRAINPOOLP192r1:     48   0.00034s   2925.73   0.00038s   2634.34   0.00070s   1438.06
- BRAINPOOLP224r1:     56   0.00044s   2287.98   0.00048s   2083.87   0.00088s   1137.52
- BRAINPOOLP256r1:     64   0.00056s   1774.11   0.00061s   1628.25   0.00112s    890.71
- BRAINPOOLP320r1:     80   0.00081s   1238.18   0.00087s   1146.71   0.00151s    661.95
- BRAINPOOLP384r1:     96   0.00117s    855.47   0.00124s    804.56   0.00241s    414.83
- BRAINPOOLP512r1:    128   0.00223s    447.99   0.00234s    427.49   0.00437s    229.09
+                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s  no PC verify  no PC verify/s
+        NIST192p:     48   0.00033s   2991.13   0.00036s   2740.86   0.00067s   1502.11       0.00136s         737.54
+        NIST224p:     56   0.00042s   2360.67   0.00046s   2190.16   0.00083s   1201.83       0.00170s         587.79
+        NIST256p:     64   0.00053s   1872.02   0.00057s   1743.08   0.00103s    968.53       0.00219s         457.36
+        NIST384p:     96   0.00110s    907.45   0.00116s    861.63   0.00218s    459.38       0.00445s         224.92
+        NIST521p:    132   0.00214s    467.72   0.00223s    448.70   0.00430s    232.76       0.00888s         112.66
+       SECP256k1:     64   0.00054s   1841.11   0.00058s   1722.33   0.00111s    903.07       0.00216s         464.01
+ BRAINPOOLP160r1:     40   0.00026s   3780.81   0.00029s   3422.67   0.00054s   1863.09       0.00109s         914.93
+ BRAINPOOLP192r1:     48   0.00034s   2942.79   0.00037s   2710.56   0.00070s   1435.59       0.00138s         724.79
+ BRAINPOOLP224r1:     56   0.00044s   2278.35   0.00047s   2145.32   0.00090s   1115.34       0.00182s         549.72
+ BRAINPOOLP256r1:     64   0.00055s   1832.95   0.00059s   1704.50   0.00110s    911.02       0.00234s         427.22
+ BRAINPOOLP320r1:     80   0.00077s   1305.78   0.00082s   1222.47   0.00156s    640.27       0.00321s         311.56
+ BRAINPOOLP384r1:     96   0.00112s    893.07   0.00118s    849.32   0.00228s    438.75       0.00478s         209.35
+ BRAINPOOLP512r1:    128   0.00213s    470.08   0.00221s    451.98   0.00419s    238.70       0.00940s         106.44
 
                        ecdh     ecdh/s
         NIST192p:   0.00110s    910.70
@@ -118,20 +121,20 @@ On an Intel Core i7 4790K @ 4.0GHz I'm getting the following performance:
 To test performance with `gmpy2` loaded, use `tox -e speedgmpy2`.
 On the same machine I'm getting the following performance with `gmpy2`:
 ```
-                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s
-        NIST192p:     48   0.00017s   5945.50   0.00018s   5544.66   0.00033s   3002.54
-        NIST224p:     56   0.00021s   4742.14   0.00022s   4463.52   0.00044s   2248.59
-        NIST256p:     64   0.00024s   4155.73   0.00025s   3994.28   0.00047s   2105.34
-        NIST384p:     96   0.00041s   2415.06   0.00043s   2316.41   0.00085s   1177.18
-        NIST521p:    132   0.00072s   1391.14   0.00074s   1359.63   0.00140s    716.31
-       SECP256k1:     64   0.00024s   4216.50   0.00025s   3994.52   0.00047s   2120.57
- BRAINPOOLP160r1:     40   0.00014s   7038.99   0.00015s   6501.55   0.00029s   3397.79
- BRAINPOOLP192r1:     48   0.00017s   5983.18   0.00018s   5626.08   0.00035s   2843.62
- BRAINPOOLP224r1:     56   0.00021s   4727.54   0.00022s   4464.86   0.00043s   2326.84
- BRAINPOOLP256r1:     64   0.00024s   4221.00   0.00025s   4010.26   0.00049s   2046.40
- BRAINPOOLP320r1:     80   0.00032s   3142.14   0.00033s   3009.15   0.00061s   1652.88
- BRAINPOOLP384r1:     96   0.00041s   2415.98   0.00043s   2340.35   0.00083s   1198.77
- BRAINPOOLP512r1:    128   0.00064s   1567.27   0.00066s   1526.33   0.00127s    788.51
+                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s  no PC verify  no PC verify/s
+        NIST192p:     48   0.00017s   5878.39   0.00018s   5670.66   0.00034s   2971.38       0.00067s        1484.97
+        NIST224p:     56   0.00021s   4705.08   0.00022s   4587.19   0.00040s   2499.96       0.00088s        1140.97
+        NIST256p:     64   0.00024s   4252.73   0.00024s   4108.48   0.00049s   2038.80       0.00096s        1043.03
+        NIST384p:     96   0.00041s   2455.84   0.00042s   2406.31   0.00079s   1260.03       0.00172s         580.61
+        NIST521p:    132   0.00070s   1419.16   0.00072s   1392.50   0.00139s    719.35       0.00307s         325.96
+       SECP256k1:     64   0.00024s   4228.87   0.00024s   4086.32   0.00047s   2124.86       0.00096s        1037.53
+ BRAINPOOLP160r1:     40   0.00014s   6932.12   0.00015s   6678.36   0.00030s   3387.90       0.00056s        1784.02
+ BRAINPOOLP192r1:     48   0.00017s   5886.05   0.00017s   5720.63   0.00034s   2941.22       0.00067s        1490.87
+ BRAINPOOLP224r1:     56   0.00021s   4748.89   0.00022s   4638.15   0.00041s   2460.86       0.00089s        1128.91
+ BRAINPOOLP256r1:     64   0.00024s   4248.00   0.00024s   4135.19   0.00045s   2209.69       0.00099s        1006.45
+ BRAINPOOLP320r1:     80   0.00032s   3096.85   0.00033s   3012.43   0.00065s   1547.07       0.00137s         728.60
+ BRAINPOOLP384r1:     96   0.00041s   2436.12   0.00042s   2396.23   0.00083s   1211.13       0.00176s         568.39
+ BRAINPOOLP512r1:    128   0.00063s   1580.09   0.00064s   1562.78   0.00129s    778.09       0.00279s         358.12
 
                        ecdh     ecdh/s
         NIST192p:   0.00051s   1960.26
diff --git a/src/ecdsa/ellipticcurve.py b/src/ecdsa/ellipticcurve.py
@@ -39,7 +39,7 @@
     from gmpy2 import mpz
 
     GMPY = True
-except ImportError:
+except ImportError:  # pragma: no branch
     try:
         from gmpy import mpz
 
@@ -57,7 +57,7 @@
 class CurveFp(object):
     """Elliptic Curve over the field of integers modulo a prime."""
 
-    if GMPY:
+    if GMPY:  # pragma: no branch
 
         def __init__(self, p, a, b, h=None):
             """
@@ -75,7 +75,7 @@ def __init__(self, p, a, b, h=None):
             # gmpy with it
             self.__h = h
 
-    else:
+    else:  # pragma: no branch
 
         def __init__(self, p, a, b, h=None):
             """
@@ -164,12 +164,12 @@ def __init__(self, curve, x, y, z, order=None, generator=False):
         # since it's generally better (faster) to use scaled points vs unscaled
         # ones, use writer-biased RWLock for locking:
         self._update_lock = RWLock()
-        if GMPY:
+        if GMPY:  # pragma: no branch
             self.__x = mpz(x)
             self.__y = mpz(y)
             self.__z = mpz(z)
             self.__order = order and mpz(order)
-        else:
+        else:  # pragma: no branch
             self.__x = x
             self.__y = y
             self.__z = z
@@ -359,7 +359,8 @@ def from_affine(point, generator=False):
             point.curve(), point.x(), point.y(), 1, point.order(), generator
         )
 
-    # plese note that all the methods that use the equations from hyperelliptic
+    # please note that all the methods that use the equations from the
+    # hyperelliptic.org Explicit-Formulas Database
     # are formatted in a way to maximise performance.
     # Things that make code faster: multiplying instead of taking to the power
     # (`xx = x * x; xxxx = xx * xx % p` is faster than `xxxx = x**4 % p` and
@@ -389,7 +390,7 @@ def _double(self, X1, Y1, Z1, p, a):
         """Add a point to itself, arbitrary z."""
         if Z1 == 1:
             return self._double_with_z_1(X1, Y1, p, a)
-        if not Z1:
+        if not Y1 or not Z1:
             return 0, 0, 1
         # after:
         # http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
@@ -579,11 +580,11 @@ def _naf(mult):
             if mult % 2:
                 nd = mult % 4
                 if nd >= 2:
-                    nd = nd - 4
-                ret += [nd]
+                    nd -= 4
+                ret.append(nd)
                 mult -= nd
             else:
-                ret += [0]
+                ret.append(0)
             mult //= 2
         return ret
 
@@ -621,15 +622,6 @@ def __mul__(self, other):
 
         return PointJacobi(self.__curve, X3, Y3, Z3, self.__order)
 
-    @staticmethod
-    def _leftmost_bit(x):
-        """Return integer with the same magnitude as x but only one bit set"""
-        assert x > 0
-        result = 1
-        while result <= x:
-            result = 2 * result
-        return result // 2
-
     def mul_add(self, self_mul, other, other_mul):
         """
         Do two multiplications at the same time, add results.
@@ -643,7 +635,7 @@ def mul_add(self, self_mul, other, other_mul):
         if not isinstance(other, PointJacobi):
             other = PointJacobi.from_affine(other)
         # when the points have precomputed answers, then multiplying them alone
-        # is faster (as it uses NAF)
+        # is faster (as it uses NAF and no point doublings)
         self._maybe_precompute()
         other._maybe_precompute()
         if self.__precompute and other.__precompute:
@@ -653,32 +645,76 @@ def mul_add(self, self_mul, other, other_mul):
             self_mul = self_mul % self.__order
             other_mul = other_mul % self.__order
 
-        i = self._leftmost_bit(max(self_mul, other_mul)) * 2
+        # (X3, Y3, Z3) is the accumulator
         X3, Y3, Z3 = 0, 0, 1
         p, a = self.__curve.p(), self.__curve.a()
-        self = self.scale()
-        # after scaling, point is immutable, no need for locking
-        X1, Y1 = self.__x, self.__y
-        other = other.scale()
-        X2, Y2 = other.__x, other.__y
-        both = self + other
-        if both is INFINITY:
-            X4, Y4 = 0, 0
-        else:
-            both.scale()
-            X4, Y4 = both.__x, both.__y
+
+        # as we have 6 unique points to work with, we can't scale all of them,
+        # but do scale the ones that are used most often
+        # (post scale() points are immutable so no need for locking)
+        self.scale()
+        X1, Y1, Z1 = self.__x, self.__y, self.__z
+        other.scale()
+        X2, Y2, Z2 = other.__x, other.__y, other.__z
+
         _double = self._double
         _add = self._add
-        while i > 1:
+
+        # with NAF we have 3 options: no add, subtract, add
+        # so with 2 points, we have 9 combinations:
+        # 0, -A, +A, -B, -A-B, +A-B, +B, -A+B, +A+B
+        # so we need 4 combined points:
+        mAmB_X, mAmB_Y, mAmB_Z = _add(X1, -Y1, Z1, X2, -Y2, Z2, p)
+        pAmB_X, pAmB_Y, pAmB_Z = _add(X1, Y1, Z1, X2, -Y2, Z2, p)
+        mApB_X, mApB_Y, mApB_Z = _add(X1, -Y1, Z1, X2, Y2, Z2, p)
+        pApB_X, pApB_Y, pApB_Z = _add(X1, Y1, Z1, X2, Y2, Z2, p)
+        # when the self and other sum to infinity, we need to add them
+        # one by one to get correct result but as that's very unlikely to
+        # happen in regular operation, we don't need to optimise this case
+        if not pApB_Y or not pApB_Z:
+            return self * self_mul + other * other_mul
+
+        # gmp object creation has cumulatively higher overhead than the
+        # speedup we get from calculating the NAF using gmp so ensure use
+        # of int()
+        self_naf = list(reversed(self._naf(int(self_mul))))
+        other_naf = list(reversed(self._naf(int(other_mul))))
+        # ensure that the lists are the same length (zip() will truncate
+        # longer one otherwise)
+        if len(self_naf) < len(other_naf):
+            self_naf = [0] * (len(other_naf)-len(self_naf)) + self_naf
+        elif len(self_naf) > len(other_naf):
+            other_naf = [0] * (len(self_naf)-len(other_naf)) + other_naf
+
+        for A, B in zip(self_naf, other_naf):
             X3, Y3, Z3 = _double(X3, Y3, Z3, p, a)
-            i = i // 2
 
-            if self_mul & i and other_mul & i:
-                X3, Y3, Z3 = _add(X3, Y3, Z3, X4, Y4, 1, p)
-            elif self_mul & i:
-                X3, Y3, Z3 = _add(X3, Y3, Z3, X1, Y1, 1, p)
-            elif other_mul & i:
-                X3, Y3, Z3 = _add(X3, Y3, Z3, X2, Y2, 1, p)
+            # conditions ordered from most to least likely
+            if A == 0:
+                if B == 0:
+                    pass
+                elif B < 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X2, -Y2, Z2, p)
+                else:
+                    assert B > 0
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X2, Y2, Z2, p)
+            elif A < 0:
+                if B == 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X1, -Y1, Z1, p)
+                elif B < 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, mAmB_X, mAmB_Y, mAmB_Z, p)
+                else:
+                    assert B > 0
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, mApB_X, mApB_Y, mApB_Z, p)
+            else:
+                assert A > 0
+                if B == 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X1, Y1, Z1, p)
+                elif B < 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, pAmB_X, pAmB_Y, pAmB_Z, p)
+                else:
+                    assert B > 0
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, pApB_X, pApB_Y, pApB_Z, p)
 
         if not Y3 or not Z3:
             return INFINITY
diff --git a/src/ecdsa/test_jacobi.py b/src/ecdsa/test_jacobi.py
@@ -367,6 +367,11 @@ def test_add_point_3_times(self):
 
         self.assertEqual(j_g * 3, j_g + j_g + j_g)
 
+    def test_mul_without_order(self):
+        j_g = PointJacobi(curve_256, generator_256.x(), generator_256.y(), 1)
+
+        self.assertEqual(j_g * generator_256.order(), INFINITY)
+
     def test_mul_add_inf(self):
         j_g = PointJacobi.from_affine(generator_256)
 
@@ -405,6 +410,21 @@ def test_mul_add_to_mul(self):
 
         self.assertEqual(a, b)
 
+    def test_mul_add_differnt(self):
+        j_g = PointJacobi.from_affine(generator_256)
+
+        w_a = j_g * 2
+
+        self.assertEqual(j_g.mul_add(1, w_a, 1), j_g * 3)
+
+    def test_mul_add_slightly_different(self):
+        j_g = PointJacobi.from_affine(generator_256)
+
+        w_a = j_g * 2
+        w_b = j_g * 3
+
+        self.assertEqual(w_a.mul_add(1, w_b, 3), w_a * 1 + w_b * 3)
+
     def test_mul_add(self):
         j_g = PointJacobi.from_affine(generator_256)
 
@@ -428,11 +448,54 @@ def test_mul_add_large(self):
             j_g * (0xFF00 + 255 * 0xF0F0), j_g.mul_add(0xFF00, b, 0xF0F0)
         )
 
+    def test_mul_add_with_infinity_as_result(self):
+        j_g = PointJacobi.from_affine(generator_256)
+
+        order = generator_256.order()
+
+        b = PointJacobi.from_affine(generator_256 * 256)
+
+        self.assertEqual(j_g.mul_add(order % 256, b, order // 256),
+                         INFINITY)
+
+    def test_mul_add_without_order(self):
+        j_g = PointJacobi(curve_256, generator_256.x(), generator_256.y(), 1)
+
+        order = generator_256.order()
+
+        w_b = generator_256 * 34
+        w_b.scale()
+
+        b = PointJacobi(curve_256, w_b.x(), w_b.y(), 1)
+
+        self.assertEqual(j_g.mul_add(order % 34, b, order // 34),
+                         INFINITY)
+
+    def test_mul_add_with_doubled_negation_of_itself(self):
+        j_g = PointJacobi.from_affine(generator_256 * 17)
+
+        order = generator_256.order()
+
+        dbl_neg = 2 * (-j_g)
+
+        self.assertEqual(j_g.mul_add(4, dbl_neg, 2), INFINITY)
+
     def test_equality(self):
         pj1 = PointJacobi(curve=CurveFp(23, 1, 1, 1), x=2, y=3, z=1, order=1)
         pj2 = PointJacobi(curve=CurveFp(23, 1, 1, 1), x=2, y=3, z=1, order=1)
         self.assertEqual(pj1, pj2)
 
+    def test_equality_with_invalid_object(self):
+        j_g = PointJacobi.from_affine(generator_256)
+
+        self.assertNotEqual(j_g, 12)
+
+    def test_equality_with_wrong_curves(self):
+        p_a = PointJacobi.from_affine(generator_256)
+        p_b = PointJacobi.from_affine(generator_224)
+
+        self.assertNotEqual(p_a, p_b)
+
     def test_pickle(self):
         pj = PointJacobi(curve=CurveFp(23, 1, 1, 1), x=2, y=3, z=1, order=1)
         self.assertEqual(pickle.loads(pickle.dumps(pj)), pj)
diff --git a/tox.ini b/tox.ini
@@ -60,7 +60,7 @@ sitepackages=True
 whitelist_externals=coverage
 commands =
          coverage run --branch -m pytest --hypothesis-show-statistics {posargs:src/ecdsa}
-         coverage xml
+         coverage html
          coverage report -m
 
 [testenv:speed]